diff --git a/modules/host/kernel.nix b/modules/host/kernel.nix index 83d0d9ff7..444859458 100644 --- a/modules/host/kernel.nix +++ b/modules/host/kernel.nix @@ -6,10 +6,22 @@ pkgs, ... }: let - baseKernel = pkgs.linux_latest; - + baseKernel = + if hyp_cfg.enable + then + pkgs.linux_6_1.override { + argsOverride = rec { + src = pkgs.fetchurl { + url = "mirror://kernel/linux/kernel/v6.x/linux-${version}.tar.xz"; + hash = "sha256-qH4kHsFdU0UsTv4hlxOjdp2IzENrW5jPbvsmLEr/FcA="; + }; + version = "6.1.55"; + modDirVersion = "6.1.55"; + }; + } + else pkgs.linux_latest; hardened_kernel = pkgs.linuxManualConfig rec { - inherit (baseKernel) src modDirVersion; + inherit (baseKernel) src modDirVersion kernelPatches; version = "${baseKernel.version}-ghaf-hardened"; /* baseline "make tinyconfig" @@ -55,19 +67,42 @@ - also see https://github.com/NixOS/nixpkgs/issues/109280 for the context > */ + configfile = ./ghaf_host_hardened_baseline; allowImportFromDerivation = true; }; - cfg = config.ghaf.host.kernel_hardening; + pkvm_patch = lib.mkIf config.ghaf.hardware.x86_64.common.enable [ + { + name = "pkvm-patch"; + patch = ../virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch; + structuredExtraConfig = with lib.kernel; { + KVM_INTEL = yes; + KSM = no; + PKVM_INTEL = yes; + PKVM_INTEL_DEBUG = yes; + PKVM_GUEST = yes; + EARLY_PRINTK_USB_XDBC = yes; + RETPOLINE = yes; + }; + } + ]; + + kern_cfg = config.ghaf.host.kernel_hardening; + hyp_cfg = config.ghaf.host.hypervisor_hardening; in with lib; { options.ghaf.host.kernel_hardening = { enable = mkEnableOption "Host kernel hardening"; }; - config = mkIf cfg.enable { + options.ghaf.host.hypervisor_hardening = { + enable = mkEnableOption "Hypervisor hardening"; + }; + + config = mkIf kern_cfg.enable { boot.kernelPackages = pkgs.linuxPackagesFor hardened_kernel; + boot.kernelPatches = mkIf (hyp_cfg.enable && "${baseKernel.version}" == "6.1.55") pkvm_patch; # https://github.com/NixOS/nixpkgs/issues/109280#issuecomment-973636212 nixpkgs.overlays = [ (_final: prev: { diff --git a/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch b/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch new file mode 100644 index 000000000..b99602189 --- /dev/null +++ b/modules/virtualization/pkvm/0001-pkvm-enable-pkvm-on-intel-x86-6.1-lts.patch @@ -0,0 +1,21564 @@ +From 57625c591800467ae5eeabbeba25c42121310c7e Mon Sep 17 00:00:00 2001 +From: Kalle Marjamaki +Date: Thu, 28 Sep 2023 13:37:09 +0300 +Subject: [PATCH] pkvm: enable pkvm on intel x86, 6.1 lts + +Signed-off-by: Kalle Marjamaki +Signed-off-by: Janne Karhunen +--- + arch/arm64/include/asm/kvm_host.h | 8 +- + arch/arm64/include/asm/kvm_pkvm.h | 31 +- + .../asm/pkvm_spinlock.h} | 31 +- + arch/arm64/kvm/Makefile | 3 + + arch/arm64/kvm/arm.c | 8 +- + arch/arm64/kvm/hyp/hyp-constants.c | 4 +- + arch/arm64/kvm/hyp/include/nvhe/gfp.h | 34 - + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 4 +- + arch/arm64/kvm/hyp/include/nvhe/memory.h | 48 - + arch/arm64/kvm/hyp/include/nvhe/mm.h | 10 +- + arch/arm64/kvm/hyp/nvhe/Makefile | 4 +- + arch/arm64/kvm/hyp/nvhe/early_alloc.c | 2 +- + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 54 +- + arch/arm64/kvm/hyp/nvhe/mm.c | 30 +- + arch/arm64/kvm/hyp/nvhe/psci-relay.c | 2 +- + arch/arm64/kvm/hyp/nvhe/setup.c | 28 +- + arch/arm64/kvm/pkvm.c | 87 +- + arch/arm64/mm/init.c | 2 +- + arch/x86/Kconfig | 11 + + arch/x86/coco/Makefile | 3 +- + arch/x86/coco/core.c | 34 + + arch/x86/coco/pkvm/Makefile | 3 + + arch/x86/coco/pkvm/pkvm.c | 113 + + 
arch/x86/coco/pkvm/pkvmcall.S | 42 + + arch/x86/coco/tdx/tdx.c | 115 +- + arch/x86/coco/virt_exception.c | 126 + + arch/x86/include/asm/coco.h | 1 + + arch/x86/include/asm/hypervisor.h | 2 + + arch/x86/include/asm/idtentry.h | 2 +- + arch/x86/include/asm/kvm-x86-ops.h | 2 + + arch/x86/include/asm/kvm_host.h | 18 + + arch/x86/include/asm/kvm_pkvm.h | 250 + + arch/x86/include/asm/pkvm.h | 151 + + arch/x86/include/asm/pkvm_image.h | 48 + + arch/x86/include/asm/pkvm_image_vars.h | 23 + + arch/x86/include/asm/pkvm_spinlock.h | 62 + + arch/x86/include/asm/tdx.h | 19 +- + arch/x86/include/asm/virt_exception.h | 41 + + arch/x86/include/asm/vmx.h | 7 + + arch/x86/include/uapi/asm/kvm.h | 3 + + arch/x86/kernel/cpu/Makefile | 1 + + arch/x86/kernel/cpu/hypervisor.c | 3 + + arch/x86/kernel/cpu/pkvm.c | 33 + + arch/x86/kernel/idt.c | 2 +- + arch/x86/kernel/setup.c | 3 + + arch/x86/kernel/traps.c | 9 +- + arch/x86/kernel/vmlinux.lds.S | 37 + + arch/x86/kvm/Kconfig | 24 + + arch/x86/kvm/Makefile | 1 + + arch/x86/kvm/mmu.h | 16 + + arch/x86/kvm/mmu/mmu.c | 47 +- + arch/x86/kvm/mmu/paging_tmpl.h | 3 +- + arch/x86/kvm/mmu/spte.h | 1 + + arch/x86/kvm/mmu/tdp_mmu.c | 7 +- + arch/x86/kvm/svm/svm.c | 6 + + arch/x86/kvm/vmx/pkvm/.gitignore | 1 + + arch/x86/kvm/vmx/pkvm/Makefile | 29 + + arch/x86/kvm/vmx/pkvm/hyp/Makefile | 79 + + arch/x86/kvm/vmx/pkvm/hyp/bug.h | 23 + + arch/x86/kvm/vmx/pkvm/hyp/cpu.h | 53 + + arch/x86/kvm/vmx/pkvm/hyp/debug.h | 20 + + arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c | 76 + + arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h | 15 + + arch/x86/kvm/vmx/pkvm/hyp/ept.c | 1066 ++++ + arch/x86/kvm/vmx/pkvm/hyp/ept.h | 70 + + arch/x86/kvm/vmx/pkvm/hyp/idt.S | 67 + + arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c | 371 ++ + arch/x86/kvm/vmx/pkvm/hyp/io.h | 82 + + arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c | 374 ++ + arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h | 67 + + arch/x86/kvm/vmx/pkvm/hyp/iommu.c | 2372 ++++++++ + arch/x86/kvm/vmx/pkvm/hyp/iommu.h | 16 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c | 199 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h | 347 ++ + arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c | 106 + + arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h | 19 + + arch/x86/kvm/vmx/pkvm/hyp/irq.c | 60 + + arch/x86/kvm/vmx/pkvm/hyp/lapic.c | 222 + + arch/x86/kvm/vmx/pkvm/hyp/lapic.h | 12 + + arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c | 16 + + arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S | 26 + + arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S | 24 + + arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S | 115 + + arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c | 1013 ++++ + arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h | 205 + + arch/x86/kvm/vmx/pkvm/hyp/memory.c | 363 ++ + arch/x86/kvm/vmx/pkvm/hyp/memory.h | 51 + + arch/x86/kvm/vmx/pkvm/hyp/mmu.c | 258 + + arch/x86/kvm/vmx/pkvm/hyp/mmu.h | 28 + + arch/x86/kvm/vmx/pkvm/hyp/nested.c | 1485 +++++ + arch/x86/kvm/vmx/pkvm/hyp/nested.h | 32 + + arch/x86/kvm/vmx/pkvm/hyp/pci.c | 350 ++ + arch/x86/kvm/vmx/pkvm/hyp/pci.h | 24 + + arch/x86/kvm/vmx/pkvm/hyp/pgtable.c | 801 +++ + arch/x86/kvm/vmx/pkvm/hyp/pgtable.h | 155 + + arch/x86/kvm/vmx/pkvm/hyp/pkvm.c | 470 ++ + arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S | 10 + + arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h | 187 + + .../vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h | 191 + + arch/x86/kvm/vmx/pkvm/hyp/ptdev.c | 213 + + arch/x86/kvm/vmx/pkvm/hyp/ptdev.h | 53 + + arch/x86/kvm/vmx/pkvm/hyp/trace.c | 117 + + arch/x86/kvm/vmx/pkvm/hyp/trace.h | 15 + + arch/x86/kvm/vmx/pkvm/hyp/vmexit.c | 360 ++ + arch/x86/kvm/vmx/pkvm/hyp/vmexit.h | 11 + + arch/x86/kvm/vmx/pkvm/hyp/vmsr.c | 120 + + 
arch/x86/kvm/vmx/pkvm/hyp/vmsr.h | 11 + + arch/x86/kvm/vmx/pkvm/hyp/vmx.c | 79 + + arch/x86/kvm/vmx/pkvm/hyp/vmx.h | 63 + + arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S | 186 + + arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h | 173 + + arch/x86/kvm/vmx/pkvm/include/capabilities.h | 95 + + arch/x86/kvm/vmx/pkvm/include/pkvm.h | 155 + + arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h | 29 + + arch/x86/kvm/vmx/pkvm/pkvm_constants.c | 26 + + arch/x86/kvm/vmx/pkvm/pkvm_constants.h | 21 + + arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c | 204 + + arch/x86/kvm/vmx/pkvm/pkvm_host.c | 1300 +++++ + arch/x86/kvm/vmx/vmcs12.c | 6 + + arch/x86/kvm/vmx/vmcs12.h | 16 +- + arch/x86/kvm/vmx/vmx.c | 259 +- + arch/x86/kvm/vmx/vmx_lib.h | 241 + + arch/x86/kvm/vmx/vmx_ops.h | 19 +- + arch/x86/kvm/x86.c | 60 +- + arch/x86/mm/pat/set_memory.c | 4 + + drivers/iommu/intel/debugfs.c | 14 +- + drivers/iommu/intel/dmar.c | 108 +- + drivers/iommu/intel/iommu.c | 60 +- + drivers/iommu/intel/iommu.h | 18 +- + drivers/iommu/intel/irq_remapping.c | 24 +- + drivers/iommu/intel/pasid.c | 4 +- + drivers/iommu/intel/svm.c | 34 +- + include/asm-generic/vmlinux.lds.h | 16 + + include/linux/intel-iommu.h | 863 +++ + include/linux/kvm_host.h | 34 +- + include/linux/kvm_types.h | 1 + + include/uapi/linux/kvm.h | 1 + + include/uapi/linux/kvm_para.h | 11 + + tools/arch/x86/include/uapi/asm/kvm.h | 3 + + tools/include/uapi/linux/kvm.h | 1 + + virt/kvm/kvm_main.c | 35 +- + virt/kvm/pkvm/buddy_memory.h | 36 + + virt/kvm/pkvm/gfp.h | 35 + + .../hyp/nvhe => virt/kvm/pkvm}/page_alloc.c | 135 +- + virt/kvm/pkvm/pkvm.c | 85 + + virt/kvm/pkvm/pkvm_spinlock.h | 47 + + virt/kvm/vfio.c | 13 + + 150 files changed, 28003 insertions(+), 808 deletions(-) + rename arch/arm64/{kvm/hyp/include/nvhe/spinlock.h => include/asm/pkvm_spinlock.h} (73%) + delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/gfp.h + delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/memory.h + create mode 100644 arch/x86/coco/pkvm/Makefile + create mode 100644 arch/x86/coco/pkvm/pkvm.c + create mode 100644 arch/x86/coco/pkvm/pkvmcall.S + create mode 100644 arch/x86/coco/virt_exception.c + create mode 100644 arch/x86/include/asm/kvm_pkvm.h + create mode 100644 arch/x86/include/asm/pkvm.h + create mode 100644 arch/x86/include/asm/pkvm_image.h + create mode 100644 arch/x86/include/asm/pkvm_image_vars.h + create mode 100644 arch/x86/include/asm/pkvm_spinlock.h + create mode 100644 arch/x86/include/asm/virt_exception.h + create mode 100644 arch/x86/kernel/cpu/pkvm.c + create mode 100644 arch/x86/kvm/vmx/pkvm/.gitignore + create mode 100644 arch/x86/kvm/vmx/pkvm/Makefile + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/Makefile + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/bug.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/cpu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/debug.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ept.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ept.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/idt.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c + create mode 100644 
arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/irq.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lapic.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lapic.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/memory.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/memory.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mmu.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/mmu.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/nested.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/nested.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pci.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pci.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pgtable.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pgtable.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ptdev.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/ptdev.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/trace.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/trace.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmexit.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmexit.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmsr.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmsr.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx.c + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx.h + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S + create mode 100644 arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/capabilities.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/pkvm.h + create mode 100644 arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_constants.c + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_constants.h + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c + create mode 100644 arch/x86/kvm/vmx/pkvm/pkvm_host.c + create mode 100644 arch/x86/kvm/vmx/vmx_lib.h + create mode 100644 include/linux/intel-iommu.h + create mode 100644 virt/kvm/pkvm/buddy_memory.h + create mode 100644 virt/kvm/pkvm/gfp.h + rename {arch/arm64/kvm/hyp/nvhe => virt/kvm/pkvm}/page_alloc.c (56%) + create mode 100644 virt/kvm/pkvm/pkvm.c + create mode 100644 virt/kvm/pkvm/pkvm_spinlock.h + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index 577cf444c113..77f906dcbd50 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -962,11 +962,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); + + int kvm_trng_call(struct kvm_vcpu *vcpu); + #ifdef CONFIG_KVM +-extern phys_addr_t hyp_mem_base; +-extern phys_addr_t hyp_mem_size; +-void __init kvm_hyp_reserve(void); ++extern phys_addr_t pkvm_mem_base; ++extern phys_addr_t pkvm_mem_size; ++void __init pkvm_reserve(void); + #else +-static inline void kvm_hyp_reserve(void) { } 
++static inline void pkvm_reserve(void) { } + #endif + + void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); +diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h +index 9f4ad2a8df59..6ea44fde0672 100644 +--- a/arch/arm64/include/asm/kvm_pkvm.h ++++ b/arch/arm64/include/asm/kvm_pkvm.h +@@ -8,11 +8,32 @@ + + #include + #include ++#include + +-#define HYP_MEMBLOCK_REGIONS 128 ++#define PKVM_MEMBLOCK_REGIONS 128 + +-extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; +-extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); ++#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) ++ ++static inline void *hyp_phys_to_virt(phys_addr_t phys) ++{ ++ return __hyp_va(phys); ++} ++ ++static inline phys_addr_t hyp_virt_to_phys(void *addr) ++{ ++ return __hyp_pa(addr); ++} ++ ++#define __pkvm_pa __hyp_pa ++#define __pkvm_va __hyp_va ++ ++#define pkvm_sym kvm_nvhe_sym ++ ++extern struct memblock_region kvm_nvhe_sym(pkvm_memory)[]; ++extern unsigned int kvm_nvhe_sym(pkvm_memblock_nr); ++ ++int pkvm_pre_reserve_check(void); ++u64 pkvm_total_reserve_pages(void); + + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) + { +@@ -32,8 +53,8 @@ static inline unsigned long __hyp_pgtable_total_pages(void) + unsigned long res = 0, i; + + /* Cover all of memory with page-granularity */ +- for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { +- struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; ++ for (i = 0; i < kvm_nvhe_sym(pkvm_memblock_nr); i++) { ++ struct memblock_region *reg = &kvm_nvhe_sym(pkvm_memory)[i]; + res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); + } + +diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/include/asm/pkvm_spinlock.h +similarity index 73% +rename from arch/arm64/kvm/hyp/include/nvhe/spinlock.h +rename to arch/arm64/include/asm/pkvm_spinlock.h +index 4652fd04bdbe..21f204f7c9c5 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h ++++ b/arch/arm64/include/asm/pkvm_spinlock.h +@@ -10,14 +10,14 @@ + * Copyright (C) 2012 ARM Ltd. + */ + +-#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__ +-#define __ARM64_KVM_NVHE_SPINLOCK_H__ ++#ifndef __ASM_ARM64_PKVM_SPINLOCK_H__ ++#define __ASM_ARM64_PKVM_SPINLOCK_H__ + + #include + #include + #include + +-typedef union hyp_spinlock { ++typedef union arch_pkvm_spinlock { + u32 __val; + struct { + #ifdef __AARCH64EB__ +@@ -26,17 +26,14 @@ typedef union hyp_spinlock { + u16 owner, next; + #endif + }; +-} hyp_spinlock_t; ++} arch_pkvm_spinlock_t; + +-#define hyp_spin_lock_init(l) \ +-do { \ +- *(l) = (hyp_spinlock_t){ .__val = 0 }; \ +-} while (0) ++#define __ARCH_PKVM_SPINLOCK_UNLOCKED { 0 } + +-static inline void hyp_spin_lock(hyp_spinlock_t *lock) ++static inline void arch_pkvm_spin_lock(arch_pkvm_spinlock_t *lock) + { + u32 tmp; +- hyp_spinlock_t lockval, newval; ++ arch_pkvm_spinlock_t lockval, newval; + + asm volatile( + /* Atomically increment the next ticket. 
*/ +@@ -71,7 +68,7 @@ static inline void hyp_spin_lock(hyp_spinlock_t *lock) + : "memory"); + } + +-static inline void hyp_spin_unlock(hyp_spinlock_t *lock) ++static inline void arch_pkvm_spin_unlock(arch_pkvm_spinlock_t *lock) + { + u64 tmp; + +@@ -90,15 +87,15 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) + : "memory"); + } + +-static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) ++static inline bool arch_pkvm_spin_is_locked(arch_pkvm_spinlock_t *lock) + { +- hyp_spinlock_t lockval = READ_ONCE(*lock); ++ arch_pkvm_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; + } + + #ifdef CONFIG_NVHE_EL2_DEBUG +-static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) + { + /* + * The __pkvm_init() path accesses protected data-structures without +@@ -108,10 +105,10 @@ static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) +- BUG_ON(!hyp_spin_is_locked(lock)); ++ BUG_ON(!arch_pkvm_spin_is_locked(lock)); + } + #else +-static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) { } + #endif + +-#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ ++#endif /* __ASM_ARM64_PKVM_SPINLOCK_H__ */ +diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile +index 5e33c2d4645a..9691fd90de6b 100644 +--- a/arch/arm64/kvm/Makefile ++++ b/arch/arm64/kvm/Makefile +@@ -22,6 +22,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ + vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ + vgic/vgic-its.o vgic/vgic-debug.o + ++kvm-y += ../../../virt/kvm/pkvm/pkvm.o ++ + kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o + + always-y := hyp_constants.h hyp-constants.s +@@ -31,6 +33,7 @@ define rule_gen_hyp_constants + endef + + CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include ++CFLAGS_hyp-constants.o += -I $(srctree)/virt/kvm/pkvm + $(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE + $(call if_changed_dep,cc_s_c) + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 6cc380a15eb7..720961355dcc 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -1888,7 +1888,7 @@ static int do_pkvm_init(u32 hyp_va_bits) + + preempt_disable(); + cpu_hyp_init_context(); +- ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, ++ ret = kvm_call_hyp_nvhe(__pkvm_init, pkvm_mem_base, pkvm_mem_size, + num_possible_cpus(), kern_hyp_va(per_cpu_base), + hyp_va_bits); + cpu_hyp_init_features(); +@@ -1941,10 +1941,10 @@ static void kvm_hyp_init_symbols(void) + + static int kvm_hyp_init_protection(u32 hyp_va_bits) + { +- void *addr = phys_to_virt(hyp_mem_base); ++ void *addr = phys_to_virt(pkvm_mem_base); + int ret; + +- ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); ++ ret = create_hyp_mappings(addr, addr + pkvm_mem_size, PAGE_HYP); + if (ret) + return ret; + +@@ -1970,7 +1970,7 @@ static int init_hyp_mode(void) + * The protected Hyp-mode cannot be initialized if the memory pool + * allocation has failed. 
+ */ +- if (is_protected_kvm_enabled() && !hyp_mem_base) ++ if (is_protected_kvm_enabled() && !pkvm_mem_base) + goto out_err; + + /* +diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c +index b3742a6691e8..3e604ce08796 100644 +--- a/arch/arm64/kvm/hyp/hyp-constants.c ++++ b/arch/arm64/kvm/hyp/hyp-constants.c +@@ -1,10 +1,10 @@ + // SPDX-License-Identifier: GPL-2.0-only + + #include +-#include ++#include + + int main(void) + { +- DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); ++ DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct pkvm_page)); + return 0; + } +diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h +deleted file mode 100644 +index 0a048dc06a7d..000000000000 +--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h ++++ /dev/null +@@ -1,34 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-#ifndef __KVM_HYP_GFP_H +-#define __KVM_HYP_GFP_H +- +-#include +- +-#include +-#include +- +-#define HYP_NO_ORDER USHRT_MAX +- +-struct hyp_pool { +- /* +- * Spinlock protecting concurrent changes to the memory pool as well as +- * the struct hyp_page of the pool's pages until we have a proper atomic +- * API at EL2. +- */ +- hyp_spinlock_t lock; +- struct list_head free_area[MAX_ORDER]; +- phys_addr_t range_start; +- phys_addr_t range_end; +- unsigned short max_order; +-}; +- +-/* Allocation */ +-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +-void hyp_split_page(struct hyp_page *page); +-void hyp_get_page(struct hyp_pool *pool, void *addr); +-void hyp_put_page(struct hyp_pool *pool, void *addr); +- +-/* Used pages cannot be freed */ +-int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, +- unsigned int reserved_pages); +-#endif /* __KVM_HYP_GFP_H */ +diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +index 80e99836eac7..6ea3f31e7741 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h ++++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +@@ -10,7 +10,7 @@ + #include + #include + #include +-#include ++#include + + /* + * SW bits 0-1 are reserved to track the memory ownership state of each page: +@@ -47,7 +47,7 @@ struct host_kvm { + struct kvm_arch arch; + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; +- hyp_spinlock_t lock; ++ pkvm_spinlock_t lock; + }; + extern struct host_kvm host_kvm; + +diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h +deleted file mode 100644 +index 592b7edb3edb..000000000000 +--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h ++++ /dev/null +@@ -1,48 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-#ifndef __KVM_HYP_MEMORY_H +-#define __KVM_HYP_MEMORY_H +- +-#include +-#include +- +-#include +- +-struct hyp_page { +- unsigned short refcount; +- unsigned short order; +-}; +- +-extern u64 __hyp_vmemmap; +-#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap) +- +-#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) +- +-static inline void *hyp_phys_to_virt(phys_addr_t phys) +-{ +- return __hyp_va(phys); +-} +- +-static inline phys_addr_t hyp_virt_to_phys(void *addr) +-{ +- return __hyp_pa(addr); +-} +- +-#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +-#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) +-#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) +-#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) +-#define hyp_virt_to_pfn(virt) 
hyp_phys_to_pfn(__hyp_pa(virt)) +- +-#define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap) +-#define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page))) +-#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) +-#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +- +-static inline int hyp_page_count(void *addr) +-{ +- struct hyp_page *p = hyp_virt_to_page(addr); +- +- return p->refcount; +-} +- +-#endif /* __KVM_HYP_MEMORY_H */ +diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h +index 42d8eb9bfe72..9a18d3c1d6f1 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h ++++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h +@@ -7,11 +7,11 @@ + #include + #include + +-#include +-#include ++#include ++#include + + extern struct kvm_pgtable pkvm_pgtable; +-extern hyp_spinlock_t pkvm_pgd_lock; ++extern pkvm_spinlock_t pkvm_pgd_lock; + + int hyp_create_idmap(u32 hyp_va_bits); + int hyp_map_vectors(void); +@@ -28,10 +28,10 @@ static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, + unsigned long *start, unsigned long *end) + { + unsigned long nr_pages = size >> PAGE_SHIFT; +- struct hyp_page *p = hyp_phys_to_page(phys); ++ struct pkvm_page *p = pkvm_phys_to_page(phys); + + *start = (unsigned long)p; +- *end = *start + nr_pages * sizeof(struct hyp_page); ++ *end = *start + nr_pages * sizeof(struct pkvm_page); + *start = ALIGN_DOWN(*start, PAGE_SIZE); + *end = ALIGN(*end, PAGE_SIZE); + } +diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile +index be0a2bc3e20d..8e246861616b 100644 +--- a/arch/arm64/kvm/hyp/nvhe/Makefile ++++ b/arch/arm64/kvm/hyp/nvhe/Makefile +@@ -10,6 +10,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + # will explode instantly (Words of Marc Zyngier). So introduce a generic flag + # __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. 
+ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ ++ccflags-y += -I $(srctree)/virt/kvm/pkvm + ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) +@@ -21,10 +22,11 @@ lib-objs := clear_page.o copy_page.o memcpy.o memset.o + lib-objs := $(addprefix ../../../lib/, $(lib-objs)) + + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ +- hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ ++ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o \ + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ++hyp-obj-y += ../../../../../virt/kvm/pkvm/page_alloc.o + hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o + hyp-obj-y += $(lib-objs) + +diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c +index 00de04153cc6..be1e72cdcbce 100644 +--- a/arch/arm64/kvm/hyp/nvhe/early_alloc.c ++++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c +@@ -7,7 +7,7 @@ + #include + + #include +-#include ++#include + + struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; + s64 __ro_after_init hyp_physvirt_offset; +diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c +index 07f9dc9848ef..89d04330ca95 100644 +--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c ++++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c +@@ -14,8 +14,8 @@ + + #include + +-#include +-#include ++#include ++#include + #include + #include + +@@ -24,35 +24,35 @@ + extern unsigned long hyp_nr_cpus; + struct host_kvm host_kvm; + +-static struct hyp_pool host_s2_pool; ++static struct pkvm_pool host_s2_pool; + + const u8 pkvm_hyp_id = 1; + + static void host_lock_component(void) + { +- hyp_spin_lock(&host_kvm.lock); ++ pkvm_spin_lock(&host_kvm.lock); + } + + static void host_unlock_component(void) + { +- hyp_spin_unlock(&host_kvm.lock); ++ pkvm_spin_unlock(&host_kvm.lock); + } + + static void hyp_lock_component(void) + { +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + } + + static void hyp_unlock_component(void) + { +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + } + + static void *host_s2_zalloc_pages_exact(size_t size) + { +- void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); ++ void *addr = pkvm_alloc_pages(&host_s2_pool, get_order(size)); + +- hyp_split_page(hyp_virt_to_page(addr)); ++ pkvm_split_page(pkvm_virt_to_page(addr)); + + /* + * The size of concatenated PGDs is always a power of two of PAGE_SIZE, +@@ -66,17 +66,17 @@ static void *host_s2_zalloc_pages_exact(size_t size) + + static void *host_s2_zalloc_page(void *pool) + { +- return hyp_alloc_pages(pool, 0); ++ return pkvm_alloc_pages(pool, 0); + } + + static void host_s2_get_page(void *addr) + { +- hyp_get_page(&host_s2_pool, addr); ++ pkvm_get_page(&host_s2_pool, addr); + } + + static void host_s2_put_page(void *addr) + { +- hyp_put_page(&host_s2_pool, addr); ++ pkvm_put_page(&host_s2_pool, addr); + } + + static int prepare_s2_pool(void *pgt_pool_base) +@@ -84,9 +84,9 @@ static int prepare_s2_pool(void *pgt_pool_base) + unsigned long nr_pages, pfn; + int ret; + +- pfn = hyp_virt_to_pfn(pgt_pool_base); ++ pfn = pkvm_virt_to_pfn(pgt_pool_base); + nr_pages = host_s2_pgtable_pages(); +- ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0); ++ ret = pkvm_pool_init(&host_s2_pool, pfn, nr_pages, 0); + if (ret) + return ret; + +@@ -95,7 
+95,7 @@ static int prepare_s2_pool(void *pgt_pool_base) + .zalloc_page = host_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, +- .page_count = hyp_page_count, ++ .page_count = pkvm_page_count, + .get_page = host_s2_get_page, + .put_page = host_s2_put_page, + }; +@@ -123,7 +123,7 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) + int ret; + + prepare_host_vtcr(); +- hyp_spin_lock_init(&host_kvm.lock); ++ pkvm_spinlock_init(&host_kvm.lock); + mmu->arch = &host_kvm.arch; + + ret = prepare_s2_pool(pgt_pool_base); +@@ -181,8 +181,8 @@ static int host_stage2_unmap_dev_all(void) + int i, ret; + + /* Unmap all non-memory regions to recycle the pages */ +- for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { +- reg = &hyp_memory[i]; ++ for (i = 0; i < pkvm_memblock_nr; i++, addr = reg->base + reg->size) { ++ reg = &pkvm_memory[i]; + ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); + if (ret) + return ret; +@@ -197,7 +197,7 @@ struct kvm_mem_range { + + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) + { +- int cur, left = 0, right = hyp_memblock_nr; ++ int cur, left = 0, right = pkvm_memblock_nr; + struct memblock_region *reg; + phys_addr_t end; + +@@ -207,7 +207,7 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) + /* The list of memblock regions is sorted, binary search it */ + while (left < right) { + cur = (left + right) >> 1; +- reg = &hyp_memory[cur]; ++ reg = &pkvm_memory[cur]; + end = reg->base + reg->size; + if (addr < reg->base) { + right = cur; +@@ -263,7 +263,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, + #define host_stage2_try(fn, ...) \ + ({ \ + int __ret; \ +- hyp_assert_lock_held(&host_kvm.lock); \ ++ pkvm_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ +@@ -286,7 +286,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) + u32 level; + int ret; + +- hyp_assert_lock_held(&host_kvm.lock); ++ pkvm_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; +@@ -459,7 +459,7 @@ static int __host_check_page_state_range(u64 addr, u64 size, + .get_page_state = host_get_page_state, + }; + +- hyp_assert_lock_held(&host_kvm.lock); ++ pkvm_assert_lock_held(&host_kvm.lock); + return check_page_state_range(&host_kvm.pgt, addr, size, &d); + } + +@@ -516,7 +516,7 @@ static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + +- return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); ++ return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + } + + static int __hyp_check_page_state_range(u64 addr, u64 size, +@@ -527,7 +527,7 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, + .get_page_state = hyp_get_page_state, + }; + +- hyp_assert_lock_held(&pkvm_pgd_lock); ++ pkvm_assert_lock_held(&pkvm_pgd_lock); + return check_page_state_range(&pkvm_pgtable, addr, size, &d); + } + +@@ -735,7 +735,7 @@ static int do_unshare(struct pkvm_mem_share *share) + int __pkvm_host_share_hyp(u64 pfn) + { + int ret; +- u64 host_addr = hyp_pfn_to_phys(pfn); ++ u64 host_addr = pkvm_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { +@@ -768,7 +768,7 @@ int __pkvm_host_share_hyp(u64 pfn) + int __pkvm_host_unshare_hyp(u64 pfn) + { + int ret; +- u64 host_addr = hyp_pfn_to_phys(pfn); ++ u64 
host_addr = pkvm_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { +diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c +index 96193cb31a39..eb0071451cb4 100644 +--- a/arch/arm64/kvm/hyp/nvhe/mm.c ++++ b/arch/arm64/kvm/hyp/nvhe/mm.c +@@ -12,16 +12,16 @@ + #include + + #include +-#include +-#include ++#include ++#include + #include +-#include ++#include + + struct kvm_pgtable pkvm_pgtable; +-hyp_spinlock_t pkvm_pgd_lock; ++pkvm_spinlock_t pkvm_pgd_lock; + +-struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; +-unsigned int hyp_memblock_nr; ++struct memblock_region pkvm_memory[PKVM_MEMBLOCK_REGIONS]; ++unsigned int pkvm_memblock_nr; + + static u64 __io_map_base; + +@@ -30,9 +30,9 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, + { + int err; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return err; + } +@@ -52,7 +52,7 @@ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) + unsigned long base, addr; + int ret = 0; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + + /* Align the allocation based on the order of its size */ + addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size)); +@@ -61,14 +61,14 @@ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) + base = addr + PAGE_ALIGN(size); + + /* Are we overflowing on the vmemmap ? */ +- if (!addr || base > __hyp_vmemmap) ++ if (!addr || base > __pkvm_vmemmap) + ret = -ENOMEM; + else { + __io_map_base = base; + *haddr = addr; + } + +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return ret; + } +@@ -100,7 +100,7 @@ int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot + unsigned long virt_addr; + phys_addr_t phys; + +- hyp_assert_lock_held(&pkvm_pgd_lock); ++ pkvm_assert_lock_held(&pkvm_pgd_lock); + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); +@@ -122,9 +122,9 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) + { + int ret; + +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + + return ret; + } +@@ -209,7 +209,7 @@ int hyp_create_idmap(u32 hyp_va_bits) + */ + __io_map_base = start & BIT(hyp_va_bits - 2); + __io_map_base ^= BIT(hyp_va_bits - 2); +- __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); ++ __pkvm_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + + return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); + } +diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c +index 08508783ec3d..1c757bd02d4d 100644 +--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c ++++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c +@@ -11,7 +11,7 @@ + #include + #include + +-#include ++#include + #include + + void kvm_hyp_cpu_entry(unsigned long r0); +diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c +index e8d4ea2fcfa0..2c9ce8761e79 100644 +--- a/arch/arm64/kvm/hyp/nvhe/setup.c ++++ b/arch/arm64/kvm/hyp/nvhe/setup.c +@@ -12,8 +12,8 @@ + + #include + #include +-#include +-#include ++#include ++#include + #include + #include + #include +@@ -27,7 +27,7 @@ static void *vmemmap_base; + static void *hyp_pgt_base; + static void 
*host_s2_pgt_base; + static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +-static struct hyp_pool hpool; ++static struct pkvm_pool ppool; + + static int divide_memory_pool(void *virt, unsigned long size) + { +@@ -126,10 +126,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. + */ +- hyp_spin_lock(&pkvm_pgd_lock); ++ pkvm_spin_lock(&pkvm_pgd_lock); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE, + PAGE_SIZE, params->stack_pa, PAGE_HYP); +- hyp_spin_unlock(&pkvm_pgd_lock); ++ pkvm_spin_unlock(&pkvm_pgd_lock); + if (ret) + return ret; + +@@ -173,17 +173,17 @@ static void update_nvhe_init_params(void) + + static void *hyp_zalloc_hyp_page(void *arg) + { +- return hyp_alloc_pages(&hpool, 0); ++ return pkvm_alloc_pages(&ppool, 0); + } + + static void hpool_get_page(void *addr) + { +- hyp_get_page(&hpool, addr); ++ pkvm_get_page(&ppool, addr); + } + + static void hpool_put_page(void *addr) + { +- hyp_put_page(&hpool, addr); ++ pkvm_put_page(&ppool, addr); + } + + static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, +@@ -246,8 +246,8 @@ static int finalize_host_mappings(void) + }; + int i, ret; + +- for (i = 0; i < hyp_memblock_nr; i++) { +- struct memblock_region *reg = &hyp_memory[i]; ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ struct memblock_region *reg = &pkvm_memory[i]; + u64 start = (u64)hyp_phys_to_virt(reg->base); + + ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker); +@@ -266,10 +266,10 @@ void __noreturn __pkvm_init_finalise(void) + int ret; + + /* Now that the vmemmap is backed, install the full-fledged allocator */ +- pfn = hyp_virt_to_pfn(hyp_pgt_base); ++ pfn = pkvm_virt_to_pfn(hyp_pgt_base); + nr_pages = hyp_s1_pgtable_pages(); + reserved_pages = hyp_early_alloc_nr_used_pages(); +- ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages); ++ ret = pkvm_pool_init(&ppool, pfn, nr_pages, reserved_pages); + if (ret) + goto out; + +@@ -283,7 +283,7 @@ void __noreturn __pkvm_init_finalise(void) + .virt_to_phys = hyp_virt_to_phys, + .get_page = hpool_get_page, + .put_page = hpool_put_page, +- .page_count = hyp_page_count, ++ .page_count = pkvm_page_count, + }; + pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + +@@ -314,7 +314,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + +- hyp_spin_lock_init(&pkvm_pgd_lock); ++ pkvm_spinlock_init(&pkvm_pgd_lock); + hyp_nr_cpus = nr_cpus; + + ret = divide_memory_pool(virt, size); +diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c +index ebecb7c045f4..a83a350c1956 100644 +--- a/arch/arm64/kvm/pkvm.c ++++ b/arch/arm64/kvm/pkvm.c +@@ -5,72 +5,27 @@ + */ + + #include +-#include +-#include + + #include +- + #include "hyp_constants.h" + +-static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); +-static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); +- +-phys_addr_t hyp_mem_base; +-phys_addr_t hyp_mem_size; +- +-static int cmp_hyp_memblock(const void *p1, const void *p2) ++int pkvm_pre_reserve_check(void) + { +- const struct memblock_region *r1 = p1; +- const struct memblock_region *r2 = p2; +- +- return r1->base < r2->base ? 
-1 : (r1->base > r2->base); +-} +- +-static void __init sort_memblock_regions(void) +-{ +- sort(hyp_memory, +- *hyp_memblock_nr_ptr, +- sizeof(struct memblock_region), +- cmp_hyp_memblock, +- NULL); +-} +- +-static int __init register_memblock_regions(void) +-{ +- struct memblock_region *reg; +- +- for_each_mem_region(reg) { +- if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) +- return -ENOMEM; ++ if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) ++ return -EINVAL; + +- hyp_memory[*hyp_memblock_nr_ptr] = *reg; +- (*hyp_memblock_nr_ptr)++; +- } +- sort_memblock_regions(); ++ if (kvm_get_mode() != KVM_MODE_PROTECTED) ++ return -EINVAL; + + return 0; + } + +-void __init kvm_hyp_reserve(void) ++u64 pkvm_total_reserve_pages(void) + { +- u64 nr_pages, prev, hyp_mem_pages = 0; +- int ret; +- +- if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) +- return; +- +- if (kvm_get_mode() != KVM_MODE_PROTECTED) +- return; ++ u64 nr_pages, prev, total_pages = 0; + +- ret = register_memblock_regions(); +- if (ret) { +- *hyp_memblock_nr_ptr = 0; +- kvm_err("Failed to register hyp memblocks: %d\n", ret); +- return; +- } +- +- hyp_mem_pages += hyp_s1_pgtable_pages(); +- hyp_mem_pages += host_s2_pgtable_pages(); ++ total_pages += hyp_s1_pgtable_pages(); ++ total_pages += host_s2_pgtable_pages(); + + /* + * The hyp_vmemmap needs to be backed by pages, but these pages +@@ -80,30 +35,12 @@ void __init kvm_hyp_reserve(void) + nr_pages = 0; + do { + prev = nr_pages; +- nr_pages = hyp_mem_pages + prev; ++ nr_pages = total_pages + prev; + nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE, + PAGE_SIZE); + nr_pages += __hyp_pgtable_max_pages(nr_pages); + } while (nr_pages != prev); +- hyp_mem_pages += nr_pages; +- +- /* +- * Try to allocate a PMD-aligned region to reduce TLB pressure once +- * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. +- */ +- hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; +- hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), +- PMD_SIZE); +- if (!hyp_mem_base) +- hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); +- else +- hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); +- +- if (!hyp_mem_base) { +- kvm_err("Failed to reserve hyp memory\n"); +- return; +- } ++ total_pages += nr_pages; + +- kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, +- hyp_mem_base); ++ return total_pages; + } +diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c +index 4b4651ee47f2..f939487b24b4 100644 +--- a/arch/arm64/mm/init.c ++++ b/arch/arm64/mm/init.c +@@ -420,7 +420,7 @@ void __init bootmem_init(void) + + dma_pernuma_cma_reserve(); + +- kvm_hyp_reserve(); ++ pkvm_reserve(); + + /* + * sparse_init() tries to allocate memory from memblock, so must be +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 4c9bfc4be58d..f430abaad5aa 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -896,6 +896,17 @@ config INTEL_TDX_GUEST + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + ++config PKVM_GUEST ++ bool "PKVM Guest Support" ++ depends on X86_64 && !PKVM_INTEL ++ select ARCH_HAS_CC_PLATFORM ++ select X86_MEM_ENCRYPT ++ default n ++ help ++ Support running as a protected guest under Protected KVM. ++ Without this support, the guest kernel can not boot or run ++ under Protected KVM. 
++ + endif # HYPERVISOR_GUEST + + source "arch/x86/Kconfig.cpu" +diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile +index c816acf78b6a..878cb2f5cccd 100644 +--- a/arch/x86/coco/Makefile ++++ b/arch/x86/coco/Makefile +@@ -5,4 +5,5 @@ CFLAGS_core.o += -fno-stack-protector + + obj-y += core.o + +-obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ ++obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ virt_exception.o ++obj-$(CONFIG_PKVM_GUEST) += pkvm/ virt_exception.o +diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c +index 49b44f881484..701019e9191e 100644 +--- a/arch/x86/coco/core.c ++++ b/arch/x86/coco/core.c +@@ -16,6 +16,38 @@ + static enum cc_vendor vendor __ro_after_init; + static u64 cc_mask __ro_after_init; + ++static bool pkvm_cc_platform_has(enum cc_attr attr) ++{ ++ /* ++ * Since primary VM can't access pkvm guest's memory, pkvm guest need ++ * explicitly share DMA buffer with primary VM to make virtio work. By ++ * using these attribute, pkvm guest will using bounce buffer for DMA ++ * operation, and share the bounce buffer with primary VM. ++ * ++ * CC_ATTR_GUEST_UNROLL_STRING_IO: Since string io cause KVM to do ++ * instruction decode, to avoid it, using this attribute will unroll the ++ * string io. For example, in , the definition of the outsb ++ * check to attribute to determine if using string io. ++ * ++ * CC_ATTR_GUEST_MEM_ENCRYPT: This attribute has been checked in the ++ * force_dma_unencrypted(). Which means all DMA buffer will be shared ++ * between pkvm guest and primary VM. And checked in ++ * pci_swiotlb_detect(), this makes pkvm guest using bounce buffer. ++ * ++ * CC_ATTR_MEM_ENCRYPT: This attribute has been checked in the ++ * mem_encrypt_init(). Which will make all bounce buffer being shared ++ * between pkvm guest and primary VM. ++ */ ++ switch (attr) { ++ case CC_ATTR_GUEST_UNROLL_STRING_IO: ++ case CC_ATTR_GUEST_MEM_ENCRYPT: ++ case CC_ATTR_MEM_ENCRYPT: ++ return true; ++ default: ++ return false; ++ } ++} ++ + static bool intel_cc_platform_has(enum cc_attr attr) + { + switch (attr) { +@@ -90,6 +122,8 @@ bool cc_platform_has(enum cc_attr attr) + return intel_cc_platform_has(attr); + case CC_VENDOR_HYPERV: + return hyperv_cc_platform_has(attr); ++ case CC_VENDOR_PKVM: ++ return pkvm_cc_platform_has(attr); + default: + return false; + } +diff --git a/arch/x86/coco/pkvm/Makefile b/arch/x86/coco/pkvm/Makefile +new file mode 100644 +index 000000000000..7896f6d4f4b2 +--- /dev/null ++++ b/arch/x86/coco/pkvm/Makefile +@@ -0,0 +1,3 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++obj-y += pkvm.o pkvmcall.o +diff --git a/arch/x86/coco/pkvm/pkvm.c b/arch/x86/coco/pkvm/pkvm.c +new file mode 100644 +index 000000000000..3590bed967db +--- /dev/null ++++ b/arch/x86/coco/pkvm/pkvm.c +@@ -0,0 +1,113 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "pkvm: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static bool pkvm_guest_detected; ++ ++bool pkvm_is_protected_guest(void) ++{ ++ return pkvm_guest_detected; ++} ++ ++int pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc) ++{ ++ unsigned long size = numpages * PAGE_SIZE; ++ ++ if (!enc) { ++ /* ++ * When pkvm guest want to share a range of memory, these pages ++ * may have not been setup in the guest ept pagetables. So when ++ * the pkvm do the __pkvm_guest_share_host() thing, if no page ++ * found in guest ept, this function will failed, thus the share ++ * page function will failed. 
++ * So before share these pages to host, first touch them, so ++ * they will have entry in the guest ept, to make sure the ++ * sharing will success. ++ * ++ * TODO: Another good way to mitigate this touch is to fake ept ++ * violation when the sharing function find that there is no ++ * page in the guest ept. ++ */ ++ memset((void *)addr, 0, size); ++ kvm_hypercall2(PKVM_GHC_SHARE_MEM, __pa(addr), size); ++ } else ++ kvm_hypercall2(PKVM_GHC_UNSHARE_MEM, __pa(addr), size); ++ ++ return 0; ++} ++ ++void pkvm_get_ve_info(struct ve_info *ve) ++{ ++ /* Reuse the tdx output for pkvm. */ ++ struct tdx_module_output out; ++ ++ __pkvm_module_call(PKVM_GHC_GET_VE_INFO, &out); ++ ++ /* Transfer the output parameters */ ++ ve->exit_reason = out.rcx; ++ ve->exit_qual = out.rdx; ++ ve->gla = out.r8; ++ ve->gpa = out.r9; ++} ++ ++static bool mmio_write(int size, unsigned long addr, unsigned long val) ++{ ++ kvm_hypercall3(PKVM_GHC_IOWRITE, addr, size, val); ++ ++ return true; ++} ++ ++static bool mmio_read(int size, unsigned long addr, unsigned long *val) ++{ ++ *val = kvm_hypercall2(PKVM_GHC_IOREAD, addr, size); ++ ++ return true; ++} ++ ++static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) ++{ ++ switch (ve->exit_reason) { ++ case EXIT_REASON_EPT_VIOLATION: ++ return ve_handle_mmio(regs, ve); ++ default: ++ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); ++ return -EIO; ++ } ++} ++ ++static bool pkvm_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++{ ++ int insn_len; ++ ++ insn_len = virt_exception_kernel(regs, ve); ++ if (insn_len < 0) ++ return false; ++ ++ /* After successful #VE handling, move the IP */ ++ regs->ip += insn_len; ++ ++ return true; ++} ++ ++__init void pkvm_guest_init_coco(void) ++{ ++ cc_set_vendor(CC_VENDOR_PKVM); ++ ++ pkvm_guest_detected = true; ++ ++ ve_x86_ops.mmio_read = mmio_read; ++ ve_x86_ops.mmio_write = mmio_write; ++ ve_x86_ops.handle_virt_exception = pkvm_handle_virt_exception; ++ ve_x86_ops.get_ve_info = pkvm_get_ve_info; ++} +diff --git a/arch/x86/coco/pkvm/pkvmcall.S b/arch/x86/coco/pkvm/pkvmcall.S +new file mode 100644 +index 000000000000..b7cbe432f065 +--- /dev/null ++++ b/arch/x86/coco/pkvm/pkvmcall.S +@@ -0,0 +1,42 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++.macro PKVM_MODULE_CALL ++ /* Save the output parameter. */ ++ push %r12 ++ ++ /* Push output pointer to stack. */ ++ push %rsi ++ ++ mov %rdi, %rax ++ ++ vmcall ++ ++ pop %r12 ++ ++ test %r12, %r12 ++ jz .Lno_output_struct ++ ++ /* Copy result registers to output struct. 
*/ ++ movq %rcx, 0(%r12) ++ movq %rdx, 8(%r12) ++ movq %r8, 16(%r12) ++ movq %r9, 24(%r12) ++ ++.Lno_output_struct: ++ pop %r12 ++.endm ++ ++SYM_FUNC_START(__pkvm_module_call) ++ FRAME_BEGIN ++ PKVM_MODULE_CALL ++ FRAME_END ++ RET ++SYM_FUNC_END(__pkvm_module_call) +diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c +index 8a1d48b8c2a3..e8393b62ed0c 100644 +--- a/arch/x86/coco/tdx/tdx.c ++++ b/arch/x86/coco/tdx/tdx.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + /* TDX module Call Leaf IDs */ + #define TDX_GET_INFO 1 +@@ -342,111 +343,6 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val) + EPT_WRITE, addr, val); + } + +-static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) +-{ +- unsigned long *reg, val, vaddr; +- char buffer[MAX_INSN_SIZE]; +- struct insn insn = {}; +- enum mmio_type mmio; +- int size, extend_size; +- u8 extend_val = 0; +- +- /* Only in-kernel MMIO is supported */ +- if (WARN_ON_ONCE(user_mode(regs))) +- return -EFAULT; +- +- if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) +- return -EFAULT; +- +- if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) +- return -EINVAL; +- +- mmio = insn_decode_mmio(&insn, &size); +- if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) +- return -EINVAL; +- +- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { +- reg = insn_get_modrm_reg_ptr(&insn, regs); +- if (!reg) +- return -EINVAL; +- } +- +- /* +- * Reject EPT violation #VEs that split pages. +- * +- * MMIO accesses are supposed to be naturally aligned and therefore +- * never cross page boundaries. Seeing split page accesses indicates +- * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. +- * +- * load_unaligned_zeropad() will recover using exception fixups. +- */ +- vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); +- if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) +- return -EFAULT; +- +- /* Handle writes first */ +- switch (mmio) { +- case MMIO_WRITE: +- memcpy(&val, reg, size); +- if (!mmio_write(size, ve->gpa, val)) +- return -EIO; +- return insn.length; +- case MMIO_WRITE_IMM: +- val = insn.immediate.value; +- if (!mmio_write(size, ve->gpa, val)) +- return -EIO; +- return insn.length; +- case MMIO_READ: +- case MMIO_READ_ZERO_EXTEND: +- case MMIO_READ_SIGN_EXTEND: +- /* Reads are handled below */ +- break; +- case MMIO_MOVS: +- case MMIO_DECODE_FAILED: +- /* +- * MMIO was accessed with an instruction that could not be +- * decoded or handled properly. It was likely not using io.h +- * helpers or accessed MMIO accidentally. +- */ +- return -EINVAL; +- default: +- WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); +- return -EINVAL; +- } +- +- /* Handle reads */ +- if (!mmio_read(size, ve->gpa, &val)) +- return -EIO; +- +- switch (mmio) { +- case MMIO_READ: +- /* Zero-extend for 32-bit operation */ +- extend_size = size == 4 ? 
sizeof(*reg) : 0; +- break; +- case MMIO_READ_ZERO_EXTEND: +- /* Zero extend based on operand size */ +- extend_size = insn.opnd_bytes; +- break; +- case MMIO_READ_SIGN_EXTEND: +- /* Sign extend based on operand size */ +- extend_size = insn.opnd_bytes; +- if (size == 1 && val & BIT(7)) +- extend_val = 0xFF; +- else if (size > 1 && val & BIT(15)) +- extend_val = 0xFF; +- break; +- default: +- /* All other cases has to be covered with the first switch() */ +- WARN_ON_ONCE(1); +- return -EINVAL; +- } +- +- if (extend_size) +- memset(reg, extend_val, extend_size); +- memcpy(reg, &val, size); +- return insn.length; +-} +- + static bool handle_in(struct pt_regs *regs, int size, int port) + { + struct tdx_hypercall_args args = { +@@ -606,7 +502,7 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) + case EXIT_REASON_CPUID: + return handle_cpuid(regs, ve); + case EXIT_REASON_EPT_VIOLATION: +- return handle_mmio(regs, ve); ++ return ve_handle_mmio(regs, ve); + case EXIT_REASON_IO_INSTRUCTION: + return handle_io(regs, ve); + default: +@@ -615,7 +511,7 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) + } + } + +-bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++static bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) + { + int insn_len; + +@@ -829,5 +725,10 @@ void __init tdx_early_init(void) + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; + ++ ve_x86_ops.mmio_read = mmio_read; ++ ve_x86_ops.mmio_write = mmio_write; ++ ve_x86_ops.handle_virt_exception = tdx_handle_virt_exception; ++ ve_x86_ops.get_ve_info = tdx_get_ve_info; ++ + pr_info("Guest detected\n"); + } +diff --git a/arch/x86/coco/virt_exception.c b/arch/x86/coco/virt_exception.c +new file mode 100644 +index 000000000000..15db92c24787 +--- /dev/null ++++ b/arch/x86/coco/virt_exception.c +@@ -0,0 +1,126 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++ ++struct ve_x86_ops ve_x86_ops; ++ ++int ve_handle_mmio(struct pt_regs *regs, struct ve_info *ve) ++{ ++ unsigned long *reg, val, vaddr; ++ char buffer[MAX_INSN_SIZE]; ++ struct insn insn = {}; ++ enum mmio_type mmio; ++ int size, extend_size; ++ u8 extend_val = 0; ++ ++ /* Only in-kernel MMIO is supported */ ++ if (WARN_ON_ONCE(user_mode(regs))) ++ return -EFAULT; ++ ++ if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) ++ return -EFAULT; ++ ++ if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) ++ return -EINVAL; ++ ++ mmio = insn_decode_mmio(&insn, &size); ++ if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) ++ return -EINVAL; ++ ++ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { ++ reg = insn_get_modrm_reg_ptr(&insn, regs); ++ if (!reg) ++ return -EINVAL; ++ } ++ ++ /* ++ * Reject EPT violation #VEs that split pages. ++ * ++ * MMIO accesses are supposed to be naturally aligned and therefore ++ * never cross page boundaries. Seeing split page accesses indicates ++ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. ++ * ++ * load_unaligned_zeropad() will recover using exception fixups. 
++ */ ++ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); ++ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) ++ return -EFAULT; ++ ++ /* Handle writes first */ ++ switch (mmio) { ++ case MMIO_WRITE: ++ memcpy(&val, reg, size); ++ if (!ve_x86_ops.mmio_write(size, ve->gpa, val)) ++ return -EIO; ++ return insn.length; ++ case MMIO_WRITE_IMM: ++ val = insn.immediate.value; ++ if (!ve_x86_ops.mmio_write(size, ve->gpa, val)) ++ return -EIO; ++ return insn.length; ++ case MMIO_READ: ++ case MMIO_READ_ZERO_EXTEND: ++ case MMIO_READ_SIGN_EXTEND: ++ /* Reads are handled below */ ++ break; ++ case MMIO_MOVS: ++ case MMIO_DECODE_FAILED: ++ /* ++ * MMIO was accessed with an instruction that could not be ++ * decoded or handled properly. It was likely not using io.h ++ * helpers or accessed MMIO accidentally. ++ */ ++ return -EINVAL; ++ default: ++ WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); ++ return -EINVAL; ++ } ++ ++ /* Handle reads */ ++ if (!ve_x86_ops.mmio_read(size, ve->gpa, &val)) ++ return -EIO; ++ ++ switch (mmio) { ++ case MMIO_READ: ++ /* Zero-extend for 32-bit operation */ ++ extend_size = size == 4 ? sizeof(*reg) : 0; ++ break; ++ case MMIO_READ_ZERO_EXTEND: ++ /* Zero extend based on operand size */ ++ extend_size = insn.opnd_bytes; ++ break; ++ case MMIO_READ_SIGN_EXTEND: ++ /* Sign extend based on operand size */ ++ extend_size = insn.opnd_bytes; ++ if (size == 1 && val & BIT(7)) ++ extend_val = 0xFF; ++ else if (size > 1 && val & BIT(15)) ++ extend_val = 0xFF; ++ break; ++ default: ++ /* All other cases has to be covered with the first switch() */ ++ WARN_ON_ONCE(1); ++ return -EINVAL; ++ } ++ ++ if (extend_size) ++ memset(reg, extend_val, extend_size); ++ memcpy(reg, &val, size); ++ return insn.length; ++} ++ ++bool handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) ++{ ++ if (ve_x86_ops.handle_virt_exception) ++ return ve_x86_ops.handle_virt_exception(regs, ve); ++ ++ return false; ++} ++ ++void get_ve_info(struct ve_info *ve) ++{ ++ if (ve_x86_ops.get_ve_info) ++ ve_x86_ops.get_ve_info(ve); ++} +diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h +index 3d98c3a60d34..065a70448d8a 100644 +--- a/arch/x86/include/asm/coco.h ++++ b/arch/x86/include/asm/coco.h +@@ -9,6 +9,7 @@ enum cc_vendor { + CC_VENDOR_AMD, + CC_VENDOR_HYPERV, + CC_VENDOR_INTEL, ++ CC_VENDOR_PKVM, + }; + + void cc_set_vendor(enum cc_vendor v); +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index e41cbf2ec41d..731db23c6197 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -30,6 +30,7 @@ enum x86_hypervisor_type { + X86_HYPER_KVM, + X86_HYPER_JAILHOUSE, + X86_HYPER_ACRN, ++ X86_HYPER_PKVM, + }; + + #ifdef CONFIG_HYPERVISOR_GUEST +@@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; + extern const struct hypervisor_x86 x86_hyper_kvm; + extern const struct hypervisor_x86 x86_hyper_jailhouse; + extern const struct hypervisor_x86 x86_hyper_acrn; ++extern const struct hypervisor_x86 x86_hyper_pkvm; + extern struct hypervisor_x86 x86_hyper_xen_hvm; + + extern bool nopv; +diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h +index 72184b0b2219..51d79424c09a 100644 +--- a/arch/x86/include/asm/idtentry.h ++++ b/arch/x86/include/asm/idtentry.h +@@ -632,7 +632,7 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); + DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); + #endif + +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if 
defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); + #endif + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 2c6698aa218b..6edd77847405 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -19,8 +19,10 @@ KVM_X86_OP(hardware_disable) + KVM_X86_OP(hardware_unsetup) + KVM_X86_OP(has_emulated_msr) + KVM_X86_OP(vcpu_after_set_cpuid) ++KVM_X86_OP(is_vm_type_supported) + KVM_X86_OP(vm_init) + KVM_X86_OP_OPTIONAL(vm_destroy) ++KVM_X86_OP_OPTIONAL(vm_free) + KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) + KVM_X86_OP(vcpu_create) + KVM_X86_OP(vcpu_free) +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index c1dcaa3d2d6e..13cce3625c9a 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1148,6 +1148,7 @@ enum kvm_apicv_inhibit { + }; + + struct kvm_arch { ++ unsigned long vm_type; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; +@@ -1462,10 +1463,12 @@ struct kvm_x86_ops { + void (*hardware_unsetup)(void); + bool (*has_emulated_msr)(struct kvm *kvm, u32 index); + void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); ++ bool (*is_vm_type_supported)(unsigned long vm_type); + + unsigned int vm_size; + int (*vm_init)(struct kvm *kvm); + void (*vm_destroy)(struct kvm *kvm); ++ void (*vm_free)(struct kvm *kvm); + + /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); +@@ -1666,6 +1669,7 @@ struct kvm_x86_nested_ops { + }; + + struct kvm_x86_init_ops { ++ int (*pkvm_init)(void); + int (*cpu_has_kvm_support)(void); + int (*disabled_by_bios)(void); + int (*check_processor_compatibility)(void); +@@ -1717,9 +1721,23 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) + return -ENOTSUPP; + } + ++#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB_WITH_RANGE ++static inline int kvm_arch_flush_remote_tlb_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range) ++{ ++ if (range && kvm_x86_ops.tlb_remote_flush_with_range && ++ !static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range)) ++ return 0; ++ ++ return -ENOTSUPP; ++} ++ + #define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + ++#ifdef CONFIG_PKVM_INTEL ++int __init pkvm_init(void); ++#endif + void __init kvm_mmu_x86_module_init(void); + int kvm_mmu_vendor_module_init(void); + void kvm_mmu_vendor_module_exit(void); +diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h +new file mode 100644 +index 000000000000..73ec34f2c4df +--- /dev/null ++++ b/arch/x86/include/asm/kvm_pkvm.h +@@ -0,0 +1,250 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _ASM_X86_KVM_PKVM_H ++#define _ASM_X86_KVM_PKVM_H ++ ++#include ++ ++#ifdef CONFIG_PKVM_INTEL ++ ++#include ++#include ++#include ++ ++#define PKVM_MEMBLOCK_REGIONS 128 ++#define PKVM_PGTABLE_MAX_LEVELS 5U ++ ++extern struct memblock_region pkvm_sym(pkvm_memory)[]; ++extern unsigned int pkvm_sym(pkvm_memblock_nr); ++ ++void *pkvm_phys_to_virt(unsigned long phys); ++unsigned long pkvm_virt_to_phys(void *virt); ++ ++#define __pkvm_pa(virt) pkvm_virt_to_phys((void *)(virt)) ++#define __pkvm_va(phys) pkvm_phys_to_virt((unsigned long)(phys)) ++ ++extern phys_addr_t pkvm_mem_base; ++extern phys_addr_t pkvm_mem_size; ++ ++void __init pkvm_reserve(void); ++ ++static inline unsigned long 
__pkvm_pgtable_max_pages(unsigned long nr_pages) ++{ ++ unsigned long total = 0, i; ++ ++ /* Provision the worst case */ ++ for (i = 0; i < PKVM_PGTABLE_MAX_LEVELS; i++) { ++ nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); ++ total += nr_pages; ++ } ++ ++ return total; ++} ++ ++static inline unsigned long __pkvm_pgtable_total_pages(void) ++{ ++ unsigned long total = 0, i; ++ ++ for (i = 0; i < pkvm_sym(pkvm_memblock_nr); i++) { ++ struct memblock_region *reg = &pkvm_sym(pkvm_memory)[i]; ++ total += __pkvm_pgtable_max_pages(reg->size >> PAGE_SHIFT); ++ } ++ ++ return total; ++} ++ ++static inline unsigned long host_ept_pgtable_pages(void) ++{ ++ unsigned long res; ++ ++ /* ++ * Include an extra 16 pages to safely upper-bound the worst case of ++ * concatenated pgds. ++ */ ++ res = __pkvm_pgtable_total_pages() + 16; ++ ++ /* Allow 1 GiB for MMIO mappings */ ++ res += __pkvm_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); ++ ++ return res; ++} ++ ++static inline unsigned long pkvm_mmu_pgtable_pages(void) ++{ ++ unsigned long res; ++ ++ res = __pkvm_pgtable_total_pages(); ++ ++ return res; ++} ++ ++static inline unsigned long pkvm_vmemmap_memblock_size(struct memblock_region *reg, ++ size_t vmemmap_entry_size) ++{ ++ unsigned long nr_pages = reg->size >> PAGE_SHIFT; ++ unsigned long start, end; ++ ++ /* Translate the pfn to the vmemmap entry */ ++ start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; ++ end = start + nr_pages * vmemmap_entry_size; ++ start = ALIGN_DOWN(start, PAGE_SIZE); ++ end = ALIGN(end, PAGE_SIZE); ++ ++ return end - start; ++} ++ ++static inline unsigned long pkvm_vmemmap_pages(size_t vmemmap_entry_size) ++{ ++ unsigned long total_size = 0, i; ++ ++ for (i = 0; i < pkvm_sym(pkvm_memblock_nr); i++) { ++ total_size += pkvm_vmemmap_memblock_size(&pkvm_sym(pkvm_memory)[i], ++ vmemmap_entry_size); ++ } ++ ++ return total_size >> PAGE_SHIFT; ++} ++ ++static inline unsigned long pkvm_data_struct_pages(unsigned long global_pgs, ++ unsigned long percpu_pgs, int num_cpus) ++{ ++ return (percpu_pgs * num_cpus + global_pgs); ++} ++ ++static inline int pkvm_pre_reserve_check(void) ++{ ++ /* no necessary check yet*/ ++ return 0; ++} ++ ++/* Calculate the total pages for Scalable IOMMU */ ++static inline unsigned long pkvm_iommu_pages(int max_pasid, int nr_pasid_pdev, ++ int nr_pdev, int nr_iommu, int qidesc_sz, ++ int qidesc_status_sz, int num_cpus) ++{ ++ unsigned long res = 0; ++ ++ /* PASID page table pages for each PASID capable pdev */ ++ res += ((max_pasid >> 6) + (max_pasid >> 15)) * nr_pasid_pdev; ++ /* PASID page table pages (PASID dir + PASID table) for each normal pdev */ ++ res += 2 * nr_pdev; ++ /* ++ * Context table page count is the minimal value of ++ * total pdev number and 256 bus * 2 (in scalable mode). ++ * Each pdev may require a context page if its bdf is ++ * discrete enough. ++ */ ++ res += min(256 * 2, nr_pasid_pdev + nr_pdev); ++ /* Root pages for each IOMMU */ ++ res += nr_iommu; ++ /* Desc and desc_status pages for each IOMMU */ ++ res += nr_iommu * ((1 << get_order(qidesc_sz)) + (1 << get_order(qidesc_status_sz))); ++ /* ++ * Reserve more IQ descriptor page. The size is calculated according to ++ * the IOMMU QI descriptor size(excludes the QI descriptor status as ++ * this is not needed to bunch requests) and the CPU number. Each CPU can ++ * have its own reserved QI descriptor page so that multiple CPUs can ++ * bunch the QI requests at the same time. 
++ */ ++ res += num_cpus * (1 << get_order(qidesc_sz)); ++ ++ return res; ++} ++ ++/* ++ * Calculate the total pages for shadow EPT. The assumptions are that: ++ * 1. There is no shared memory between normal VMs or between secure VMs. ++ * 2. The normal VM or secure VM memory size is no larger than the platform ++ * memory size. ++ * 3. The virtual MMIO range for each VM is no larger than 1G. ++ * With these assumptions, we can reserve enough memory for normal VMs and ++ * secure VMs. ++ * 4. Each VM only has one shadow EPT. This will make vSMM mode and non-vSMM ++ * mode share the same shadow EPT for a VM, which brings security weakness for ++ * the vSMM mode. ++ */ ++static inline unsigned long pkvm_shadow_ept_pgtable_pages(int nr_vm) ++{ ++ unsigned long pgtable_pages = __pkvm_pgtable_total_pages(); ++ unsigned long res; ++ ++ /* ++ * Reserve enough pages to map all the platform memory in shadow ++ * EPT. With assumption#1 and assumption#4, these pages are enough ++ * for all VMs. ++ */ ++ res = pgtable_pages; ++ ++ /* ++ * There are multiple VMs. Although the total pages can be calculated ++ * through __pkvm_pgtable_total_pages() to map all the memory, this is ++ * enough to satisfy the level1 page table pages for all VMs but not ++ * enough to satisfy the level2:level5 page table pages. Each VM will ++ * require its own level2:level5 pages. Because __pkvm_pgtable_total_pages ++ * has already allocated 1 level2:level5, we just minus 1 from the total ++ * number of VMs, and multiply it by 2 considering SMM mode. ++ */ ++ res += __pkvm_pgtable_max_pages(pgtable_pages) * (nr_vm - 1) * 2; ++ ++ /* Allow 1 GiB for MMIO mappings for each VM */ ++ res += __pkvm_pgtable_max_pages(SZ_1G >> PAGE_SHIFT) * nr_vm; ++ ++ /* ++ * Each shadow VM has two page tables. One is used to manage page state ++ * and reused as IOMMU second-level pagetable for passthrough device in ++ * protected VM. Another one is used as shadow EPT. ++ */ ++ return (res * 2); ++} ++ ++/* ++ * Calculate the total pages for shadow IOMMU page tables for the host's ++ * devices used with Legacy IOMMU. Similarly to the calculation for shadow EPT, ++ * we assume that there is no shared memory between devices using different ++ * page tables. ++ * ++ * TODO: do not reserve these pages if legacy mode is not used by pKVM, i.e. ++ * if all the IOMMUs have scalable mode capability. ++ */ ++static inline unsigned long pkvm_host_shadow_iommu_pgtable_pages(int nr_pdev) ++{ ++ unsigned long pgtable_pages = __pkvm_pgtable_total_pages(); ++ unsigned long res; ++ ++ res = pgtable_pages; ++ ++ /* ++ * Similarly to shadow VMs (see the comment in ++ * pkvm_shadow_ept_pgtable_pages()), each device may require ++ * its own level2:level5 page table pages. 
++ */ ++ res += __pkvm_pgtable_max_pages(pgtable_pages) * (nr_pdev - 1); ++ ++ return res; ++} ++ ++ ++u64 pkvm_total_reserve_pages(void); ++ ++int pkvm_init_shadow_vm(struct kvm *kvm); ++void pkvm_teardown_shadow_vm(struct kvm *kvm); ++int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu); ++void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu); ++int pkvm_tlb_remote_flush(struct kvm *kvm); ++int pkvm_tlb_remote_flush_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range); ++int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn); ++#else ++static inline void pkvm_reserve(void) {} ++static inline int pkvm_init_shadow_vm(struct kvm *kvm) { return 0; } ++static inline void pkvm_teardown_shadow_vm(struct kvm *kvm) {} ++static inline int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu) { return 0; } ++static inline void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu) {} ++static inline int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn) { return 0; } ++#endif ++ ++#endif ++ +diff --git a/arch/x86/include/asm/pkvm.h b/arch/x86/include/asm/pkvm.h +new file mode 100644 +index 000000000000..a404dd549a1b +--- /dev/null ++++ b/arch/x86/include/asm/pkvm.h +@@ -0,0 +1,151 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _ASM_X86_PKVM_H ++#define _ASM_X86_PKVM_H ++ ++#include ++#include ++#include ++#include ++ ++/* PKVM Hypercalls */ ++#define PKVM_HC_INIT_FINALISE 1 ++#define PKVM_HC_INIT_SHADOW_VM 2 ++#define PKVM_HC_INIT_SHADOW_VCPU 3 ++#define PKVM_HC_TEARDOWN_SHADOW_VM 4 ++#define PKVM_HC_TEARDOWN_SHADOW_VCPU 5 ++#define PKVM_HC_MMIO_ACCESS 6 ++#define PKVM_HC_ACTIVATE_IOMMU 7 ++#define PKVM_HC_TLB_REMOTE_FLUSH_RANGE 8 ++#define PKVM_HC_SET_MMIO_VE 9 ++#define PKVM_HC_ADD_PTDEV 10 ++ ++/* ++ * 15bits for PASID, DO NOT change it, based on it, ++ * the size of PASID DIR table can kept as one page ++ */ ++#define PKVM_MAX_PASID_BITS 15 ++#define PKVM_MAX_PASID (1 << PKVM_MAX_PASID_BITS) ++ ++#ifdef CONFIG_PKVM_INTEL ++DECLARE_PER_CPU_READ_MOSTLY(bool, pkvm_enabled); ++ ++static inline u64 pkvm_readq(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ return (u64)kvm_hypercall3(PKVM_HC_MMIO_ACCESS, true, ++ sizeof(u64), reg_phys + offset); ++ else ++ return readq(reg + offset); ++} ++ ++static inline u32 pkvm_readl(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ return (u32)kvm_hypercall3(PKVM_HC_MMIO_ACCESS, true, ++ sizeof(u32), reg_phys + offset); ++ else ++ return readl(reg + offset); ++} ++ ++static inline void pkvm_writeq(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset, u64 val) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ kvm_hypercall4(PKVM_HC_MMIO_ACCESS, false, sizeof(u64), ++ reg_phys + offset, val); ++ else ++ writeq(val, reg + offset); ++} ++ ++static inline void pkvm_writel(void __iomem *reg, unsigned long reg_phys, ++ unsigned long offset, u32 val) ++{ ++ if (likely(this_cpu_read(pkvm_enabled))) ++ kvm_hypercall4(PKVM_HC_MMIO_ACCESS, false, sizeof(u32), ++ reg_phys + offset, (u64)val); ++ else ++ writel(val, reg + offset); ++} ++ ++static inline void pkvm_update_iommu_virtual_caps(u64 *cap, u64 *ecap) ++{ ++ if (cap) ++ /* ++ * Set caching mode as linux OS will run in a VM ++ * controlling a virtual IOMMU device emulated ++ * by pkvm. 
++ */ ++ *cap |= 1 << 7; ++ ++ if (ecap) { ++ u64 tmp; ++ ++ /* ++ * Some IOMMU capabilities cannot be directly used by the linux ++ * IOMMU driver after the linux is deprivileged, which is because after ++ * deprivileging, pkvm IOMMU driver will control the physical IOMMU and ++ * it is designed to use physical IOMMU in two ways for better performance ++ * and simpler implementation: ++ * 1. using nested translation with the first-level from the deprivileged ++ * linux IOMMU driver and EPT as second-level. ++ * 2. using second-level only translation with EPT. ++ * The linux IOMMU driver then uses a virtual IOMMU device emulated by ++ * pkvm IOMMU driver. ++ * ++ * Way#1 and way#2 can only support the linux IOMMU driver working in ++ * first-level translation mode or HW pass-through mode. To guarantee ++ * this, let linux IOMMU driver pick up the supported capabilities ++ * when running at the bare metal if pkvm is enabled, to make it a ++ * pkvm-awared IOMMU kernel driver. ++ * ++ * So disable SLTS and Nest. ++ */ ++ *ecap &= ~((1UL << 46) | (1UL << 26)); ++ ++ /* limit PASID to reduce the memory consumptions */ ++ tmp = min_t(u64, (PKVM_MAX_PASID_BITS - 1), ++ (*ecap & GENMASK_ULL(39, 35)) >> 35); ++ *ecap = (*ecap & ~GENMASK_ULL(39, 35)) | (tmp << 35); ++ ++ /* ++ * Disable Device TLB capability for security. ++ * ++ * ATS is only enabled for trusted devices by the host OS. ++ * However with pkvm, the host OS including the device driver ++ * is treated as untrusted software. A malicious software in ++ * host OS may enable ATS for untrusted devices so that one ++ * untrusted device can still exploit the ATS weakness to bypass ++ * VT-d's translation protection and access the isolated memory. ++ * ++ * To resolve this, tell the host IOMMU driver not to enable ++ * any device's ATS as pkvm controls IOMMU not to enable the ++ * device TLB. 
++ */ ++ *ecap &= ~(1UL << 2); ++ } ++} ++#endif ++ ++#ifdef CONFIG_PKVM_GUEST ++ ++void pkvm_guest_init_coco(void); ++bool pkvm_is_protected_guest(void); ++int pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc); ++ ++u64 __pkvm_module_call(u64 fn, struct tdx_module_output *out); ++ ++#else ++ ++static inline void pkvm_guest_init_coco(void) { } ++static inline bool pkvm_is_protected_guest(void) { return false; } ++static inline int ++pkvm_set_mem_host_visibility(unsigned long addr, int numpages, bool enc) { return 0; } ++ ++#endif ++ ++#endif +diff --git a/arch/x86/include/asm/pkvm_image.h b/arch/x86/include/asm/pkvm_image.h +new file mode 100644 +index 000000000000..5ae6a53177eb +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_image.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef __X86_INTEL_PKVM_IMAGE_H ++#define __X86_INTEL_PKVM_IMAGE_H ++ ++#if defined(CONFIG_PKVM_INTEL_DEBUG) || defined(__PKVM_HYP__) ++/* No prefix will be added */ ++#define PKVM_DECLARE(type, f) type f ++#define pkvm_sym(sym) sym ++#else ++/* prefix is added by Makefile */ ++#define PKVM_DECLARE(type, f) type __pkvm_##f ++#define pkvm_sym(sym) __pkvm_##sym ++#endif ++ ++#define __PKVM_CONCAT(a, b) a ## b ++#define PKVM_CONCAT(a, b) __PKVM_CONCAT(a, b) ++ ++#ifdef LINKER_SCRIPT ++ ++#define PKVM_SECTION_NAME(NAME) .pkvm##NAME ++ ++#define PKVM_SECTION_SYMBOL_NAME(NAME) \ ++ PKVM_CONCAT(__pkvm_section_, PKVM_SECTION_NAME(NAME)) ++ ++#define BEGIN_PKVM_SECTION(NAME) \ ++ PKVM_SECTION_NAME(NAME) : { \ ++ PKVM_SECTION_SYMBOL_NAME(NAME) = .; ++ ++#define END_PKVM_SECTION \ ++ } ++ ++#define PKVM_SECTION(NAME) \ ++ BEGIN_PKVM_SECTION(NAME) \ ++ *(NAME NAME##.*) \ ++ END_PKVM_SECTION ++ ++/* ++ * Defines a linker script alias of a kernel-proper symbol referenced by ++ * PKVM code. ++ */ ++#define PKVM_ALIAS(sym) pkvm_sym(sym) = sym; ++ ++#endif /* LINKER_SCRIPT */ ++ ++#endif /* __X86_INTEL_PKVM_IMAGE_H */ +diff --git a/arch/x86/include/asm/pkvm_image_vars.h b/arch/x86/include/asm/pkvm_image_vars.h +new file mode 100644 +index 000000000000..94d8d6910299 +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_image_vars.h +@@ -0,0 +1,23 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __ASM_x86_PKVM_IMAGE_VARS_H ++#define __ASM_x86_PKVM_IMAGE_VARS_H ++ ++#ifndef CONFIG_PKVM_INTEL_DEBUG ++ ++#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK ++PKVM_ALIAS(physical_mask); ++#endif ++ ++#ifdef CONFIG_AMD_MEM_ENCRYPT ++PKVM_ALIAS(sme_me_mask); ++#endif ++ ++PKVM_ALIAS(__default_kernel_pte_mask); ++PKVM_ALIAS(vmcs12_field_offsets); ++PKVM_ALIAS(nr_vmcs12_fields); ++#endif ++ ++#endif +diff --git a/arch/x86/include/asm/pkvm_spinlock.h b/arch/x86/include/asm/pkvm_spinlock.h +new file mode 100644 +index 000000000000..e524116fe15d +--- /dev/null ++++ b/arch/x86/include/asm/pkvm_spinlock.h +@@ -0,0 +1,62 @@ ++/* ++ * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 ++ * Copyright (C) 2018-2022 Intel Corporation ++ * ++ * pkvm runs in a self-contained environment ++ * and requires a self-contained spinlock implementation ++ * which doesn't rely on any other external symbols. 
++ * ++ * This is arch specific implementation ++ * */ ++#ifndef _ASM_X86_PKVM_SPINLOCK_H ++#define _ASM_X86_PKVM_SPINLOCK_H ++ ++#include ++ ++typedef struct arch_pkvm_spinlock { ++ union { ++ u64 head_tail; ++ struct { ++ u32 head; ++ u32 tail; ++ }; ++ }; ++} arch_pkvm_spinlock_t; ++ ++#define __ARCH_PKVM_SPINLOCK_UNLOCKED { { 0 } } ++ ++static inline void arch_pkvm_spin_lock(arch_pkvm_spinlock_t *lock) ++{ ++ /* The lock function atomically increments and exchanges the head ++ * counter of the queue. If the old head of the queue is equal to the ++ * tail, we have locked the spinlock. Otherwise we have to wait. ++ */ ++ ++ asm volatile (" movl $0x1,%%eax\n" ++ " lock xaddl %%eax,%[head]\n" ++ " cmpl %%eax,%[tail]\n" ++ " jz 1f\n" ++ "2: pause\n" ++ " cmpl %%eax,%[tail]\n" ++ " jnz 2b\n" ++ "1:\n" ++ : ++ : ++ [head] "m"(lock->head), ++ [tail] "m"(lock->tail) ++ : "cc", "memory", "eax"); ++} ++ ++static inline void arch_pkvm_spin_unlock(arch_pkvm_spinlock_t *lock) ++{ ++ /* Increment tail of queue */ ++ asm volatile (" lock incl %[tail]\n" ++ : ++ : [tail] "m" (lock->tail) ++ : "cc", "memory"); ++ ++} ++ ++static inline void arch_pkvm_assert_lock_held(arch_pkvm_spinlock_t *lock) { } ++ ++#endif +diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h +index 020c81a7c729..e44b97f69ccd 100644 +--- a/arch/x86/include/asm/tdx.h ++++ b/arch/x86/include/asm/tdx.h +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + + /* + * SW-defined error codes. +@@ -35,22 +36,6 @@ struct tdx_module_output { + u64 r11; + }; + +-/* +- * Used by the #VE exception handler to gather the #VE exception +- * info from the TDX module. This is a software only structure +- * and not part of the TDX module/VMM ABI. +- */ +-struct ve_info { +- u64 exit_reason; +- u64 exit_qual; +- /* Guest Linear (virtual) Address */ +- u64 gla; +- /* Guest Physical Address */ +- u64 gpa; +- u32 instr_len; +- u32 instr_info; +-}; +- + #ifdef CONFIG_INTEL_TDX_GUEST + + void __init tdx_early_init(void); +@@ -61,8 +46,6 @@ u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + + void tdx_get_ve_info(struct ve_info *ve); + +-bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); +- + void tdx_safe_halt(void); + + bool tdx_early_handle_ve(struct pt_regs *regs); +diff --git a/arch/x86/include/asm/virt_exception.h b/arch/x86/include/asm/virt_exception.h +new file mode 100644 +index 000000000000..ec75523624d7 +--- /dev/null ++++ b/arch/x86/include/asm/virt_exception.h +@@ -0,0 +1,41 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_VIRT_EXCEPTION_H ++#define _ASM_X86_VIRT_EXCEPTION_H ++ ++#include ++ ++#ifndef __ASSEMBLY__ ++ ++/* ++ * Used by the #VE exception handler to gather the #VE exception ++ * info from the TDX module. This is a software only structure ++ * and not part of the TDX module/VMM ABI. 
++ */ ++struct ve_info { ++ u64 exit_reason; ++ u64 exit_qual; ++ /* Guest Linear (virtual) Address */ ++ u64 gla; ++ /* Guest Physical Address */ ++ u64 gpa; ++ u32 instr_len; ++ u32 instr_info; ++}; ++ ++int ve_handle_mmio(struct pt_regs *regs, struct ve_info *ve); ++ ++void get_ve_info(struct ve_info *ve); ++ ++bool handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); ++ ++struct ve_x86_ops { ++ bool (*mmio_read)(int size, unsigned long addr, unsigned long *val); ++ bool (*mmio_write)(int size, unsigned long addr, unsigned long val); ++ bool (*handle_virt_exception)(struct pt_regs *regs, struct ve_info *ve); ++ void (*get_ve_info)(struct ve_info *ve); ++}; ++ ++extern struct ve_x86_ops ve_x86_ops; ++ ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_X86_VIRT_EXCEPTION_H */ +diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h +index 498dc600bd5c..0af92c414c19 100644 +--- a/arch/x86/include/asm/vmx.h ++++ b/arch/x86/include/asm/vmx.h +@@ -68,6 +68,7 @@ + #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) + #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) + #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) ++#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) + #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) + #define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) + #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) +@@ -223,6 +224,8 @@ enum vmcs_field { + VMREAD_BITMAP_HIGH = 0x00002027, + VMWRITE_BITMAP = 0x00002028, + VMWRITE_BITMAP_HIGH = 0x00002029, ++ VE_INFO_ADDR = 0x0000202A, ++ VE_INFO_ADDR_HIGH = 0x0000202B, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, +@@ -322,6 +325,10 @@ enum vmcs_field { + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, ++ EXIT_IO_RCX = 0x00006402, ++ EXIT_IO_RSI = 0x00006404, ++ EXIT_IO_RDI = 0x00006406, ++ EXIT_IO_RIP = 0x00006408, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, +diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h +index 46de10a809ec..f513dc0ae610 100644 +--- a/arch/x86/include/uapi/asm/kvm.h ++++ b/arch/x86/include/uapi/asm/kvm.h +@@ -532,4 +532,7 @@ struct kvm_pmu_event_filter { + #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ + #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + ++#define KVM_X86_DEFAULT_VM 0 ++#define KVM_X86_PROTECTED_VM 1 ++ + #endif /* _ASM_X86_KVM_H */ +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index f10a921ee756..9aeff7157d86 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -56,6 +56,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + + obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o + obj-$(CONFIG_ACRN_GUEST) += acrn.o ++obj-$(CONFIG_PKVM_GUEST) += pkvm.o + + ifdef CONFIG_X86_FEATURE_NAMES + quiet_cmd_mkcapflags = MKCAP $@ +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 553bfbfc3a1b..e658f7c7c950 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ b/arch/x86/kernel/cpu/hypervisor.c +@@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + #ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, + #endif ++#ifdef CONFIG_PKVM_GUEST ++ &x86_hyper_pkvm, ++#endif + }; + + enum 
x86_hypervisor_type x86_hyper_type; +diff --git a/arch/x86/kernel/cpu/pkvm.c b/arch/x86/kernel/cpu/pkvm.c +new file mode 100644 +index 000000000000..e68ae5f3e263 +--- /dev/null ++++ b/arch/x86/kernel/cpu/pkvm.c +@@ -0,0 +1,33 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * PKVM detection support. ++ */ ++ ++#include ++#include ++ ++static u32 __init pkvm_detect(void) ++{ ++ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) ++ return hypervisor_cpuid_base("PKVMPKVMPKVM", 0); ++ ++ return 0; ++} ++ ++static void __init pkvm_init_platform(void) ++{ ++ pkvm_guest_init_coco(); ++} ++ ++static bool pkvm_x2apic_available(void) ++{ ++ return boot_cpu_has(X86_FEATURE_X2APIC); ++} ++ ++const __initconst struct hypervisor_x86 x86_hyper_pkvm = { ++ .name = "PKVM", ++ .detect = pkvm_detect, ++ .type = X86_HYPER_PKVM, ++ .init.init_platform = pkvm_init_platform, ++ .init.x2apic_available = pkvm_x2apic_available, ++}; +diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c +index a58c6bc1cd68..bd113065f802 100644 +--- a/arch/x86/kernel/idt.c ++++ b/arch/x86/kernel/idt.c +@@ -69,7 +69,7 @@ static const __initconst struct idt_data early_idts[] = { + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), + #endif +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), + #endif + }; +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 804a252382da..6af7e3d985b8 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1327,6 +1328,8 @@ void __init setup_arch(char **cmdline_p) + #endif + + unwind_init(); ++ ++ pkvm_reserve(); + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index c0a5a4f225d9..f026ffb62de6 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -1354,7 +1355,7 @@ DEFINE_IDTENTRY(exc_device_not_available) + } + } + +-#ifdef CONFIG_INTEL_TDX_GUEST ++#if defined(CONFIG_INTEL_TDX_GUEST) || defined(CONFIG_PKVM_GUEST) + + #define VE_FAULT_STR "VE fault" + +@@ -1426,15 +1427,15 @@ DEFINE_IDTENTRY(exc_virtualization_exception) + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ +- tdx_get_ve_info(&ve); ++ get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* +- * If tdx_handle_virt_exception() could not process ++ * If handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ +- if (!tdx_handle_virt_exception(regs, &ve)) ++ if (!handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0, ve.gla); + + cond_local_irq_disable(regs); +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index 78ccb5ec3c0e..c24394c7c245 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -111,6 +111,35 @@ PHDRS { + note PT_NOTE FLAGS(0); /* ___ */ + } + ++#ifdef CONFIG_PKVM_INTEL ++#include ++ ++#define PKVM_TEXT \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_text_start = .; \ ++ *(PKVM_SECTION_NAME(.text)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_text_end = .; ++ ++#define PKVM_BSS \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_bss_start = .; \ ++ *(PKVM_SECTION_NAME(.bss)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_bss_end = .; ++ ++#define PKVM_DATA \ ++ . 
= ALIGN(PAGE_SIZE); \ ++ __pkvm_data_start = .; \ ++ *(PKVM_SECTION_NAME(.data)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_data_end = .; ++#else ++#define PKVM_TEXT ++#define PKVM_BSS ++#define PKVM_DATA ++#endif ++ + SECTIONS + { + #ifdef CONFIG_X86_32 +@@ -150,6 +179,7 @@ SECTIONS + ALIGN_ENTRY_TEXT_END + SOFTIRQENTRY_TEXT + STATIC_CALL_TEXT ++ PKVM_TEXT + *(.gnu.warning) + + #ifdef CONFIG_RETPOLINE +@@ -166,6 +196,7 @@ SECTIONS + . = ALIGN(PAGE_SIZE); + + X86_ALIGN_RODATA_BEGIN ++ PKVM_RODATA + RO_DATA(PAGE_SIZE) + X86_ALIGN_RODATA_END + +@@ -181,6 +212,7 @@ SECTIONS + /* 32 bit has nosave before _edata */ + NOSAVE_DATA + #endif ++ PKVM_DATA + + PAGE_ALIGNED_DATA(PAGE_SIZE) + +@@ -394,6 +426,7 @@ SECTIONS + . = ALIGN(PAGE_SIZE); + *(BSS_MAIN) + BSS_DECRYPTED ++ PKVM_BSS + . = ALIGN(PAGE_SIZE); + __bss_stop = .; + } +@@ -507,6 +540,10 @@ INIT_PER_CPU(irq_stack_backing_store); + "fixed_percpu_data is not at start of per-cpu area"); + #endif + ++#ifdef CONFIG_PKVM_INTEL ++#include ++#endif ++ + #ifdef CONFIG_RETHUNK + . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); + . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index 67be7f217e37..c025486d728f 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -87,6 +87,30 @@ config KVM_INTEL + To compile this as a module, choose M here: the module + will be called kvm-intel. + ++config PKVM_INTEL ++ bool "pKVM for Intel processors support" ++ depends on KVM_INTEL=y ++ depends on X86_64 ++ depends on !KSM ++ select INTEL_IOMMU_DEFAULT_ON ++ select INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON ++ help ++ Provides support for pKVM on Intel processors. ++ ++ This will deprivilege the host as a VM running in non-root VMX ++ operation mode, and pKVM hypervisor will run in root VMX ++ operation mode. ++ ++ If unsure, say N. ++ ++config PKVM_INTEL_DEBUG ++ bool "Debug pKVM" ++ depends on PKVM_INTEL ++ help ++ Provides debug support for pKVM. ++ ++ If unsure, say N. 
++ + config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL +diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile +index f453a0f96e24..7f1ccc11c610 100644 +--- a/arch/x86/kvm/Makefile ++++ b/arch/x86/kvm/Makefile +@@ -33,6 +33,7 @@ endif + + obj-$(CONFIG_KVM) += kvm.o + obj-$(CONFIG_KVM_INTEL) += kvm-intel.o ++obj-$(CONFIG_PKVM_INTEL) += vmx/pkvm/ + obj-$(CONFIG_KVM_AMD) += kvm-amd.o + + AFLAGS_svm/vmenter.o := -iquote $(obj) +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index 59804be91b5b..a8ef25726236 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -8,6 +8,11 @@ + + extern bool __read_mostly enable_mmio_caching; + ++#define PT64_PT_BITS 9 ++#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) ++#define PT32_PT_BITS 10 ++#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) ++ + #define PT_WRITABLE_SHIFT 1 + #define PT_USER_SHIFT 2 + +@@ -36,6 +41,17 @@ extern bool __read_mostly enable_mmio_caching; + #define PT32_ROOT_LEVEL 2 + #define PT32E_ROOT_LEVEL 3 + ++#define PT64_LEVEL_BITS 9 ++#define PT64_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT64_LEVEL_BITS) ++#define PT_LEVEL_INDEX(addr, level) \ ++ (((addr) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) ++ ++#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK ++#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) ++#else ++#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) ++#endif ++ + #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index d30325e297a0..2c89e6f2c457 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -255,18 +255,6 @@ static inline bool kvm_available_flush_tlb_with_range(void) + return kvm_x86_ops.tlb_remote_flush_with_range; + } + +-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, +- struct kvm_tlb_range *range) +-{ +- int ret = -ENOTSUPP; +- +- if (range && kvm_x86_ops.tlb_remote_flush_with_range) +- ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); +- +- if (ret) +- kvm_flush_remote_tlbs(kvm); +-} +- + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages) + { +@@ -275,7 +263,8 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + range.start_gfn = start_gfn; + range.pages = pages; + +- kvm_flush_remote_tlbs_with_range(kvm, &range); ++ if (kvm_flush_remote_tlbs_with_range(kvm, &range)) ++ kvm_flush_remote_tlbs(kvm); + } + + static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, +@@ -1166,7 +1155,8 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) + drop_spte(kvm, sptep); + + if (flush) +- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, ++ kvm_flush_remote_tlbs_with_address(kvm, ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), + KVM_PAGES_PER_HPAGE(sp->role.level)); + } + +@@ -1619,7 +1609,7 @@ static void __rmap_add(struct kvm *kvm, + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_zap_all_rmap_sptes(kvm, rmap_head); + kvm_flush_remote_tlbs_with_address( +- kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); ++ kvm, gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } + } + +@@ -2950,6 +2940,9 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) + return; + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ return; ++ + __direct_pte_prefetch(vcpu, sp, sptep); + } + +@@ -4243,13 +4236,19 @@ 
static bool is_page_fault_stale(struct kvm_vcpu *vcpu, + static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) + { + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); +- ++ struct kvm_pinned_page *ppage = NULL; + unsigned long mmu_seq; + int r; + + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) { ++ ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT); ++ if (!ppage) ++ return -ENOMEM; ++ } ++ + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + +@@ -4291,6 +4290,19 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault + r = __direct_map(vcpu, fault); + } + ++ if (ppage) { ++ struct page *page = kvm_pfn_to_refcounted_page(fault->pfn); ++ if (r == RET_PF_FIXED && page) { ++ ppage->page = pfn_to_page(fault->pfn); ++ get_page(ppage->page); ++ spin_lock(&vcpu->kvm->pkvm.pinned_page_lock); ++ list_add(&ppage->list, &vcpu->kvm->pkvm.pinned_pages); ++ spin_unlock(&vcpu->kvm->pkvm.pinned_page_lock); ++ } else { ++ kfree(ppage); ++ } ++ } ++ + out_unlock: + if (is_tdp_mmu_fault) + read_unlock(&vcpu->kvm->mmu_lock); +@@ -6428,7 +6440,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); + + if (kvm_available_flush_tlb_with_range()) +- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, ++ kvm_flush_remote_tlbs_with_address(kvm, ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), + KVM_PAGES_PER_HPAGE(sp->role.level)); + else + need_tlb_flush = 1; +diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h +index 1f4f5e703f13..ff34fe5e915d 100644 +--- a/arch/x86/kvm/mmu/paging_tmpl.h ++++ b/arch/x86/kvm/mmu/paging_tmpl.h +@@ -938,7 +938,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) + kvm_flush_remote_tlbs_with_address(vcpu->kvm, +- sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); ++ kvm_mmu_page_get_gfn(sp, sptep - sp->spt), ++ KVM_PAGES_PER_HPAGE(sp->role.level)); + + if (!rmap_can_add(vcpu)) + break; +diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h +index 7670c13ce251..c6d8508594a0 100644 +--- a/arch/x86/kvm/mmu/spte.h ++++ b/arch/x86/kvm/mmu/spte.h +@@ -55,6 +55,7 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); + #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) + #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) + #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) ++#define SHADOW_PT_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) + + /* + * The mask/shift to use for saving the original R/X bits when marking the PTE +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index c3b0f973375b..15639983c03e 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -9,6 +9,7 @@ + + #include + #include ++#include + + static bool __read_mostly tdp_mmu_enabled = true; + module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +@@ -1037,8 +1038,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + bool wrprot = false; + + WARN_ON(sp->role.level != fault->goal_level); +- if (unlikely(!fault->slot)) ++ if (unlikely(!fault->slot)) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); ++ ++ if (pkvm_set_mmio_ve(vcpu, iter->gfn)) ++ return RET_PF_RETRY; ++ } + else + wrprot = 
make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefetch, true, +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index c871a6d6364c..13bec526a693 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4727,6 +4727,11 @@ static void svm_vm_destroy(struct kvm *kvm) + sev_vm_destroy(kvm); + } + ++static bool svm_is_vm_type_supported(unsigned long type) ++{ ++ return type == KVM_X86_DEFAULT_VM; ++} ++ + static int svm_vm_init(struct kvm *kvm) + { + if (!pause_filter_count || !pause_filter_thresh) +@@ -4753,6 +4758,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .vcpu_free = svm_vcpu_free, + .vcpu_reset = svm_vcpu_reset, + ++ .is_vm_type_supported = svm_is_vm_type_supported, + .vm_size = sizeof(struct kvm_svm), + .vm_init = svm_vm_init, + .vm_destroy = svm_vm_destroy, +diff --git a/arch/x86/kvm/vmx/pkvm/.gitignore b/arch/x86/kvm/vmx/pkvm/.gitignore +new file mode 100644 +index 000000000000..3ac372c4eca7 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/.gitignore +@@ -0,0 +1 @@ ++pkvm.lds +diff --git a/arch/x86/kvm/vmx/pkvm/Makefile b/arch/x86/kvm/vmx/pkvm/Makefile +new file mode 100644 +index 000000000000..6ca49fffb4dd +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/Makefile +@@ -0,0 +1,29 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++KVM_PKVM ?= ../../../../../virt/kvm/pkvm ++ccflags-y += -I $(srctree)/arch/x86/kvm ++ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include ++ ++pkvm-obj := pkvm_host.o pkvm_debugfs.o ++ ++pkvm-obj += $(KVM_PKVM)/pkvm.o ++ ++obj-$(CONFIG_PKVM_INTEL) += $(pkvm-obj) ++obj-$(CONFIG_PKVM_INTEL) += hyp/ ++ ++always-y := pkvm_constants.h pkvm-constants.s ++ ++define rule_gen_hyp_constants ++ $(call filechk,offsets,__PKVM_CONSTANTS_H__) ++endef ++ ++CFLAGS_pkvm-constants.o = -I $(src)/include ++CFLAGS_pkvm-constants.o += -I $(srctree)/virt/kvm/pkvm ++$(obj)/pkvm-constants.s: $(src)/pkvm_constants.c FORCE ++ $(call if_changed_dep,cc_s_c) ++ ++$(obj)/pkvm_constants.h: $(obj)/pkvm-constants.s FORCE ++ $(call if_changed_rule,gen_hyp_constants) ++ ++obj-intel-pkvm := $(addprefix $(obj)/, $(pkvm-obj)) ++$(obj-intel-pkvm): $(obj)/pkvm_constants.h +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile +new file mode 100644 +index 000000000000..682e34a7901f +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile +@@ -0,0 +1,79 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++KVM_PKVM = virt/kvm/pkvm ++ ++ccflags-y += -I $(srctree)/$(KVM_PKVM)/ ++ccflags-y += -I $(srctree)/arch/x86/kvm ++ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include ++ccflags-y += -fno-stack-protector ++ccflags-y += -D__DISABLE_EXPORTS ++ccflags-y += -D__PKVM_HYP__ ++ ++pkvm-hyp-obj := $(obj)/vmx_asm.o $(obj)/vmexit.o \ ++ $(obj)/memory.o $(obj)/early_alloc.o \ ++ $(obj)/pgtable.o $(obj)/mmu.o \ ++ $(obj)/ept.o $(obj)/pkvm.o \ ++ $(obj)/idt.o $(obj)/irq.o \ ++ $(obj)/init_finalise.o $(obj)/nested.o \ ++ $(obj)/vmx.o $(obj)/vmsr.o \ ++ $(obj)/iommu.o $(obj)/iommu_debug.o \ ++ $(obj)/mem_protect.o $(obj)/lapic.o \ ++ $(obj)/ptdev.o $(obj)/iommu_spgt.o \ ++ $(obj)/io_emulate.o $(obj)/pci.o \ ++ $(obj)/trace.o ++ ++virt-dir := $(objtree)/$(KVM_PKVM) ++pkvm-hyp-obj += $(virt-dir)/page_alloc.o ++ ++ifndef CONFIG_PKVM_INTEL_DEBUG ++lib-dir := $(obj)/lib ++lib2-dir := $(objtree)/lib ++pkvm-hyp-obj += $(lib-dir)/memset_64.o ++pkvm-hyp-obj += $(lib-dir)/memcpy_64.o ++pkvm-hyp-obj += $(lib2-dir)/find_bit.o ++ifdef CONFIG_DEBUG_LIST ++pkvm-hyp-obj += $(lib-dir)/list_debug.o ++endif ++ifdef CONFIG_RETPOLINE 
++pkvm-hyp-obj += $(lib-dir)/retpoline.o ++endif ++endif ++ ++dir-obj := $(lib-dir) $(lib2-dir) $(virt-dir) ++ ++pkvm-rename-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-obj)) ++pkvm-obj := pkvm.o ++ ++$(dir-obj): FORCE ++ifndef CONFIG_PKVM_INTEL_DEBUG ++ $(Q)mkdir -p $(lib-dir) $(lib2-dir) ++endif ++ $(Q)mkdir -p $(virt-dir) ++ ++%.pkvm.o: %.c $(dir-obj) FORCE ++ $(call if_changed_rule,cc_o_c) ++%.pkvm.o: %.S $(dir-obj) FORCE ++ $(call if_changed_rule,as_o_S) ++ ++$(obj)/pkvm.lds: $(src)/pkvm.lds.S FORCE ++ $(call if_changed_dep,cpp_lds_S) ++ ++LDFLAGS_pkvm.tmp.o := -r -T ++$(obj)/pkvm.tmp.o: $(obj)/pkvm.lds $(pkvm-rename-obj) FORCE ++ $(call if_changed,ld) ++ ++$(obj)/pkvm.o: $(obj)/pkvm.tmp.o FORCE ++ $(call if_changed,pkvmcopy) ++ ++quiet_cmd_pkvmcopy = PKVMPCOPY $@ ++ifdef CONFIG_PKVM_INTEL_DEBUG ++ cmd_pkvmcopy = $(OBJCOPY) --prefix-symbols= $< $@ ++else ++ cmd_pkvmcopy = $(OBJCOPY) --prefix-symbols=__pkvm_ --remove-section=.retpoline_sites --remove-section=.return_sites $< $@ ++endif ++ ++obj-$(CONFIG_PKVM_INTEL) += $(pkvm-obj) ++ ++# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. ++# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. ++KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/bug.h b/arch/x86/kvm/vmx/pkvm/hyp/bug.h +new file mode 100644 +index 000000000000..019c5f2755fe +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/bug.h +@@ -0,0 +1,23 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_BUG_H ++#define __PKVM_BUG_H ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++ ++#define PKVM_ASSERT(c) \ ++do { \ ++ if (!(c)) { \ ++ pr_err("assertion failed %s: %d: %s\n", \ ++ __FILE__, __LINE__, #c); \ ++ BUG(); \ ++ } \ ++} while (0) ++#else ++#define PKVM_ASSERT(c) do { } while (!(c)) ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/cpu.h b/arch/x86/kvm/vmx/pkvm/hyp/cpu.h +new file mode 100644 +index 000000000000..cd3d60034890 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/cpu.h +@@ -0,0 +1,53 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_CPU_H_ ++#define _PKVM_CPU_H_ ++ ++static inline u64 pkvm_msr_read(u32 reg) ++{ ++ u32 msrl, msrh; ++ ++ asm volatile (" rdmsr ":"=a"(msrl), "=d"(msrh) : "c" (reg)); ++ return (((u64)msrh << 32U) | msrl); ++} ++ ++#define pkvm_rdmsr(msr, low, high) \ ++do { \ ++ u64 __val = pkvm_msr_read(msr); \ ++ (void)((low) = (u32)__val); \ ++ (void)((high) = (u32)(__val >> 32)); \ ++} while (0) ++ ++#define pkvm_rdmsrl(msr, val) \ ++ ((val) = pkvm_msr_read((msr))) ++ ++static inline void pkvm_msr_write(u32 reg, u64 msr_val) ++{ ++ asm volatile (" wrmsr " : : "c" (reg), "a" ((u32)msr_val), "d" ((u32)(msr_val >> 32U))); ++} ++ ++#define pkvm_wrmsr(msr, low, high) \ ++do { \ ++ u64 __val = (u64)(high) << 32 | (u64)(low); \ ++ pkvm_msr_write(msr, __val); \ ++} while (0) ++ ++#define pkvm_wrmsrl(msr, val) pkvm_msr_write(msr, val) ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++static inline u64 get_pcpu_id(void) ++{ ++ return raw_smp_processor_id(); ++} ++#else ++/* this function shall only be used during pkvm runtime */ ++static inline u64 get_pcpu_id(void) ++{ ++ return pkvm_msr_read(MSR_GS_BASE); ++} ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/debug.h b/arch/x86/kvm/vmx/pkvm/hyp/debug.h +new file mode 100644 +index 000000000000..29d9804cf580 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/debug.h 
+@@ -0,0 +1,20 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_DEBUG_H_ ++#define _PKVM_DEBUG_H_ ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++#include ++#define pkvm_dbg(f, x...) pr_debug(f, ## x) ++#define pkvm_info(f, x...) pr_info(f, ## x) ++#define pkvm_err(f, x...) pr_err(f, ## x) ++#else ++#define pkvm_dbg(x...) ++#define pkvm_info(x...) ++#define pkvm_err(x...) ++#endif ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c +new file mode 100644 +index 000000000000..766ff87e989d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.c +@@ -0,0 +1,76 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include "pkvm_spinlock.h" ++#include "pgtable.h" ++ ++static unsigned long base; ++static unsigned long end; ++static unsigned long cur; ++ ++static pkvm_spinlock_t early_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_mm_ops pkvm_early_alloc_mm_ops; ++ ++unsigned long pkvm_early_alloc_nr_used_pages(void) ++{ ++ return (cur - base) >> PAGE_SHIFT; ++} ++ ++void *pkvm_early_alloc_contig(unsigned int nr_pages) ++{ ++ unsigned long size = (nr_pages << PAGE_SHIFT); ++ void *ret; ++ ++ if (!nr_pages) ++ return NULL; ++ ++ pkvm_spin_lock(&early_lock); ++ if (end - cur < size) { ++ pkvm_spin_unlock(&early_lock); ++ return NULL; ++ } ++ ret = (void *)cur; ++ cur += size; ++ pkvm_spin_unlock(&early_lock); ++ ++ memset(ret, 0, size); ++ ++ return ret; ++} ++ ++void *pkvm_early_alloc_page(void) ++{ ++ return pkvm_early_alloc_contig(1); ++} ++ ++static void pkvm_early_alloc_get_page(void *addr) { } ++static void pkvm_early_alloc_put_page(void *addr) { } ++static void pkvm_early_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static int pkvm_early_page_count(void *vaddr) ++{ ++ return 512; ++} ++ ++void pkvm_early_alloc_init(void *virt, unsigned long size) ++{ ++ base = cur = (unsigned long)virt; ++ end = base + size; ++ ++ pkvm_early_alloc_mm_ops.zalloc_page = pkvm_early_alloc_page; ++ pkvm_early_alloc_mm_ops.get_page = pkvm_early_alloc_get_page; ++ pkvm_early_alloc_mm_ops.put_page = pkvm_early_alloc_put_page; ++ pkvm_early_alloc_mm_ops.phys_to_virt = pkvm_phys_to_virt; ++ pkvm_early_alloc_mm_ops.virt_to_phys = pkvm_virt_to_phys; ++ pkvm_early_alloc_mm_ops.page_count = pkvm_early_page_count; ++ pkvm_early_alloc_mm_ops.flush_tlb = pkvm_early_flush_tlb; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h +new file mode 100644 +index 000000000000..59bede62cd03 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/early_alloc.h +@@ -0,0 +1,15 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_EARLY_ALLOC_H ++#define __PKVM_EARLY_ALLOC_H ++ ++unsigned long pkvm_early_alloc_nr_used_pages(void); ++void *pkvm_early_alloc_contig(unsigned int nr_pages); ++void *pkvm_early_alloc_page(void); ++void pkvm_early_alloc_init(void *virt, unsigned long size); ++ ++extern struct pkvm_mm_ops pkvm_early_alloc_mm_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.c b/arch/x86/kvm/vmx/pkvm/hyp/ept.c +new file mode 100644 +index 000000000000..4d89c8972115 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.c +@@ -0,0 +1,1066 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include 
++ ++#include ++#include ++#include ++ ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "pgtable.h" ++#include "ept.h" ++#include "pkvm_spinlock.h" ++#include "memory.h" ++#include "iommu.h" ++#include "vmx.h" ++#include "mem_protect.h" ++#include "debug.h" ++#include "ptdev.h" ++#include "io_emulate.h" ++ ++static struct pkvm_pool host_ept_pool; ++static struct pkvm_pgtable host_ept; ++static struct pkvm_pgtable host_ept_notlbflush; ++static pkvm_spinlock_t _host_ept_lock = __PKVM_SPINLOCK_UNLOCKED; ++ ++static struct pkvm_pool shadow_pgt_pool; ++static struct rsvd_bits_validate ept_zero_check; ++ ++static void flush_tlb_noop(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static inline void pkvm_init_ept_page(void *page) ++{ ++ /* ++ * Normal VM: Never clear the "suppress #VE" bit, so #VE will never ++ * be triggered. ++ * ++ * Protected VM: pkvm sets EPT_VIOLATION_VE for Protected VM, "suppress ++ * #VE" bit must be set to get EPT violation, thus pkvm can build the ++ * EPT mapping for memory region, and clear "suppress #VE" for mmio ++ * region, thus mmio can trigger #VE. ++ * ++ * For simplicity, unconditionally initialize SEPT to set "suppress ++ * #VE". ++ */ ++ memset64((u64 *)page, EPT_PROT_DEF, 512); ++} ++ ++static void *ept_zalloc_page(struct pkvm_pool *pool) ++{ ++ void *page; ++ ++ page = pkvm_alloc_pages(pool, 0); ++ if (page) ++ pkvm_init_ept_page(page); ++ ++ return page; ++} ++ ++static void *host_ept_zalloc_page(void) ++{ ++ /* ++ * Also initiailize the host ept with SUPPRESS_VE bit set although this ++ * bit is ignored in host ept. Because host_ept and shadow_ept share the ++ * same ept_ops, this will make the ept_entry_mapped work for both ++ * host_ept and shadow_ept. ++ */ ++ return ept_zalloc_page(&host_ept_pool); ++} ++ ++static void host_ept_get_page(void *vaddr) ++{ ++ pkvm_get_page(&host_ept_pool, vaddr); ++} ++ ++static void host_ept_put_page(void *vaddr) ++{ ++ pkvm_put_page(&host_ept_pool, vaddr); ++} ++ ++static void host_ept_flush_cache(void *vaddr, unsigned int size) ++{ ++ if (!pkvm_hyp->iommu_coherent) ++ pkvm_clflush_cache_range(vaddr, size); ++} ++ ++static void host_ept_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long size) ++{ ++ struct pkvm_host_vcpu *hvcpu; ++ int i; ++ ++ for (i = 0; i < pkvm_hyp->num_cpus; i++) { ++ hvcpu = pkvm_hyp->host_vm.host_vcpus[i]; ++ ++ kvm_make_request(PKVM_REQ_TLB_FLUSH_HOST_EPT, &hvcpu->vmx.vcpu); ++ pkvm_kick_vcpu(&hvcpu->vmx.vcpu); ++ } ++ ++ /* ++ * Also needs to flush the IOTLB as host EPT is used ++ * as second-stage page table for some devices. 
++ */ ++ pkvm_iommu_flush_iotlb(pgt, vaddr, size); ++} ++ ++struct pkvm_mm_ops host_ept_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = host_ept_zalloc_page, ++ .get_page = host_ept_get_page, ++ .put_page = host_ept_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = host_ept_flush_tlb, ++ .flush_cache = host_ept_flush_cache, ++}; ++ ++static struct pkvm_mm_ops host_ept_mm_ops_no_tlbflush = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = host_ept_zalloc_page, ++ .get_page = host_ept_get_page, ++ .put_page = host_ept_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = flush_tlb_noop, ++ .flush_cache = host_ept_flush_cache, ++}; ++ ++static bool ept_entry_present(void *ptep) ++{ ++ u64 val = *(u64 *)ptep; ++ ++ return !!(val & VMX_EPT_RWX_MASK); ++} ++ ++static bool ept_entry_mapped(void *ptep) ++{ ++ /* ++ * Both present and non-present (shadow)EPT entry is counted as a ++ * mapped entry because a non-present entry with non-zero value may ++ * contain page state and ownership information created through map ++ * operation. So simply count non-zero entry as mapped to cover both ++ * cases. ++ * ++ * Since we initialize every pte with SUPPRESS_VE bit set, which means ++ * if a pte does not equal to the default value, it has been mapped. ++ */ ++ return !(*(u64 *)ptep == EPT_PROT_DEF); ++} ++ ++static bool ept_entry_huge(void *ptep) ++{ ++ return is_large_pte(*(u64 *)ptep); ++} ++ ++static void ept_entry_mkhuge(void *ptep) ++{ ++ *(u64 *)ptep |= PT_PAGE_SIZE_MASK; ++} ++ ++static unsigned long ept_entry_to_phys(void *ptep) ++{ ++ return *(u64 *)ptep & PT64_BASE_ADDR_MASK; ++} ++ ++static u64 ept_entry_to_prot(void *ptep) ++{ ++ u64 prot = *(u64 *)ptep & ~(PT64_BASE_ADDR_MASK); ++ ++ return prot & ~PT_PAGE_SIZE_MASK; ++} ++ ++static int ept_entry_to_index(unsigned long vaddr, int level) ++{ ++ return SHADOW_PT_INDEX(vaddr, level); ++} ++ ++static bool ept_entry_is_leaf(void *ptep, int level) ++{ ++ if (level == PG_LEVEL_4K || ++ !ept_entry_present(ptep) || ++ ept_entry_huge(ptep)) ++ return true; ++ ++ return false; ++ ++} ++ ++static int ept_level_entry_size(int level) ++{ ++ return PAGE_SIZE / PT64_ENT_PER_PAGE; ++} ++ ++static int ept_level_to_entries(int level) ++{ ++ return PT64_ENT_PER_PAGE; ++} ++ ++static u64 ept_level_page_mask(int level) ++{ ++ return (~((1UL << PT64_LEVEL_SHIFT(level)) - 1)); ++} ++ ++static unsigned long ept_level_to_size(int level) ++{ ++ return KVM_HPAGE_SIZE(level); ++} ++ ++static void ept_set_entry(void *sptep, u64 spte) ++{ ++ WRITE_ONCE(*(u64 *)sptep, spte); ++} ++ ++struct pkvm_pgtable_ops ept_ops = { ++ .pgt_entry_present = ept_entry_present, ++ .pgt_entry_mapped = ept_entry_mapped, ++ .pgt_entry_huge = ept_entry_huge, ++ .pgt_entry_mkhuge = ept_entry_mkhuge, ++ .pgt_entry_to_phys = ept_entry_to_phys, ++ .pgt_entry_to_prot = ept_entry_to_prot, ++ .pgt_entry_to_index = ept_entry_to_index, ++ .pgt_level_page_mask = ept_level_page_mask, ++ .pgt_entry_is_leaf = ept_entry_is_leaf, ++ .pgt_level_entry_size = ept_level_entry_size, ++ .pgt_level_to_entries = ept_level_to_entries, ++ .pgt_level_to_size = ept_level_to_size, ++ .pgt_set_entry = ept_set_entry, ++ .default_prot = EPT_PROT_DEF, ++}; ++ ++bool is_pgt_ops_ept(struct pkvm_pgtable *pgt) ++{ ++ return pgt && (pgt->pgt_ops == &ept_ops); ++} ++ ++int pkvm_host_ept_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot) ++{ ++ return 
pkvm_pgtable_map(&host_ept, vaddr_start, phys_start, size, ++ pgsz_mask, prot, NULL); ++} ++ ++int pkvm_host_ept_unmap(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size) ++{ ++ return pkvm_pgtable_unmap_safe(&host_ept, vaddr_start, phys_start, size, NULL); ++} ++ ++void pkvm_host_ept_lookup(unsigned long vaddr, unsigned long *pphys, ++ u64 *pprot, int *plevel) ++{ ++ pkvm_pgtable_lookup(&host_ept, vaddr, pphys, pprot, plevel); ++} ++ ++void pkvm_host_ept_destroy(void) ++{ ++ pkvm_pgtable_destroy(&host_ept, NULL); ++} ++ ++void host_ept_lock(void) ++{ ++ pkvm_spin_lock(&_host_ept_lock); ++} ++ ++void host_ept_unlock(void) ++{ ++ pkvm_spin_unlock(&_host_ept_lock); ++} ++ ++void pkvm_flush_host_ept(void) ++{ ++ u64 eptp = pkvm_construct_eptp(host_ept.root_pa, host_ept.level); ++ ++ flush_ept(eptp); ++} ++ ++static void reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, ++ u64 pa_bits_rsvd, bool execonly, ++ int huge_page_level) ++{ ++ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); ++ u64 large_1g_rsvd = 0, large_2m_rsvd = 0; ++ u64 bad_mt_xwr; ++ ++ if (huge_page_level < PG_LEVEL_1G) ++ large_1g_rsvd = rsvd_bits(7, 7); ++ if (huge_page_level < PG_LEVEL_2M) ++ large_2m_rsvd = rsvd_bits(7, 7); ++ ++ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); ++ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); ++ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; ++ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; ++ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; ++ ++ /* large page */ ++ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; ++ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; ++ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; ++ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; ++ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; ++ ++ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ ++ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ ++ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ ++ if (!execonly) { ++ /* bits 0..2 must not be 100 unless VMX capabilities allow it */ ++ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); ++ } ++ rsvd_check->bad_mt_xwr = bad_mt_xwr; ++} ++ ++int pkvm_host_ept_init(struct pkvm_pgtable_cap *cap, ++ void *ept_pool_base, unsigned long ept_pool_pages) ++{ ++ unsigned long pfn = __pkvm_pa(ept_pool_base) >> PAGE_SHIFT; ++ int ret; ++ u8 pa_bits; ++ ++ ret = pkvm_pool_init(&host_ept_pool, pfn, ept_pool_pages, 0); ++ if (ret) ++ return ret; ++ ++ pa_bits = get_max_physaddr_bits(); ++ if (!pa_bits) ++ return -EINVAL; ++ reset_rsvds_bits_mask_ept(&ept_zero_check, rsvd_bits(pa_bits, 63), ++ vmx_has_ept_execute_only(), ++ fls(cap->allowed_pgsz) - 1); ++ ++ pkvm_hyp->host_vm.ept = &host_ept; ++ ret = pkvm_pgtable_init(&host_ept, &host_ept_mm_ops, &ept_ops, cap, true); ++ if (ret) ++ return ret; ++ ++ /* ++ * Prepare an instance for host EPT without doing TLB flushing. ++ * This is used for some fastpath code which wants to avoid ++ * doing TLB flushing for each host EPT modifications. It doesn't ++ * mean TLB flushing is not needed. 
The user still needs to do ++ * TLB flushing explicitly after finishing all the host EPT ++ * modifications. ++ */ ++ host_ept_notlbflush = host_ept; ++ host_ept_notlbflush.mm_ops = &host_ept_mm_ops_no_tlbflush; ++ pkvm_hyp->host_vm.ept_notlbflush = &host_ept_notlbflush; ++ ++ return 0; ++} ++ ++int handle_host_ept_violation(struct kvm_vcpu *vcpu, bool *skip_instruction) ++{ ++ unsigned long hpa, gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); ++ struct mem_range range, cur; ++ bool is_memory = find_mem_range(gpa, &range); ++ u64 prot = pkvm_mkstate(HOST_EPT_DEF_MMIO_PROT, PKVM_PAGE_OWNED); ++ int level; ++ int ret; ++ *skip_instruction = true; ++ ++ if (is_memory) { ++ pkvm_err("%s: not handle for memory address 0x%lx\n", __func__, gpa); ++ return -EPERM; ++ } ++ ++ ret = try_emul_host_mmio(vcpu, gpa); ++ if (ret != -EINVAL) { ++ return ret; ++ } ++ ++ pkvm_spin_lock(&_host_ept_lock); ++ ++ pkvm_pgtable_lookup(&host_ept, gpa, &hpa, NULL, &level); ++ if (hpa != INVALID_ADDR) { ++ ret = -EAGAIN; ++ goto out; ++ } ++ ++ do { ++ unsigned long size = ept_level_to_size(level); ++ ++ cur.start = ALIGN_DOWN(gpa, size); ++ cur.end = cur.start + size - 1; ++ /* ++ * TODO: ++ * check if this MMIO belongs to a secure VM pass-through device. ++ */ ++ if ((1 << level & host_ept.allowed_pgsz) && ++ mem_range_included(&cur, &range) && ++ !is_mem_range_overlap_iommu(cur.start, cur.end)) ++ break; ++ level--; ++ } while (level != PG_LEVEL_NONE); ++ ++ if (level == PG_LEVEL_NONE) { ++ pkvm_err("pkvm: No valid range: gpa 0x%lx, cur 0x%lx ~ 0x%lx size 0x%lx level %d\n", ++ gpa, cur.start, cur.end, cur.end - cur.start + 1, level); ++ ret = -EPERM; ++ goto out; ++ } ++ ++ pkvm_dbg("pkvm: %s: cur MMIO range 0x%lx ~ 0x%lx size 0x%lx level %d\n", ++ __func__, cur.start, cur.end, cur.end - cur.start + 1, level); ++ ++ ret = pkvm_host_ept_map(cur.start, cur.start, cur.end - cur.start + 1, ++ 1 << level, prot); ++ if (ret == -ENOMEM) { ++ /* TODO: reclaim MMIO range pages first and try do map again */ ++ pkvm_dbg("%s: no memory to set host ept for addr 0x%lx\n", ++ __func__, gpa); ++ } ++out: ++ pkvm_spin_unlock(&_host_ept_lock); ++ ++ if (ret == 0) ++ *skip_instruction = false; ++ return ret; ++} ++ ++int pkvm_shadow_ept_pool_init(void *ept_pool_base, unsigned long ept_pool_pages) ++{ ++ unsigned long pfn = __pkvm_pa(ept_pool_base) >> PAGE_SHIFT; ++ ++ return pkvm_pool_init(&shadow_pgt_pool, pfn, ept_pool_pages, 0); ++} ++ ++static void *shadow_pgt_zalloc_page(void) ++{ ++ return ept_zalloc_page(&shadow_pgt_pool); ++} ++ ++static void shadow_pgt_get_page(void *vaddr) ++{ ++ pkvm_get_page(&shadow_pgt_pool, vaddr); ++} ++ ++static void shadow_pgt_put_page(void *vaddr) ++{ ++ pkvm_put_page(&shadow_pgt_pool, vaddr); ++} ++ ++static void shadow_ept_flush_tlb(struct pkvm_pgtable *pgt, ++ unsigned long addr, ++ unsigned long size) ++{ ++ struct pkvm_shadow_vm *shadow_vm = sept_to_shadow_vm(pgt); ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct kvm_vcpu *vcpu; ++ s64 shadow_vcpu_handle; ++ int i, shadow_vm_handle = shadow_vm->shadow_vm_handle; ++ ++ for (i = 0; i < shadow_vm->created_vcpus; i++) { ++ shadow_vcpu_handle = to_shadow_vcpu_handle(shadow_vm_handle, i); ++ shadow_vcpu = get_shadow_vcpu(shadow_vcpu_handle); ++ /* ++ * For a shadow_vcpu which is already teardown, no need to kick ++ * it as its shadow EPT tlb entries are already flushed when ++ * this shadow vcpu is doing vmclear before teardown. 
++ */
++		if (!shadow_vcpu)
++			continue;
++
++		/*
++		 * If this shadow_vcpu is not loaded then there is no vcpu
++		 * pointer for it, so we can skip this remote tlb flushing.
++		 */
++		vcpu = READ_ONCE(shadow_vcpu->vcpu);
++		if (!vcpu)
++			goto next;
++
++		kvm_make_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu);
++		pkvm_kick_vcpu(vcpu);
++next:
++		put_shadow_vcpu(shadow_vcpu_handle);
++	}
++}
++
++static struct pkvm_mm_ops shadow_ept_mm_ops = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = shadow_ept_flush_tlb,
++};
++
++/*
++ * mm_ops for shadow second-level IOMMU page tables. These tables
++ * are similar to shadow EPT tables, as they also have the EPT
++ * format and their memory is reserved together with shadow EPT
++ * pages. The difference is that this mm_ops doesn't have the
++ * flush_tlb callback.
++ *
++ * Precisely, shadow_sl_iommu_pgt_mm_ops is used for two kinds of
++ * 2nd level iommu page tables:
++ *
++ * - pgstate_pgt which is reused as the IOMMU page table for a protected
++ *   VM with passthrough devices. In this case the memory is pinned,
++ *   and the mapping is not allowed to be removed from pgstate_pgt,
++ *   so the flush_tlb callback is not needed.
++ *
++ * - Host shadow IOMMU page tables used for the host's devices when
++ *   legacy IOMMU is used. They do not need the flush_tlb callback
++ *   either, since IOTLB flush after unmapping pages from these
++ *   tables is performed in other ways: either as a part of vIOMMU
++ *   IOTLB flush emulation when initiated by the host, or together
++ *   with host EPT TLB flush when ensuring pKVM memory protection.
++ *
++ * TODO: refactor the code: move all the management of both types
++ * of 2nd level iommu page tables to iommu_spgt.c behind some common
++ * API. That also means refactoring the pkvm_ptdev structure.
++ */
++static struct pkvm_mm_ops shadow_sl_iommu_pgt_mm_ops = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = flush_tlb_noop,
++};
++
++/*
++ * Flushing the cache is needed when modifying IOMMU page table entries
++ * if the IOMMU is not coherent. This ops has a flush_cache callback
++ * so it can be used for a pgtable which serves as an IOMMU page table
++ * with a noncoherent IOMMU.
++ */
++static struct pkvm_mm_ops shadow_sl_iommu_pgt_mm_ops_noncoherency = {
++	.phys_to_virt = pkvm_phys_to_virt,
++	.virt_to_phys = pkvm_virt_to_phys,
++	.zalloc_page = shadow_pgt_zalloc_page,
++	.get_page = shadow_pgt_get_page,
++	.put_page = shadow_pgt_put_page,
++	.page_count = pkvm_page_count,
++	.flush_tlb = flush_tlb_noop,
++	.flush_cache = pkvm_clflush_cache_range,
++};
++
++static int pkvm_pgstate_pgt_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level,
++				     void *ptep, struct pgt_flush_data *flush_data, void *arg)
++{
++	struct pkvm_pgtable_map_data *data = arg;
++	struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops;
++	unsigned long level_size = pgt_ops->pgt_level_to_size(level);
++	unsigned long map_phys = data->phys & PAGE_MASK;
++	struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt);
++	int ret;
++
++	/*
++	 * It is possible that another CPU just created the same mapping when
++	 * multiple EPT violations happen on different CPUs.
++ */ ++ if (pgt_ops->pgt_entry_present(ptep)) { ++ unsigned long phys = pgt_ops->pgt_entry_to_phys(ptep); ++ ++ /* ++ * Check if the existing mapping is the same as the wanted one. ++ * If not the same, report an error so that the map_leaf caller ++ * will not map the different addresses in its shadow EPT. ++ */ ++ if (phys != map_phys) { ++ pkvm_err("%s: gpa 0x%lx @level%d old_phys 0x%lx != new_phys 0x%lx\n", ++ __func__, vaddr, level, phys, map_phys); ++ return -EPERM; ++ } ++ ++ /* ++ * The pgstate_pgt now is EPT format with fixed property bits. No ++ * need to check and update property bits for pgstate_pgt. ++ */ ++ goto out; ++ } ++ ++ switch (vm->vm_type) { ++ case KVM_X86_DEFAULT_VM: ++ ret = __pkvm_host_share_guest(map_phys, pgt, vaddr, level_size, data->prot); ++ break; ++ case KVM_X86_PROTECTED_VM: ++ if (vm->need_prepopulation) ++ /* ++ * As pgstate pgt is the source of the shadow EPT, only after pgstate ++ * pgt is set up, shadow EPT can be set up. So protected VM will not be ++ * able to use the memory donated in pgstate pgt before its shadow EPT ++ * is setting up. So it is safe to use the fastpath to donate all the ++ * pages to improve the pre-population performance. TLB flushing ++ * can be done in the caller after the pre-population is done but before ++ * setting up its shadow EPT. ++ */ ++ ret = __pkvm_host_donate_guest_fastpath(map_phys, pgt, vaddr, ++ level_size, data->prot); ++ else ++ ret = __pkvm_host_donate_guest(map_phys, pgt, vaddr, ++ level_size, data->prot); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) { ++ pkvm_err("%s failed: ret %d vm_type %ld L2 GPA 0x%lx level %d HPA 0x%lx prot 0x%llx\n", ++ __func__, ret, vm->vm_type, vaddr, level, map_phys, data->prot); ++ return ret; ++ } ++ ++out: ++ /* Increase the physical address for the next mapping */ ++ data->phys += level_size; ++ ++ return 0; ++} ++ ++static int pkvm_pgstate_pgt_free_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level, ++ void *ptep, struct pgt_flush_data *flush_data, void *arg) ++{ ++ unsigned long phys = pgt->pgt_ops->pgt_entry_to_phys(ptep); ++ unsigned long size = pgt->pgt_ops->pgt_level_to_size(level); ++ struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt); ++ int ret; ++ ++ if (!pgt->pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ /* ++ * For normal VM, call __pkvm_host_unshare_guest() to unshare all previous ++ * shared pages. A page table entry with present bits indicates the page ++ * was shared before. ++ * ++ * For protected VM, call __pkvm_host_undonate_guest() to undonate all ++ * previous donated pages, the donated pages are indicated by their page ++ * table entries which state is present. ++ * ++ * Since the pgtable_free_cb in this current page walker is still ++ * walking the page state table, the __pkvm_host_unshare_guest() or ++ * __pkvm_host_undonate_guest() are not allowed to release page state ++ * table pages. So get_page() should be called before these APIs, then ++ * put_page() to allow pgtable_free_cb free table pages with correct ++ * refcount. ++ */ ++ switch(vm->vm_type) { ++ case KVM_X86_DEFAULT_VM: ++ pgt->mm_ops->get_page(ptep); ++ ret = __pkvm_host_unshare_guest(phys, pgt, vaddr, size); ++ pgt->mm_ops->put_page(ptep); ++ flush_data->flushtlb |= true; ++ break; ++ case KVM_X86_PROTECTED_VM: { ++ struct mem_range range; ++ /* ++ * before returning to host, the memory page previously owned by ++ * protected VM shall be memset to 0 to avoid secret leakage. 
++ */ ++ if (find_mem_range(phys, &range)) ++ memset(pgt->mm_ops->phys_to_virt(phys), 0, min(size, range.end - phys)); ++ pgt->mm_ops->get_page(ptep); ++ ret = __pkvm_host_undonate_guest(phys, pgt, vaddr, size); ++ pgt->mm_ops->put_page(ptep); ++ flush_data->flushtlb |= true; ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("%s failed: ret %d vm_type %ld phys 0x%lx GPA 0x%lx size 0x%lx\n", ++ __func__, ret, vm->vm_type, phys, vaddr, size); ++ return ret; ++} ++ ++static void __invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size) ++{ ++ struct pkvm_shadow_vm *vm = sept_desc_to_shadow_vm(desc); ++ struct pkvm_pgtable *sept = &desc->sept; ++ ++ if (!size) ++ return; ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (!is_valid_eptp(desc->shadow_eptp)) ++ goto out; ++ ++ pkvm_pgtable_unmap_nosplit(sept, vaddr, size, NULL); ++ ++ /* ++ * As for normal VM, its memory might need to be swapped out ++ * or other kinds of management from primary VM thus should ++ * unmap from pgstate pgt as well. ++ * ++ * As for protected VM, its memory is pinned thus no need to ++ * unmap from pgstate pgt. ++ */ ++ if (vm->vm_type == KVM_X86_DEFAULT_VM) ++ pkvm_pgtable_unmap_nosplit(&vm->pgstate_pgt, vaddr, size, ++ pkvm_pgstate_pgt_free_leaf); ++out: ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++void pkvm_invalidate_shadow_ept(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_pgtable *sept = &desc->sept; ++ unsigned long size = sept->pgt_ops->pgt_level_to_size(sept->level + 1); ++ ++ __invalidate_shadow_ept_with_range(desc, 0, size); ++} ++ ++void pkvm_invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size) ++{ ++ __invalidate_shadow_ept_with_range(desc, vaddr, size); ++} ++ ++void pkvm_shadow_ept_deinit(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_shadow_vm *vm = sept_desc_to_shadow_vm(desc); ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (desc->shadow_eptp) ++ pkvm_pgtable_destroy(&desc->sept, NULL); ++ ++ memset(desc, 0, sizeof(struct shadow_ept_desc)); ++ ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++int pkvm_shadow_ept_init(struct shadow_ept_desc *desc) ++{ ++ struct pkvm_pgtable_cap cap = { ++ .level = 4, ++ .allowed_pgsz = 1 << PG_LEVEL_4K, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ int ret; ++ ++ if (vmx_ept_has_2m_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_2M; ++ if (vmx_ept_has_1g_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_1G; ++ ++ memset(desc, 0, sizeof(struct shadow_ept_desc)); ++ ++ ret = pkvm_pgtable_init(&desc->sept, &shadow_ept_mm_ops, &ept_ops, &cap, true); ++ if (ret) ++ return ret; ++ ++ desc->shadow_eptp = pkvm_construct_eptp(desc->sept.root_pa, cap.level); ++ flush_ept(desc->shadow_eptp); ++ ++ return 0; ++} ++ ++void pkvm_pgstate_pgt_deinit(struct pkvm_shadow_vm *vm) ++{ ++ pkvm_spin_lock(&vm->lock); ++ ++ pkvm_pgtable_destroy(&vm->pgstate_pgt, pkvm_pgstate_pgt_free_leaf); ++ ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++int pkvm_pgstate_pgt_init(struct pkvm_shadow_vm *vm) ++{ ++ struct pkvm_pgtable *pgt = &vm->pgstate_pgt; ++ struct pkvm_pgtable_cap cap = { ++ .level = pkvm_hyp->ept_iommu_pgt_level, ++ .allowed_pgsz = pkvm_hyp->ept_iommu_pgsz_mask, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ ++ return pkvm_pgtable_init(pgt, &shadow_sl_iommu_pgt_mm_ops, &ept_ops, &cap, true); ++} ++ ++struct pkvm_mm_ops *pkvm_shadow_sl_iommu_pgt_get_mm_ops(bool coherent) ++{ ++ return coherent ? 
&shadow_sl_iommu_pgt_mm_ops ++ : &shadow_sl_iommu_pgt_mm_ops_noncoherency; ++} ++ ++void pkvm_shadow_sl_iommu_pgt_update_coherency(struct pkvm_pgtable *pgt, bool coherent) ++{ ++ if (coherent) ++ pkvm_pgtable_set_mm_ops(pgt, &shadow_sl_iommu_pgt_mm_ops); ++ else ++ pkvm_pgtable_set_mm_ops(pgt, &shadow_sl_iommu_pgt_mm_ops_noncoherency); ++} ++ ++/* ++ * virtual_ept_mm_ops is used as the ops for the ept constructed by ++ * KVM high in host. ++ * The physical address in this ept is the host VM GPA, which is ++ * the same with HPA. ++ */ ++struct pkvm_mm_ops virtual_ept_mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++void pkvm_guest_ept_deinit(struct shadow_vcpu_state *shadow_vcpu) ++{ ++ struct pkvm_pgtable *vept = &shadow_vcpu->vept; ++ ++ memset(vept, 0, sizeof(struct pkvm_pgtable)); ++} ++ ++void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp) ++{ ++ struct pkvm_pgtable_cap cap = { ++ .level = 4, ++ .allowed_pgsz = 1 << PG_LEVEL_4K, ++ .table_prot = VMX_EPT_RWX_MASK, ++ }; ++ ++ /* ++ * TODO: we just assume guest will use page level the HW supported, ++ * it actually need align with KVM high ++ */ ++ if ((guest_eptp & VMX_EPTP_PWL_MASK) == VMX_EPTP_PWL_5) ++ cap.level = 5; ++ if (vmx_ept_has_2m_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_2M; ++ if (vmx_ept_has_1g_page()) ++ cap.allowed_pgsz |= 1 << PG_LEVEL_1G; ++ ++ pkvm_pgtable_init(&shadow_vcpu->vept, &virtual_ept_mm_ops, &ept_ops, &cap, false); ++ shadow_vcpu->vept.root_pa = host_gpa2hpa(guest_eptp & PT64_BASE_ADDR_MASK); ++} ++ ++static bool is_access_violation(u64 ept_entry, u64 exit_qual) ++{ ++ bool access_violation = false; ++ ++ if (/* Caused by data read */ ++ (((exit_qual & 0x1UL) != 0UL) && ((ept_entry & VMX_EPT_READABLE_MASK) == 0)) || ++ /* Caused by data write */ ++ (((exit_qual & 0x2UL) != 0UL) && ((ept_entry & VMX_EPT_WRITABLE_MASK) == 0)) || ++ /* Caused by instruction fetch */ ++ (((exit_qual & 0x4UL) != 0UL) && ((ept_entry & VMX_EPT_EXECUTABLE_MASK) == 0))) { ++ access_violation = true; ++ } ++ ++ return access_violation; ++} ++ ++static int populate_pgstate_pgt(struct pkvm_pgtable *pgt) ++{ ++ struct pkvm_shadow_vm *vm = pgstate_pgt_to_shadow_vm(pgt); ++ struct list_head *ptdev_head = &vm->ptdev_head; ++ struct pkvm_ptdev *ptdev, *tmp; ++ u64 *prot_override; ++ bool populated; ++ u64 prot; ++ int ret; ++ ++ list_for_each_entry(ptdev, ptdev_head, vm_node) { ++ /* No need to populate if vpgt.root_pa doesn't exist */ ++ if (!ptdev->vpgt.root_pa) ++ continue; ++ ++ populated = false; ++ list_for_each_entry(tmp, ptdev_head, vm_node) { ++ if (tmp == ptdev) ++ break; ++ if (tmp->vpgt.root_pa == ptdev->vpgt.root_pa) { ++ populated = true; ++ break; ++ } ++ } ++ ++ if (populated) ++ continue; ++ ++ if (ptdev->vpgt.pgt_ops != pgt->pgt_ops) { ++ /* Populate with EPT format */ ++ if (is_pgt_ops_ept(pgt)) { ++ prot = VMX_EPT_RWX_MASK; ++ } else { ++ pkvm_err("pkvm: not supported populating\n"); ++ return -EOPNOTSUPP; ++ } ++ prot_override = &prot; ++ } else { ++ prot_override = NULL; ++ } ++ ++ ret = pkvm_pgtable_sync_map(&ptdev->vpgt, pgt, prot_override, ++ pkvm_pgstate_pgt_map_leaf); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static bool allow_shadow_ept_mapping(struct pkvm_shadow_vm *vm, ++ u64 gpa, unsigned long hpa, ++ unsigned long size) ++{ ++ struct pkvm_pgtable *pgstate_pgt = &vm->pgstate_pgt; ++ unsigned long mapped_hpa; ++ int level; ++ ++ /* ++ * VM will be marked as need_prepopulation when a passthrough device is ++ * attached. 
With this flag being set, VM's pgstate_pgt will be pre-populated ++ * before handling EPT violation. After the population is done, this flag ++ * can be cleared. ++ */ ++ if (vm->need_prepopulation) { ++ unsigned long size; ++ ++ if (populate_pgstate_pgt(pgstate_pgt)) ++ return false; ++ /* ++ * Explicitly flush TLB of the host EPT after populating the page ++ * state pgt. ++ * ++ * During the population, some pages are donated from primary VM to ++ * this VM with the fastpath interface to avoid doing TLB flushing ++ * during each iteration of the page donation so that to have a fast ++ * population performance. So still need to do TLB flushing in the ++ * end after finishing all the donations. ++ */ ++ size = host_ept.pgt_ops->pgt_level_to_size(host_ept.level + 1); ++ host_ept_flush_tlb(&host_ept, 0, size); ++ vm->need_prepopulation = false; ++ } ++ ++ /* ++ * Lookup the page state pgt to check if the mapping is already created ++ * or not. ++ */ ++ pkvm_pgtable_lookup(pgstate_pgt, gpa, &mapped_hpa, NULL, &level); ++ ++ if ((pgstate_pgt->pgt_ops->pgt_level_to_size(level) < size) || ++ mapped_hpa == INVALID_ADDR) { ++ u64 prot; ++ /* ++ * Page state pgt doesn't have mapping yet, or it has mapping ++ * but with a smaller size, so try to map with the desired size ++ * in page state pgt first. Although page state pgt may already ++ * have all the desired mappings with smaller size, map_leaf ++ * can help to check if the mapped phys matches with the desired ++ * hpa to guarantee shadow EPT maps GPA to the right HPA. ++ */ ++ if (is_pgt_ops_ept(pgstate_pgt)) { ++ prot = VMX_EPT_RWX_MASK; ++ } else { ++ pkvm_err("%s: pgstate_pgt format not supported\n", __func__); ++ return false; ++ } ++ ++ if (pkvm_pgtable_map(pgstate_pgt, gpa, hpa, size, ++ 0, prot, pkvm_pgstate_pgt_map_leaf)) { ++ pkvm_err("%s: pgstate_pgt map gpa 0x%llx hpa 0x%lx size 0x%lx failed\n", ++ __func__, gpa, hpa, size); ++ return false; ++ } ++ } else if (mapped_hpa != hpa) { ++ /* ++ * Page state pgt has mapping already, so check if the mapped ++ * phys matches with the hpa, and report an error if doesn't ++ * match. 
++ */
++		pkvm_err("pgstate_pgt mismatch: mapped_hpa 0x%lx != 0x%lx for gpa 0x%llx\n",
++			 mapped_hpa, hpa, gpa);
++		return false;
++	}
++
++	return true;
++}
++
++enum sept_handle_ret
++pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali)
++{
++	struct pkvm_shadow_vm *vm = shadow_vcpu->vm;
++	struct shadow_ept_desc *desc = &vm->sept_desc;
++	struct pkvm_pgtable *sept = &desc->sept;
++	struct pkvm_pgtable_ops *pgt_ops = sept->pgt_ops;
++	struct pkvm_pgtable *vept = &shadow_vcpu->vept;
++	enum sept_handle_ret ret = PKVM_NOT_HANDLED;
++	unsigned long phys;
++	int level;
++	u64 gprot, rsvd_chk_gprot;
++
++	pkvm_spin_lock(&vm->lock);
++
++	pkvm_pgtable_lookup(vept, l2_gpa, &phys, &gprot, &level);
++	if (phys == INVALID_ADDR)
++		/* Guest EPT not valid, back to kvm-high */
++		goto out;
++
++	if (is_access_violation(gprot, exit_quali))
++		/* Guest EPT error, refuse to handle in shadow ept */
++		goto out;
++
++	rsvd_chk_gprot = gprot;
++	/* is_rsvd_spte() needs the PAGE_SIZE bit set for huge mappings */
++	if (level != PG_LEVEL_4K)
++		pgt_ops->pgt_entry_mkhuge(&rsvd_chk_gprot);
++
++	if (is_rsvd_spte(&ept_zero_check, rsvd_chk_gprot, level)) {
++		ret = PKVM_INJECT_EPT_MISC;
++	} else {
++		unsigned long level_size = pgt_ops->pgt_level_to_size(level);
++		unsigned long gpa = ALIGN_DOWN(l2_gpa, level_size);
++		unsigned long hpa = ALIGN_DOWN(host_gpa2hpa(phys), level_size);
++		/*
++		 * Still set the SUPPRESS_VE bit here as some mappings may still
++		 * cause an EPT_VIOLATION and we want these EPT_VIOLATIONs to
++		 * cause a vmexit.
++		 */
++		u64 prot = (gprot & EPT_PROT_MASK) | EPT_PROT_DEF;
++
++		if (allow_shadow_ept_mapping(vm, gpa, hpa, level_size) &&
++		    !pkvm_pgtable_map(sept, gpa, hpa, level_size, 0, prot, NULL))
++			ret = PKVM_HANDLED;
++	}
++out:
++	pkvm_spin_unlock(&vm->lock);
++	return ret;
++}
++
++void pkvm_flush_shadow_ept(struct shadow_ept_desc *desc)
++{
++	if (!is_valid_eptp(desc->shadow_eptp))
++		return;
++
++	flush_ept(desc->shadow_eptp);
++}
++
++void pkvm_shadow_clear_suppress_ve(struct kvm_vcpu *vcpu, unsigned long gfn)
++{
++	unsigned long gpa = gfn * PAGE_SIZE;
++	struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu);
++	struct shadow_vcpu_state *shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu;
++	struct pkvm_shadow_vm *vm = shadow_vcpu->vm;
++	struct shadow_ept_desc *desc = &vm->sept_desc;
++	struct pkvm_pgtable *sept = &desc->sept;
++
++	if (!shadow_vcpu_is_protected(shadow_vcpu))
++		return;
++
++	/*
++	 * Set the mmio_pte with prot 0, which means it is invalid and with the
++	 * "Suppress #VE" bit cleared. Accessing this pte will trigger #VE.
++ */ ++ pkvm_pgtable_annotate(sept, gpa, PAGE_SIZE, SHADOW_EPT_MMIO_ENTRY); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.h b/arch/x86/kvm/vmx/pkvm/hyp/ept.h +new file mode 100644 +index 000000000000..a0b5e701fa31 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.h +@@ -0,0 +1,70 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_EPT_H ++#define __PKVM_EPT_H ++ ++#include "pkvm_hyp.h" ++ ++#define HOST_EPT_DEF_MEM_PROT (VMX_EPT_RWX_MASK | \ ++ (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)) ++#define HOST_EPT_DEF_MMIO_PROT (VMX_EPT_RWX_MASK | \ ++ (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT)) ++#define EPT_PROT_MASK (VMX_EPT_RWX_MASK | VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT) ++#define EPT_PROT_DEF SUPPRESS_VE ++ ++#define SHADOW_EPT_MMIO_ENTRY 0 ++ ++enum sept_handle_ret { ++ PKVM_NOT_HANDLED, ++ PKVM_HANDLED, ++ PKVM_INJECT_EPT_MISC, ++}; ++ ++void host_ept_lock(void); ++void host_ept_unlock(void); ++int pkvm_host_ept_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot); ++int pkvm_host_ept_unmap(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size); ++void pkvm_host_ept_lookup(unsigned long vaddr, unsigned long *pphys, ++ u64 *pprot, int *plevel); ++void pkvm_host_ept_destroy(void); ++int pkvm_host_ept_init(struct pkvm_pgtable_cap *cap, void *ept_pool_base, ++ unsigned long ept_pool_pages); ++int handle_host_ept_violation(struct kvm_vcpu *vcpu, bool *skip); ++void pkvm_flush_host_ept(void); ++int pkvm_shadow_ept_pool_init(void *ept_pool_base, unsigned long ept_pool_pages); ++int pkvm_shadow_ept_init(struct shadow_ept_desc *desc); ++void pkvm_shadow_ept_deinit(struct shadow_ept_desc *desc); ++void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp); ++void pkvm_guest_ept_deinit(struct shadow_vcpu_state *shadow_vcpu); ++enum sept_handle_ret ++pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali); ++void pkvm_invalidate_shadow_ept(struct shadow_ept_desc *desc); ++void pkvm_invalidate_shadow_ept_with_range(struct shadow_ept_desc *desc, ++ unsigned long vaddr, unsigned long size); ++void pkvm_flush_shadow_ept(struct shadow_ept_desc *desc); ++void pkvm_shadow_clear_suppress_ve(struct kvm_vcpu *vcpu, unsigned long gfn); ++ ++int pkvm_pgstate_pgt_init(struct pkvm_shadow_vm *vm); ++void pkvm_pgstate_pgt_deinit(struct pkvm_shadow_vm *vm); ++ ++struct pkvm_mm_ops *pkvm_shadow_sl_iommu_pgt_get_mm_ops(bool coherent); ++void pkvm_shadow_sl_iommu_pgt_update_coherency(struct pkvm_pgtable *pgt, bool coherent); ++ ++bool is_pgt_ops_ept(struct pkvm_pgtable *pgt); ++ ++static inline bool is_valid_eptp(u64 eptp) ++{ ++ if (!eptp || (eptp == INVALID_GPA)) ++ return false; ++ ++ /* TODO: other bits check */ ++ return true; ++} ++ ++extern struct pkvm_pgtable_ops ept_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/idt.S b/arch/x86/kvm/vmx/pkvm/hyp/idt.S +new file mode 100644 +index 000000000000..87252724a501 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/idt.S +@@ -0,0 +1,67 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++ ++.macro save_frame ++ push %r15 ++ push %r14 ++ push %r13 ++ push %r12 ++ push %r11 ++ push %r10 ++ push %r9 ++ push %r8 ++ push %_ASM_DI ++ push %_ASM_SI ++ push %_ASM_BP ++ push %_ASM_SP ++ push %_ASM_DX ++ push %_ASM_CX ++ push %_ASM_BX ++ push %_ASM_AX ++.endm ++ ++.macro restore_frame ++ pop 
%_ASM_AX ++ pop %_ASM_BX ++ pop %_ASM_CX ++ pop %_ASM_DX ++ pop %_ASM_SP ++ pop %_ASM_BP ++ pop %_ASM_SI ++ pop %_ASM_DI ++ pop %r8 ++ pop %r9 ++ pop %r10 ++ pop %r11 ++ pop %r12 ++ pop %r13 ++ pop %r14 ++ pop %r15 ++.endm ++ ++SYM_CODE_START(noop_handler) ++ UNWIND_HINT_EMPTY ++ save_frame ++ ++ call handle_noop ++ ++ restore_frame ++ ++ iretq ++SYM_CODE_END(noop_handler) ++ ++SYM_CODE_START(nmi_handler) ++ UNWIND_HINT_EMPTY ++ save_frame ++ ++ call handle_nmi ++ ++ restore_frame ++ ++ iretq ++SYM_CODE_END(nmi_handler) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +new file mode 100644 +index 000000000000..035dc092917e +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +@@ -0,0 +1,371 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "memory.h" ++#include "pgtable.h" ++#include "mmu.h" ++#include "ept.h" ++#include "vmx.h" ++#include "nested.h" ++#include "debug.h" ++#include "iommu.h" ++#include "iommu_internal.h" ++#include "mem_protect.h" ++#include "lapic.h" ++#include "pci.h" ++ ++void *pkvm_vmemmap_base; ++void *pkvm_mmu_pgt_base; ++void *host_ept_pgt_base; ++static void *iommu_mem_base; ++static void *shadow_ept_base; ++ ++static int divide_memory_pool(phys_addr_t phys, unsigned long size) ++{ ++ int data_struct_size = pkvm_data_struct_pages( ++ PKVM_PAGES + PKVM_EXTRA_PAGES, ++ PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES ++ + PKVM_HOST_VCPU_VMCS_PAGES, pkvm_hyp->num_cpus) << PAGE_SHIFT; ++ void *virt = __pkvm_va(phys + data_struct_size); ++ unsigned long nr_pages; ++ ++ pkvm_early_alloc_init(virt, size - data_struct_size); ++ ++ nr_pages = pkvm_vmemmap_pages(sizeof(struct pkvm_page)); ++ pkvm_vmemmap_base = pkvm_early_alloc_contig(nr_pages); ++ if (!pkvm_vmemmap_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_mmu_pgtable_pages(); ++ pkvm_mmu_pgt_base = pkvm_early_alloc_contig(nr_pages); ++ if (!pkvm_mmu_pgt_base) ++ return -ENOMEM; ++ ++ nr_pages = host_ept_pgtable_pages(); ++ host_ept_pgt_base = pkvm_early_alloc_contig(nr_pages); ++ if (!host_ept_pgt_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_MAX_PASID_PDEV_NUM, ++ PKVM_MAX_PDEV_NUM, PKVM_MAX_IOMMU_NUM, ++ PKVM_QI_DESC_ALIGNED_SIZE, ++ PKVM_QI_DESC_STATUS_ALIGNED_SIZE, ++ pkvm_hyp->num_cpus); ++ iommu_mem_base = pkvm_early_alloc_contig(nr_pages); ++ if (!iommu_mem_base) ++ return -ENOMEM; ++ ++ nr_pages = pkvm_shadow_ept_pgtable_pages(PKVM_MAX_NORMAL_VM_NUM + ++ PKVM_MAX_SECURE_VM_NUM) + ++ pkvm_host_shadow_iommu_pgtable_pages(PKVM_MAX_PDEV_NUM); ++ shadow_ept_base = pkvm_early_alloc_contig(nr_pages); ++ if (!shadow_ept_base) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int pkvm_back_vmemmap(phys_addr_t back_pa) ++{ ++ unsigned long i, start, start_va, size, end, end_va = 0; ++ struct memblock_region *reg; ++ int ret; ++ ++ /* vmemmap region map to virtual address 0 */ ++ __pkvm_vmemmap = 0; ++ ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ start = reg->base; ++ /* Translate a range of memory to vmemmap range */ ++ start_va = ALIGN_DOWN((unsigned long)pkvm_phys_to_page(start), ++ PAGE_SIZE); ++ /* ++ * The beginning of the pkvm_vmemmap region for the current ++ * memblock may already be backed by the page backing the end of ++ * the previous region, so avoid mapping it twice. 
++ */ ++ start_va = max(start_va, end_va); ++ ++ end = reg->base + reg->size; ++ end_va = ALIGN((unsigned long)pkvm_phys_to_page(end), PAGE_SIZE); ++ /* vmemmap va shall below PKVM_IOVA_OFFSET*/ ++ if (end_va >= PKVM_IOVA_OFFSET) ++ return -ENOMEM; ++ if (start_va >= end_va) ++ continue; ++ ++ size = end_va - start_va; ++ /* ++ * Create mapping for vmemmap virtual address ++ * [start, start+size) to physical address ++ * [back, back+size). ++ */ ++ ret = pkvm_mmu_map(start_va, back_pa, size, 0, ++ (u64)pgprot_val(PAGE_KERNEL)); ++ if (ret) ++ return ret; ++ ++ memset(__pkvm_va(back_pa), 0, size); ++ back_pa += size; ++ } ++ ++ return 0; ++} ++ ++static int create_mmu_mapping(const struct pkvm_section sections[], ++ int section_sz) ++{ ++ unsigned long nr_pages = pkvm_mmu_pgtable_pages(); ++ int ret; ++#ifndef CONFIG_PKVM_INTEL_DEBUG ++ struct memblock_region *reg; ++ int i; ++#endif ++ ++ ret = pkvm_early_mmu_init(&pkvm_hyp->mmu_cap, ++ pkvm_mmu_pgt_base, nr_pages); ++ if (ret) ++ return ret; ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ /* ++ * clone host CR3 page mapping from __page_base_offset, it covers both ++ * direct mapping and symbol mapping for pkvm (same mapping as kernel) ++ */ ++ pkvm_mmu_clone_host(pkvm_hyp->mmu_cap.level, __page_base_offset); ++#else ++ /* ++ * Create mapping for the memory in memblocks. ++ * This will include all the memory host kernel can see, as well ++ * as the memory pkvm allocated during init. ++ * ++ * The virtual address for this mapping is the same with the kernel ++ * direct mapping. ++ */ ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ ret = pkvm_mmu_map((unsigned long)__pkvm_va(reg->base), ++ reg->base, reg->size, ++ 0, (u64)pgprot_val(PAGE_KERNEL)); ++ if (ret) ++ return ret; ++ } ++ ++ for (i = 0; i < section_sz; i++) { ++ if (sections[i].type != PKVM_RESERVED_MEMORY) { ++ ret = pkvm_mmu_map(sections[i].addr, ++ __pkvm_pa_symbol(sections[i].addr), ++ sections[i].size, ++ 0, sections[i].prot); ++ } ++ if (ret) ++ return ret; ++ } ++#endif ++ ++ ret = pkvm_back_vmemmap(__pkvm_pa(pkvm_vmemmap_base)); ++ if (ret) ++ return ret; ++ ++ /* Switch the mmu pgtable to enable pkvm_vmemmap */ ++ native_write_cr3(pkvm_hyp->mmu->root_pa); ++ ++ pkvm_later_mmu_init(pkvm_mmu_pgt_base, nr_pages); ++ ++ return 0; ++} ++ ++static int create_host_ept_mapping(void) ++{ ++ struct memblock_region *reg; ++ int ret, i; ++ unsigned long phys = 0; ++ u64 entry_prot; ++ ++ ret = pkvm_host_ept_init(&pkvm_hyp->ept_cap, ++ host_ept_pgt_base, host_ept_pgtable_pages()); ++ if (ret) ++ return ret; ++ ++ /* ++ * Create EPT mapping for memory with WB + RWX property ++ */ ++ entry_prot = pkvm_mkstate(HOST_EPT_DEF_MEM_PROT, PKVM_PAGE_OWNED); ++ for (i = 0; i < pkvm_memblock_nr; i++) { ++ reg = &pkvm_memory[i]; ++ ret = pkvm_host_ept_map((unsigned long)reg->base, ++ (unsigned long)reg->base, ++ (unsigned long)reg->size, ++ 0, entry_prot); ++ pkvm_info("create_host_ept_mapping(): mapped 0x%llx -> 0x%llx, sz %llu\n", reg->base, reg->base, reg->size); ++ if (ret) ++ return ret; ++ } ++ ++ /* ++ * The holes in memblocks are treated as MMIO with the ++ * mapping UC + RWX. 
++ */
++	entry_prot = pkvm_mkstate(HOST_EPT_DEF_MMIO_PROT, PKVM_PAGE_OWNED);
++	for (i = 0; i < pkvm_memblock_nr; i++, phys = reg->base + reg->size) {
++		reg = &pkvm_memory[i];
++		pkvm_info("create_host_ept_mapping(): mapped 0x%lx -> 0x%lx, sz %llu\n", phys, phys, reg->base - phys);
++		ret = pkvm_host_ept_map(phys, phys, (unsigned long)reg->base - phys,
++					0, entry_prot);
++		if (ret)
++			return ret;
++	}
++
++	return 0;
++}
++
++static int protect_pkvm_pages(const struct pkvm_section sections[],
++			      int section_sz, phys_addr_t phys, unsigned long size)
++{
++	int i, ret;
++
++	for (i = 0; i < section_sz; i++) {
++		u64 pa, size;
++
++		if (sections[i].type == PKVM_CODE_DATA_SECTIONS) {
++			pa = __pkvm_pa_symbol(sections[i].addr);
++			size = sections[i].size;
++			kvm_info("protect_pkvm_pages(): unmapping pkvm addr 0x%llx -> 0x%llx, sz %llu\n", pa, pa, size);
++			ret = pkvm_host_ept_unmap(pa, pa, size);
++			if (ret) {
++				pkvm_err("%s: failed to protect section\n", __func__);
++				return ret;
++			}
++		}
++	}
++
++	ret = pkvm_host_ept_unmap(phys, phys, size);
++	kvm_info("protect_pkvm_pages(): unmapping pkvm addr 0x%llx -> 0x%llx, sz %lu\n", phys, phys, size);
++	if (ret) {
++		pkvm_err("%s: failed to protect reserved memory\n", __func__);
++		return ret;
++	}
++
++	return 0;
++}
++
++static int create_iommu(void)
++{
++	int nr_pages = pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_MAX_PASID_PDEV_NUM,
++					PKVM_MAX_PDEV_NUM, PKVM_MAX_IOMMU_NUM,
++					PKVM_QI_DESC_ALIGNED_SIZE,
++					PKVM_QI_DESC_STATUS_ALIGNED_SIZE,
++					pkvm_hyp->num_cpus);
++
++	return pkvm_init_iommu(pkvm_virt_to_phys(iommu_mem_base), nr_pages);
++}
++
++#define TMP_SECTION_SZ 16UL
++int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[],
++			 int section_sz)
++{
++	int i, ret = 0;
++	static bool pkvm_init;
++	struct pkvm_host_vcpu *pkvm_host_vcpu = to_pkvm_hvcpu(vcpu);
++	struct pkvm_pcpu *pcpu = pkvm_host_vcpu->pcpu;
++	struct pkvm_section tmp_sections[TMP_SECTION_SZ];
++	phys_addr_t pkvm_mem_base;
++	unsigned long pkvm_mem_size = 0;
++	u64 eptp;
++
++	if (pkvm_init) {
++		/* Switch to pkvm mmu in root mode in case some setup may need this */
++		native_write_cr3(pkvm_hyp->mmu->root_pa);
++		goto switch_pgt;
++	}
++
++	if (section_sz > TMP_SECTION_SZ) {
++		pkvm_err("pkvm: not enough space to save the sections[] array parameters!");
++		goto out;
++	}
++
++	/* The kernel may use VMAP_STACK, which could make the parameter's vaddr
++	 * invalid after we switch to the new CR3 later, so copy the parameter
++	 * sections array from host space to pkvm space.
++	 */
++	for (i = 0; i < section_sz; i++) {
++		tmp_sections[i] = sections[i];
++		if (sections[i].type == PKVM_RESERVED_MEMORY) {
++			pkvm_mem_base = sections[i].addr;
++			pkvm_mem_size = sections[i].size;
++		}
++	}
++	if (pkvm_mem_size == 0) {
++		pkvm_err("pkvm: no pkvm reserved memory!");
++		goto out;
++	}
++
++	ret = divide_memory_pool(pkvm_mem_base, pkvm_mem_size);
++	if (ret) {
++		pkvm_err("pkvm: did not reserve enough memory!");
++		goto out;
++	}
++
++	ret = create_mmu_mapping(tmp_sections, section_sz);
++	if (ret)
++		goto out;
++
++	ret = create_host_ept_mapping();
++	if (ret)
++		goto out;
++
++	ret = protect_pkvm_pages(tmp_sections, section_sz,
++				 pkvm_mem_base, pkvm_mem_size);
++	if (ret)
++		goto out;
++
++	ret = init_finalize_pci(&pkvm_hyp->host_vm.pci_info);
++	if (ret)
++		goto out;
++
++	ret = create_iommu();
++	if (ret)
++		goto out;
++
++	pkvm_init_nest();
++
++	ret = pkvm_shadow_ept_pool_init(shadow_ept_base,
++			pkvm_shadow_ept_pgtable_pages(PKVM_MAX_NORMAL_VM_NUM +
++						      PKVM_MAX_SECURE_VM_NUM) +
++
pkvm_host_shadow_iommu_pgtable_pages(PKVM_MAX_PDEV_NUM)); ++ if (ret) ++ goto out; ++ ++ pkvm_init = true; ++ ++switch_pgt: ++ /* switch mmu */ ++ vmcs_writel(HOST_CR3, pkvm_hyp->mmu->root_pa); ++ pcpu->cr3 = pkvm_hyp->mmu->root_pa; ++ ++ /* enable ept */ ++ eptp = pkvm_construct_eptp(pkvm_hyp->host_vm.ept->root_pa, ++ pkvm_hyp->host_vm.ept->level); ++ secondary_exec_controls_setbit(&pkvm_host_vcpu->vmx, SECONDARY_EXEC_ENABLE_EPT); ++ vmcs_write64(EPT_POINTER, eptp); ++ ++ ept_sync_global(); ++ ++ ret = pkvm_setup_lapic(pcpu, vcpu->cpu); ++out: ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io.h b/arch/x86/kvm/vmx/pkvm/hyp/io.h +new file mode 100644 +index 000000000000..bf62bbdc1697 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_IO_H_ ++#define _PKVM_IO_H_ ++ ++/* Size mask for I/O access */ ++#define IO_SIZE_1 1 ++#define IO_SIZE_2 2 ++#define IO_SIZE_4 4 ++#define IO_SIZE_FULL 7 ++ ++static inline void pkvm_pio_read(unsigned int port, int size, unsigned long *value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ *(u8 *)value = inb(port); ++ break; ++ case IO_SIZE_2: ++ *(u16 *)value = inw(port); ++ break; ++ case IO_SIZE_4: ++ *(u32 *)value = inl(port); ++ break; ++ default: ++ break; ++ } ++} ++ ++static inline void pkvm_pio_write(unsigned int port, int size, unsigned long value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ outb((u8)value, port); ++ break; ++ case IO_SIZE_2: ++ outw((u16)value, port); ++ break; ++ case IO_SIZE_4: ++ outl((u32)value, port); ++ break; ++ default: ++ break; ++ } ++} ++ ++ ++static inline void pkvm_mmio_read(u64 pos, int size, unsigned long *value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ asm volatile("movb (%1),%%al" : "=a" (*(u8 *)value) : "r" (pos)); ++ break; ++ case IO_SIZE_2: ++ asm volatile("movw (%1),%%ax" : "=a" (*(u16 *)value) : "r" (pos)); ++ break; ++ case IO_SIZE_4: ++ asm volatile("movl (%1),%%eax" : "=a" (*(u32 *)value) : "r" (pos)); ++ break; ++ default: ++ break; ++ } ++} ++ ++static inline void pkvm_mmio_write(u64 pos, int size, unsigned long value) ++{ ++ switch (size) { ++ case IO_SIZE_1: ++ asm volatile("movb %%al,(%1)" : : "a" ((u8)value), "r" (pos) : "memory"); ++ break; ++ case IO_SIZE_2: ++ asm volatile("movw %%ax,(%1)" : : "a" ((u16)value), "r" (pos) : "memory"); ++ break; ++ case IO_SIZE_4: ++ asm volatile("movl %%eax,(%1)" : : "a" ((u32)value), "r" (pos) : "memory"); ++ break; ++ default: ++ break; ++ } ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c +new file mode 100644 +index 000000000000..d48d804aaf12 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.c +@@ -0,0 +1,374 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2023 Intel Corporation. 
*/ ++#include ++#include "ept.h" ++#include "io.h" ++#include "io_emulate.h" ++ ++struct pkvm_pio_emul_table host_pio_emul_table; ++struct pkvm_mmio_emul_table host_mmio_emul_table; ++ ++static int pkvm_pio_default_in(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ pkvm_pio_read(req->port, req->size, req->value); ++ return 0; ++} ++ ++static int pkvm_pio_default_out(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ pkvm_pio_write(req->port, req->size, *req->value); ++ return 0; ++} ++ ++struct pkvm_pio_handler default_pio_handler = { ++ .read = pkvm_pio_default_in, ++ .write = pkvm_pio_default_out ++}; ++ ++/* ++ * Not thread safe and should hold a lock if called concurrently. ++ */ ++int register_host_pio_handler(struct pkvm_host_vm *host_vm, unsigned int port, ++ unsigned int size_mask, pio_handler_t read, pio_handler_t write) ++{ ++ struct pkvm_pio_emul_table *table; ++ struct pkvm_pio_handler *handler; ++ unsigned long index; ++ u8 bit; ++ ++ table = &host_pio_emul_table; ++ index = find_first_zero_bit(table->bitmap, PKVM_MAX_PIO_EMUL_NUM); ++ if (index >= PKVM_MAX_PIO_EMUL_NUM) ++ return -ENOSPC; ++ ++ __set_bit(index, table->bitmap); ++ ++ handler = &table->table[index]; ++ handler->port = port; ++ handler->size_mask = size_mask; ++ handler->read = read; ++ handler->write = write; ++ ++ index = port >> 3U; ++ bit = (u8)(1U << (port & 0x7U)); ++ host_vm->io_bitmap[index] |= bit; ++ ++ return 0; ++} ++ ++static bool pio_access_valid(int size) ++{ ++ return size == IO_SIZE_1 || size == IO_SIZE_2 || size == IO_SIZE_4; ++} ++ ++static struct pkvm_pio_handler *get_pio_handler(struct pkvm_pio_emul_table *table, ++ struct pkvm_pio_req *req) ++{ ++ struct pkvm_pio_handler *handler; ++ unsigned long index; ++ /* ++ * Port I/O access is expected to only based on their address and have a ++ * fixed access width. Note that they might overlap, for example PCI config ++ * space addr port 0xcf8 and ACPI reset port 0xcf9. So match the handler ++ * strictly based on their base address and access width here. ++ * ++ * There are two special situations to consider. One case is that the base ++ * address matches but the access width differs, this is regarded as an ++ * invalid access and thus return a NULL handler. Another case is no base ++ * address matches. This is due to an overlapped I/O access that triggered ++ * the IO VM exit, but we are not intended to handle the base address. So ++ * in this case choose the default handler to do plain pio. 
++ */
++	for_each_set_bit(index, table->bitmap, PKVM_MAX_PIO_EMUL_NUM) {
++		handler = &table->table[index];
++		if (req->port == handler->port) {
++			if (pio_access_valid(req->size) && (req->size & handler->size_mask))
++				return handler;
++
++			pkvm_err("pkvm: I/O port 0x%x mismatched access width %d",
++				 req->port, req->size);
++			return NULL;
++		}
++	}
++
++	return &default_pio_handler;
++}
++
++static int emulate_host_pio(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req)
++{
++	struct pkvm_pio_emul_table *table;
++	struct pkvm_pio_handler *handler;
++	int ret = 0;
++
++	table = &host_pio_emul_table;
++	handler = get_pio_handler(table, req);
++	if (!handler)
++		return -EINVAL;
++
++	if (req->direction == PKVM_IO_READ && handler->read)
++		ret = handler->read(vcpu, req);
++	else if (req->direction == PKVM_IO_WRITE && handler->write)
++		ret = handler->write(vcpu, req);
++
++	return ret;
++}
++
++int handle_host_pio(struct kvm_vcpu *vcpu)
++{
++	struct vcpu_vmx *vmx = to_vmx(vcpu);
++	unsigned long exit_qual;
++	struct pkvm_pio_req req;
++	int string;
++
++	exit_qual = vmx->exit_qualification;
++
++	string = (exit_qual & 16) != 0;
++	if (string) {
++		pkvm_err("pkvm: unsupported string instruction\n");
++		return -EINVAL;
++	}
++
++	req.port = exit_qual >> 16;
++	req.size = (exit_qual & 7) + 1;
++	req.value = &vcpu->arch.regs[VCPU_REGS_RAX];
++	req.direction = (exit_qual & 8) == 0;
++
++	pkvm_dbg("pkvm: host %s I/O port 0x%x width %d value %lx", req.direction ?
++		 "write" : "read", req.port, req.size, *req.value);
++
++	return emulate_host_pio(vcpu, &req);
++}
++
++static int pkvm_mmio_default_read(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req)
++{
++	pkvm_mmio_read((u64)host_mmio2hva(req->address), req->size, req->value);
++	return 0;
++}
++
++static int pkvm_mmio_default_write(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req)
++{
++	pkvm_mmio_write((u64)host_mmio2hva(req->address), req->size, *req->value);
++	return 0;
++}
++
++struct pkvm_mmio_handler default_mmio_handler = {
++	.read = pkvm_mmio_default_read,
++	.write = pkvm_mmio_default_write
++};
++
++static struct pkvm_mmio_handler *emul_mmio_lookup(struct pkvm_mmio_emul_table *table,
++						  unsigned long start, unsigned long end)
++{
++	struct pkvm_mmio_handler *handler;
++	unsigned long index;
++
++	for_each_set_bit(index, table->bitmap, PKVM_MAX_MMIO_EMUL_NUM) {
++		handler = &table->table[index];
++		if (start <= handler->end && handler->start <= end)
++			return handler;
++	}
++
++	return NULL;
++}
++
++/*
++ * Not thread safe and should hold a lock if called concurrently.
++ */
++int register_host_mmio_handler(unsigned long start, unsigned long end,
++			       mmio_handler_t read, mmio_handler_t write)
++{
++	struct pkvm_mmio_emul_table *table;
++	struct pkvm_mmio_handler *handler;
++	unsigned long index;
++	int ret = 0;
++
++	if (start > end)
++		return -EINVAL;
++
++	table = &host_mmio_emul_table;
++
++	if (emul_mmio_lookup(table, start, end))
++		return -EINVAL;
++
++	index = find_first_zero_bit(table->bitmap, PKVM_MAX_MMIO_EMUL_NUM);
++	if (index >= PKVM_MAX_MMIO_EMUL_NUM)
++		return -ENOSPC;
++
++	__set_bit(index, table->bitmap);
++
++	handler = &table->table[index];
++	handler->start = start;
++	handler->end = end;
++	handler->read = read;
++	handler->write = write;
++
++	host_ept_lock();
++	ret = pkvm_host_ept_unmap(start, start, end - start + 1);
++	host_ept_unlock();
++
++	return ret;
++}
++
++/*
++ * mmcfg access in x86 only uses simple mov instructions, so keep the decoder
++ * simple for now.
++ * TODO: make the decoder complete ++ */ ++static int mmio_instruction_decode(struct kvm_vcpu *vcpu, unsigned long gpa, ++ struct pkvm_mmio_req *req) ++{ ++ struct x86_exception exception; ++ bool direction, zero_extend = false; ++ unsigned long rip; ++ u8 insn[3]; ++ int size; ++ ++ rip = vmcs_readl(GUEST_RIP); ++ ++ /* ++ * Read first three bytes is enough to determine the opcode. ++ * Check arch/x86/include/asm/pci_x86.h. ++ */ ++ if (read_gva(vcpu, rip, insn, 3, &exception) < 0) ++ return -EINVAL; ++ ++ /* ++ * In case the compiler adds the REX prefix ++ */ ++ if ((insn[0] & 0xf0) == 0x40) { ++ insn[0] = insn[1]; ++ insn[1] = insn[2]; ++ } ++ ++ if (insn[0] == 0x66 && (insn[1] & 0xf0) == 0x40) ++ insn[1] = insn[2]; ++ ++ switch (insn[0]) { ++ case 0x0f: ++ switch (insn[1]) { ++ case 0xb6: ++ zero_extend = true; ++ direction = PKVM_IO_READ; ++ size = 1; ++ break; ++ default: ++ return -EIO; ++ } ++ break; ++ case 0x66: ++ size = 2; ++ switch (insn[1]) { ++ case 0x89: ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x8b: ++ direction = PKVM_IO_READ; ++ break; ++ default: ++ return -EIO; ++ } ++ break; ++ case 0x88: ++ size = 1; ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x89: ++ size = 4; ++ direction = PKVM_IO_WRITE; ++ break; ++ case 0x8a: ++ size = 1; ++ direction = PKVM_IO_READ; ++ break; ++ case 0x8b: ++ size = 4; ++ direction = PKVM_IO_READ; ++ break; ++ default: ++ return -EIO; ++ } ++ ++ req->address = gpa; ++ req->size = size; ++ req->value = &vcpu->arch.regs[VCPU_REGS_RAX]; ++ req->direction = direction; ++ ++ if (zero_extend) ++ *req->value = 0; ++ ++ return 0; ++} ++ ++static struct pkvm_mmio_handler *get_mmio_handler(struct pkvm_mmio_emul_table *table, ++ struct pkvm_mmio_req *req) ++{ ++ struct pkvm_mmio_handler *handler; ++ unsigned long start, end; ++ ++ start = req->address; ++ end = req->address + req->size - 1; ++ ++ handler = emul_mmio_lookup(table, start, end); ++ ++ /* ++ * If handler is NULL, this is an access that does not touch the emulated ++ * MMIO range. Return the default handler. ++ */ ++ if (!handler) ++ return &default_mmio_handler; ++ ++ /* Do not allow the access to cross the boundary. */ ++ if ((start < handler->start && end >= handler->start) || ++ (start <= handler->end && end > handler->end)) ++ return NULL; ++ ++ return handler; ++} ++ ++static int emulate_host_mmio(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ struct pkvm_mmio_emul_table *table; ++ struct pkvm_mmio_handler *handler; ++ int ret = 0; ++ ++ table = &host_mmio_emul_table; ++ ++ handler = get_mmio_handler(table, req); ++ if (!handler) ++ return -EINVAL; ++ ++ if (req->direction == PKVM_IO_READ && handler->read) ++ ret = handler->read(vcpu, req); ++ else if (req->direction == PKVM_IO_WRITE && handler->write) ++ ret = handler->write(vcpu, req); ++ ++ return ret; ++} ++ ++static int handle_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa) ++{ ++ struct pkvm_mmio_req req; ++ ++ if (mmio_instruction_decode(vcpu, gpa, &req)) { ++ pkvm_dbg("pkvm: MMIO instruction decode failed"); ++ return -EINVAL; ++ } ++ ++ pkvm_dbg("pkvm: host %s MMIO gpa 0x%lx width %d value 0x%lx", req.direction ? 
++ "write" : "read", req.address, req.size, *req.value); ++ ++ return emulate_host_mmio(vcpu, &req); ++} ++ ++int try_emul_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa) ++{ ++ if (emul_mmio_lookup(&host_mmio_emul_table, gpa, gpa) == NULL) ++ return -EINVAL; ++ ++ if (handle_host_mmio(vcpu, gpa)) { ++ pkvm_err("%s: emulate MMIO failed for memory address 0x%lx\n", __func__, gpa); ++ return -EIO; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h +new file mode 100644 +index 000000000000..d9303bd8bf20 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/io_emulate.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_IO_EMULATE_H_ ++#define _PKVM_IO_EMULATE_H_ ++ ++/* I/O direction */ ++#define PKVM_IO_READ 0 ++#define PKVM_IO_WRITE 1 ++ ++/* Max num of port I/O emulation handlers */ ++#define PKVM_MAX_PIO_EMUL_NUM 32 ++ ++struct pkvm_pio_req { ++ unsigned int port; ++ int size; ++ bool direction; ++ unsigned long *value; ++}; ++ ++typedef int (*pio_handler_t)(struct kvm_vcpu *, struct pkvm_pio_req *); ++ ++struct pkvm_pio_handler { ++ unsigned int port; ++ int size_mask; ++ pio_handler_t read; ++ pio_handler_t write; ++}; ++ ++struct pkvm_pio_emul_table { ++ struct pkvm_pio_handler table[PKVM_MAX_PIO_EMUL_NUM]; ++ DECLARE_BITMAP(bitmap, PKVM_MAX_PIO_EMUL_NUM); ++}; ++ ++/* Max num of memory mapped I/O emulation handlers */ ++#define PKVM_MAX_MMIO_EMUL_NUM 256 ++ ++struct pkvm_mmio_req { ++ unsigned long address; ++ int size; ++ bool direction; ++ unsigned long *value; ++}; ++ ++typedef int (*mmio_handler_t)(struct kvm_vcpu *, struct pkvm_mmio_req *); ++ ++struct pkvm_mmio_handler { ++ unsigned long start; ++ unsigned long end; ++ mmio_handler_t read; ++ mmio_handler_t write; ++}; ++ ++struct pkvm_mmio_emul_table { ++ struct pkvm_mmio_handler table[PKVM_MAX_MMIO_EMUL_NUM]; ++ DECLARE_BITMAP(bitmap, PKVM_MAX_MMIO_EMUL_NUM); ++}; ++ ++int register_host_pio_handler(struct pkvm_host_vm *host_vm, unsigned int port, ++ unsigned int size_mask, pio_handler_t read, pio_handler_t write); ++int handle_host_pio(struct kvm_vcpu *vcpu); ++ ++int register_host_mmio_handler(unsigned long start, unsigned long end, ++ mmio_handler_t read, mmio_handler_t write); ++int try_emul_host_mmio(struct kvm_vcpu *vcpu, unsigned long gpa); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu.c +new file mode 100644 +index 000000000000..6556ee9f4884 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu.c +@@ -0,0 +1,2372 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "memory.h" ++#include "mmu.h" ++#include "ept.h" ++#include "pgtable.h" ++#include "iommu_internal.h" ++#include "debug.h" ++#include "ptdev.h" ++#include "iommu_spgt.h" ++#include "bug.h" ++ ++#define for_each_valid_iommu(p) \ ++ for ((p) = iommus; (p) < iommus + PKVM_MAX_IOMMU_NUM; (p)++) \ ++ if (!(p) || !(p)->iommu.reg_phys) { \ ++ continue; \ ++ } else ++ ++static struct pkvm_iommu iommus[PKVM_MAX_IOMMU_NUM]; ++ ++static struct pkvm_pool iommu_pool; ++ ++/* Used in legacy mode only. */ ++struct shadow_pgt_sync_data { ++ unsigned long vaddr; ++ unsigned long vaddr_end; ++}; ++ ++/* ++ * Guest root/context/pasid table (hereinafter "id table") walking parameter. ++ * pkvm IOMMU driver walks the guest page table when syncing ++ * with the shadow id table. 
++ */ ++struct id_sync_walk_data { ++ struct pkvm_iommu *iommu; ++ /* ++ * Used to hold shadow id table physical address ++ * which is used for sync shadow entries at each ++ * id table level. ++ */ ++ u64 shadow_pa[IOMMU_SM_LEVEL_NUM]; ++ /* ++ * Used when just syncing a part of shadow ++ * id table entries which match with this did if ++ * it is set as a non-zero did value. ++ */ ++ u16 did; ++ /* ++ * Used in legacy mode when just syncing a specific ++ * range of pages in shadow page tables. ++ */ ++ struct shadow_pgt_sync_data *spgt_data; ++}; ++ ++#define DEFINE_ID_SYNC_WALK_DATA(name, _iommu, domain_id, _spgt_data) \ ++ struct id_sync_walk_data (name) = { \ ++ .iommu = (_iommu), \ ++ .shadow_pa = {0}, \ ++ .did = (domain_id), \ ++ .spgt_data = (_spgt_data), \ ++ } ++ ++/* ++ * Used to config a shadow id table entry in root/context/pasid ++ * level. ++ */ ++struct id_sync_data { ++ union { ++ u64 root_entry; ++ struct context_entry ct_entry; ++ struct pasid_dir_entry pd_entry; ++ struct pasid_entry p_entry; ++ }; ++ void *guest_ptep; ++ void *shadow_ptep; ++ int level; ++ u64 iommu_ecap; ++ u64 shadow_pa; ++ struct pkvm_pgtable *shadow_id; ++ unsigned long vaddr; ++ struct shadow_pgt_sync_data *spgt_data; ++}; ++ ++static inline void *iommu_zalloc_pages(size_t size) ++{ ++ return pkvm_alloc_pages(&iommu_pool, get_order(size)); ++} ++ ++static void *iommu_zalloc_page(void) ++{ ++ return pkvm_alloc_pages(&iommu_pool, 0); ++} ++ ++static void iommu_get_page(void *vaddr) ++{ ++ pkvm_get_page(&iommu_pool, vaddr); ++} ++ ++static void iommu_put_page(void *vaddr) ++{ ++ pkvm_put_page(&iommu_pool, vaddr); ++} ++ ++static void iommu_flush_cache(void *ptep, unsigned int size) ++{ ++ pkvm_clflush_cache_range(ptep, size); ++} ++ ++static struct pkvm_mm_ops viommu_mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++static struct pkvm_mm_ops iommu_pw_coherency_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = iommu_zalloc_page, ++ .get_page = iommu_get_page, ++ .put_page = iommu_put_page, ++ .page_count = pkvm_page_count, ++}; ++ ++static struct pkvm_mm_ops iommu_pw_noncoherency_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = iommu_zalloc_page, ++ .get_page = iommu_get_page, ++ .put_page = iommu_put_page, ++ .page_count = pkvm_page_count, ++ .flush_cache = iommu_flush_cache, ++}; ++ ++static bool iommu_id_entry_present(void *ptep) ++{ ++ u64 val; ++ ++ val = *(u64 *)ptep; ++ return !!(val & 1); ++} ++ ++static unsigned long iommu_id_entry_to_phys(void *ptep) ++{ ++ u64 val = *(u64 *)ptep; ++ ++ return val & VTD_PAGE_MASK; ++} ++ ++static int iommu_sm_id_entry_to_index(unsigned long vaddr, int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return vaddr & (BIT(PASIDDIR_BITS) - 1); ++ case IOMMU_PASID_DIR: ++ return (vaddr >> PASIDDIR_SHIFT) & (BIT(PASIDDIR_BITS) - 1); ++ case IOMMU_SM_CONTEXT: ++ return (vaddr >> DEVFN_SHIFT) & (BIT(SM_DEVFN_BITS) - 1); ++ case IOMMU_SM_ROOT: ++ return (vaddr >> SM_BUS_SHIFT) & (BIT(SM_BUS_BITS) - 1); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static bool iommu_id_entry_is_leaf(void *ptep, int level) ++{ ++ if (LAST_LEVEL(level) || ++ !iommu_id_entry_present(ptep)) ++ return true; ++ ++ return false; ++} ++ ++static int iommu_sm_id_level_entry_size(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return sizeof(struct pasid_entry); ++ case IOMMU_PASID_DIR: ++ return sizeof(struct pasid_dir_entry); ++ case 
IOMMU_SM_CONTEXT: ++ /* scalable mode requires 32bytes for context */ ++ return sizeof(struct context_entry) * 2; ++ case IOMMU_SM_ROOT: ++ return sizeof(u64); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_sm_id_level_to_entries(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return 1 << PASIDTAB_BITS; ++ case IOMMU_PASID_DIR: ++ return 1 << PASIDDIR_BITS; ++ case IOMMU_SM_CONTEXT: ++ return 1 << SM_DEVFN_BITS; ++ case IOMMU_SM_ROOT: ++ return 1 << SM_BUS_BITS; ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static unsigned long iommu_sm_id_level_to_size(int level) ++{ ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ return 1; ++ case IOMMU_PASID_DIR: ++ return 1 << PASIDDIR_SHIFT; ++ case IOMMU_SM_CONTEXT: ++ return 1 << DEVFN_SHIFT; ++ case IOMMU_SM_ROOT: ++ return 1 << SM_BUS_SHIFT; ++ default: ++ break; ++ } ++ ++ return 0; ++} ++ ++struct pkvm_pgtable_ops iommu_sm_id_ops = { ++ .pgt_entry_present = iommu_id_entry_present, ++ .pgt_entry_to_phys = iommu_id_entry_to_phys, ++ .pgt_entry_to_index = iommu_sm_id_entry_to_index, ++ .pgt_entry_is_leaf = iommu_id_entry_is_leaf, ++ .pgt_level_entry_size = iommu_sm_id_level_entry_size, ++ .pgt_level_to_entries = iommu_sm_id_level_to_entries, ++ .pgt_level_to_size = iommu_sm_id_level_to_size, ++}; ++ ++static int iommu_lm_id_entry_to_index(unsigned long vaddr, int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return (vaddr >> LM_DEVFN_SHIFT) & (BIT(LM_DEVFN_BITS) - 1); ++ case IOMMU_LM_ROOT: ++ return (vaddr >> LM_BUS_SHIFT) & (BIT(LM_BUS_BITS) - 1); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_lm_id_level_entry_size(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return sizeof(struct context_entry); ++ case IOMMU_LM_ROOT: ++ return sizeof(struct root_entry); ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static int iommu_lm_id_level_to_entries(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return 1 << LM_DEVFN_BITS; ++ case IOMMU_LM_ROOT: ++ return 1 << LM_BUS_BITS; ++ default: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++static unsigned long iommu_lm_id_level_to_size(int level) ++{ ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ return 1 << LM_DEVFN_SHIFT; ++ case IOMMU_LM_ROOT: ++ return 1 << LM_BUS_SHIFT; ++ default: ++ break; ++ } ++ ++ return 0; ++} ++ ++struct pkvm_pgtable_ops iommu_lm_id_ops = { ++ .pgt_entry_present = iommu_id_entry_present, ++ .pgt_entry_to_phys = iommu_id_entry_to_phys, ++ .pgt_entry_to_index = iommu_lm_id_entry_to_index, ++ .pgt_entry_is_leaf = iommu_id_entry_is_leaf, ++ .pgt_level_entry_size = iommu_lm_id_level_entry_size, ++ .pgt_level_to_entries = iommu_lm_id_level_to_entries, ++ .pgt_level_to_size = iommu_lm_id_level_to_size, ++}; ++ ++static int iommu_pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, struct pkvm_pgtable_walker *walker) ++{ ++ if (!pgt->root_pa) ++ return 0; ++ ++ return pgtable_walk(pgt, vaddr, vaddr_end - vaddr, false, walker); ++} ++ ++static struct pkvm_ptdev *iommu_find_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *p; ++ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { ++ if (match_ptdev(p, bdf, pasid)) ++ return p; ++ } ++ ++ return NULL; ++} ++ ++static inline bool iommu_coherency(u64 ecap) ++{ ++ return ecap_smts(ecap) ? 
ecap_smpwc(ecap) : ecap_coherent(ecap); ++} ++ ++static struct pkvm_ptdev *iommu_add_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid); ++ ++ if (!ptdev) { ++ ptdev = pkvm_alloc_ptdev(bdf, pasid, iommu_coherency(iommu->iommu.ecap)); ++ if (!ptdev) ++ return NULL; ++ } ++ ++ list_add_tail(&ptdev->iommu_node, &iommu->ptdev_head); ++ return ptdev; ++} ++ ++static void iommu_del_ptdev(struct pkvm_iommu *iommu, struct pkvm_ptdev *ptdev) ++{ ++ list_del_init(&ptdev->iommu_node); ++ pkvm_put_ptdev(ptdev); ++} ++ ++static int iommu_audit_did(struct pkvm_iommu *iommu, u16 did, int shadow_vm_handle) ++{ ++ struct pkvm_ptdev *tmp; ++ int ret = 0; ++ ++ list_for_each_entry(tmp, &iommu->ptdev_head, iommu_node) { ++ if (tmp->shadow_vm_handle != shadow_vm_handle) { ++ if (tmp->did == did) { ++ /* ++ * The devices belong to different VMs but behind ++ * the same IOMMU, cannot use the same did. ++ */ ++ ret = -EPERM; ++ break; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static int shadow_pgt_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, int level, ++ void *ptep, struct pgt_flush_data *flush_data, void *arg) ++{ ++ struct pkvm_pgtable_map_data *data = arg; ++ unsigned long map_phys; ++ int ret = 0; ++ ++ host_ept_lock(); ++ ++ pkvm_host_ept_lookup(data->phys, &map_phys, NULL, NULL); ++ if (map_phys == INVALID_ADDR) { ++ pkvm_err("pkvm: phys addr 0x%lx not mapped in host ept\n", data->phys); ++ goto out; ++ } ++ ++ ret = pgtable_map_leaf(pgt, vaddr, level, ptep, flush_data, arg); ++ ++out: ++ host_ept_unlock(); ++ return ret; ++} ++ ++/* used in legacy mode only */ ++static void sync_shadow_pgt(struct pkvm_ptdev *ptdev, struct shadow_pgt_sync_data *sdata) ++{ ++ struct pkvm_pgtable *spgt; ++ int ret; ++ ++ PKVM_ASSERT(is_pgt_ops_ept(&ptdev->vpgt)); ++ ++ /* ++ * ptdev->pgt should be already set to this shadow iommu pgtable. ++ * However, ptdev->pgt could change in the meantime due to ptdev ++ * attach to a VM. So to avoid race, do not use ptdev->pgt directly ++ * but get the same shadow iommu pgtable on our own. ++ */ ++ spgt = pkvm_get_host_iommu_spgt(ptdev->vpgt.root_pa, ptdev->iommu_coherency); ++ PKVM_ASSERT(spgt); ++ ++ if (sdata) ++ ret = pkvm_pgtable_sync_map_range(&ptdev->vpgt, spgt, ++ sdata->vaddr, ++ sdata->vaddr_end - sdata->vaddr, ++ NULL, shadow_pgt_map_leaf); ++ else ++ ret = pkvm_pgtable_sync_map(&ptdev->vpgt, spgt, ++ NULL, shadow_pgt_map_leaf); ++ PKVM_ASSERT(ret == 0); ++ ++ pkvm_put_host_iommu_spgt(spgt, ptdev->iommu_coherency); ++} ++ ++/* present root entry when shadow_pa valid, otherwise un-present it */ ++static bool sync_root_entry(struct id_sync_data *sdata) ++{ ++ u64 *sre = sdata->shadow_ptep; ++ u64 sre_val = sdata->shadow_pa ? 
(sdata->shadow_pa | 1) : 0; ++ ++ if (READ_ONCE(*sre) != sre_val) { ++ WRITE_ONCE(*sre, sre_val); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* sync context entry when guest_ptep & shadow_pa valid, otherwise un-present it */ ++static bool sync_shadow_context_entry(struct id_sync_data *sdata) ++{ ++ struct context_entry *shadow_ce = sdata->shadow_ptep, tmp = {0}; ++ struct context_entry *guest_ce = sdata->guest_ptep; ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->shadow_id); ++ struct pkvm_ptdev *ptdev; ++ struct pkvm_pgtable_cap cap; ++ bool updated = false; ++ u8 tt, aw; ++ u16 bdf, did; ++ ++ if (ecap_smts(sdata->iommu_ecap)) { ++ if (sdata->guest_ptep && sdata->shadow_pa) { ++ tmp.hi = guest_ce->hi; ++ tmp.lo = sdata->shadow_pa | (guest_ce->lo & 0xfff); ++ ++ /* Clear DTE to make sure device TLB is disabled for security */ ++ context_sm_clear_dte(&tmp); ++ } ++ } else { ++ /* ++ * In legacy mode, a context entry is a leaf entry responsible for ++ * configuring the actual address translation for the given ptdev, ++ * much like a PASID table entry in scalable mode. So the below logic ++ * is quite similar to the logic in sync_shadow_pasid_table_entry() ++ * for scalable mode. ++ */ ++ bdf = sdata->vaddr >> LM_DEVFN_SHIFT; ++ ptdev = iommu_find_ptdev(iommu, bdf, 0); ++ ++ if (!ptdev) { ++ ptdev = iommu_add_ptdev(iommu, bdf, 0); ++ if (!ptdev) ++ return false; ++ } ++ ++ if (!sdata->guest_ptep) { ++ if (context_lm_is_present(shadow_ce)) { ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ iommu_del_ptdev(iommu, ptdev); ++ ++ goto update_shadow_ce; ++ } ++ return false; ++ } ++ ++ tt = context_lm_get_tt(guest_ce); ++ switch (tt) { ++ case CONTEXT_TT_MULTI_LEVEL: ++ case CONTEXT_TT_DEV_IOTLB: ++ aw = context_lm_get_aw(guest_ce); ++ if (aw != 1 && aw != 2 && aw != 3) { ++ pkvm_err("pkvm: unsupported address width %u\n", aw); ++ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ ++ /* ++ * TODO: our error reporting to the host for invalid ++ * values of aw or tt is not good: the host will see ++ * translation fault reason "present bit is clear" ++ * instead of "invalid entry". ++ */ ++ goto update_shadow_ce; ++ } ++ cap.level = (aw == 1) ? 3 : ++ (aw == 2) ? 4 : 5; ++ cap.allowed_pgsz = pkvm_hyp->ept_cap.allowed_pgsz; ++ pkvm_setup_ptdev_vpgt(ptdev, context_lm_get_slptr(guest_ce), ++ &viommu_mm_ops, &ept_ops, &cap, true); ++ ++ if (!ptdev_attached_to_vm(ptdev)) ++ sync_shadow_pgt(ptdev, sdata->spgt_data); ++ ++ break; ++ case CONTEXT_TT_PASS_THROUGH: ++ /* ++ * When host IOMMU driver is using pass-through mode, pkvm ++ * IOMMU will actually use the address translation ++ * (CONTEXT_TT_MULTI_LEVEL) with the primary VM's EPT ++ * to guarantee the protection. ++ */ ++ break; ++ default: ++ pkvm_err("pkvm: unsupported translation type %u\n", tt); ++ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ goto update_shadow_ce; ++ } ++ ++ did = context_lm_get_did(guest_ce); ++ if (iommu_audit_did(iommu, did, ptdev->shadow_vm_handle)) ++ return false; ++ ++ pkvm_setup_ptdev_did(ptdev, did); ++ ++ if (!is_pgt_ops_ept(ptdev->pgt)) ++ return false; ++ ++ tmp = *guest_ce; ++ ++ /* ++ * Always set translation type to MULTI_LEVEL to ensure address ++ * translation and to disable device TLB for security. ++ */ ++ context_lm_set_tt(&tmp, CONTEXT_TT_MULTI_LEVEL); ++ context_lm_set_slptr(&tmp, ptdev->pgt->root_pa); ++ aw = (ptdev->pgt->level == 3) ? 
1 : ++ (ptdev->pgt->level == 4) ? 2 : 3; ++ context_lm_set_aw(&tmp, aw); ++ } ++ ++update_shadow_ce: ++ if (READ_ONCE(shadow_ce->hi) != tmp.hi) { ++ WRITE_ONCE(shadow_ce->hi, tmp.hi); ++ updated = true; ++ } ++ ++ if (READ_ONCE(shadow_ce->lo) != tmp.lo) { ++ WRITE_ONCE(shadow_ce->lo, tmp.lo); ++ updated = true; ++ } ++ ++ return updated; ++} ++ ++/* sync pasid dir entry when guest_ptep & shadow_pa valid, otherwise un-present it */ ++static bool sync_shadow_pasid_dir_entry(struct id_sync_data *sdata) ++{ ++ struct pasid_dir_entry *shadow_pde = sdata->shadow_ptep; ++ u64 val = 0; ++ ++ if (sdata->guest_ptep && sdata->shadow_pa) { ++ struct pasid_dir_entry *guest_pde = sdata->guest_ptep; ++ ++ val = guest_pde->val & (PASID_PTE_FPD | PASID_PTE_PRESENT); ++ val |= sdata->shadow_pa; ++ } ++ ++ if (READ_ONCE(shadow_pde->val) != val) { ++ WRITE_ONCE(shadow_pde->val, val); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* sync pasid table entry when guest_ptep valid, otherwise un-present it */ ++static bool sync_shadow_pasid_table_entry(struct id_sync_data *sdata) ++{ ++ u16 bdf = sdata->vaddr >> DEVFN_SHIFT; ++ u32 pasid = sdata->vaddr & ((1UL << MAX_NR_PASID_BITS) - 1); ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->shadow_id); ++ struct pkvm_ptdev *ptdev = iommu_find_ptdev(iommu, bdf, pasid); ++ struct pasid_entry *shadow_pte = sdata->shadow_ptep, tmp_pte = {0}; ++ struct pasid_entry *guest_pte; ++ bool synced = false; ++ u64 type, aw; ++ ++ if (!ptdev) { ++ ptdev = iommu_add_ptdev(iommu, bdf, pasid); ++ if (!ptdev) ++ return false; ++ } ++ ++ if (!sdata->guest_ptep) { ++ if (pasid_pte_is_present(shadow_pte)) { ++ /* ++ * Making a pasid entry not present needs to remove ++ * the corresponding ptdev from IOMMU. It also means ++ * a ptdev's vpgt/did should be reset as well as ++ * deleting ptdev from this iommu. ++ */ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ iommu_del_ptdev(iommu, ptdev); ++ ++ synced = pasid_copy_entry(shadow_pte, &tmp_pte); ++ } ++ return synced; ++ } ++ ++ guest_pte = sdata->guest_ptep; ++ type = pasid_pte_get_pgtt(guest_pte); ++ if (type == PASID_ENTRY_PGTT_FL_ONLY) { ++ struct pkvm_pgtable_cap cap; ++ ++ if (ptdev_attached_to_vm(ptdev)) ++ /* ++ * For the attached ptdev, use SL Only mode with ++ * using ptdev->pgt so that the translation is ++ * totally controlled by pkvm. ++ */ ++ type = PASID_ENTRY_PGTT_SL_ONLY; ++ else ++ /* ++ * For the other ptdev, pkvm IOMMU will use nested ++ * translation to add one more layer translation to ++ * guarantee the protection. This one more layer is the ++ * primary VM's EPT. ++ */ ++ type = PASID_ENTRY_PGTT_NESTED; ++ ++ /* ptdev vpgt can be initialized with flptr */ ++ cap.level = pasid_get_flpm(guest_pte) == 0 ? 4 : 5; ++ cap.allowed_pgsz = pkvm_hyp->mmu_cap.allowed_pgsz; ++ pkvm_setup_ptdev_vpgt(ptdev, pasid_get_flptr(guest_pte), ++ &viommu_mm_ops, &mmu_ops, &cap, false); ++ } else if (type == PASID_ENTRY_PGTT_PT) { ++ /* ++ * When host IOMMU driver is using pass-through mode, pkvm ++ * IOMMU will actually use the second-level only translation ++ * to guarantee the protection. This second-level is als ++ * the EPT. ++ */ ++ type = PASID_ENTRY_PGTT_SL_ONLY; ++ } else { ++ /* ++ * As the host IOMMU driver in the pkvm enabled kernel has ++ * already been configured to use first-level only or ++ * pass-through mode, it will not use any other mode. 
But ++ * in case this happens, reset the ptdev vpgt/did, keep ptdev ++ * linked to this IOMMU, and clear the shadow entry in order ++ * not to support it. ++ */ ++ pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL, false); ++ pkvm_setup_ptdev_did(ptdev, 0); ++ ++ pkvm_err("pkvm: unsupported pasid type %lld\n", type); ++ ++ return pasid_copy_entry(shadow_pte, &tmp_pte); ++ } ++ ++ pkvm_setup_ptdev_did(ptdev, pasid_get_domain_id(guest_pte)); ++ ++ if (iommu_audit_did(iommu, ptdev->did, ptdev->shadow_vm_handle)) ++ /* ++ * It is possible that this ptdev will be attached to a protected ++ * VM so primary VM allocates the same did used by this protected ++ * VM and did a TLB flush. But at this moment, this ptdev is not ++ * attached yet so audit is failed. For this case, can skip the sync ++ * of this pasid table entry and it will be synced again when this ++ * ptdev is attached. ++ * ++ * It is also possible that this ptdev is just detached from a ++ * protected VM but still using the previous did due to primary VM ++ * has not configured this ptdev yet. In this case, the did of this ++ * ptdev is still the same as the did used by other ptdevs not ++ * detached yet. For this case, can skip the sync of this pasid ++ * table entry and it will be synced again when primary VM configures ++ * this ptdev. ++ * ++ * If not the above cases but primary VM does this by purpose, also ++ * not sync the pasid table entry to guarantee the isolation. ++ */ ++ return false; ++ ++ /* ++ * ptdev->pgt will be used as second-level translation table ++ * which should be EPT format. ++ */ ++ if (!is_pgt_ops_ept(ptdev->pgt)) ++ return false; ++ ++ /* ++ * Copy all the bits from guest_pte. As the translation type will ++ * be re-configured in below, even some bits inherit from guest_pte ++ * but hardware will ignore those bits according to the translation ++ * type. ++ */ ++ memcpy(&tmp_pte, guest_pte, sizeof(struct pasid_entry)); ++ ++ pasid_set_page_snoop(&tmp_pte, !!ecap_smpwc(sdata->iommu_ecap)); ++ if (ecap_sc_support(sdata->iommu_ecap)) ++ pasid_set_pgsnp(&tmp_pte); ++ ++ /* ++ * Modify the second-level related bits: ++ * Set PGTT/SLPTR/AW. ++ * Clear SLADE/SLEE ++ * Reuse FPD/P ++ */ ++ pasid_set_translation_type(&tmp_pte, type); ++ pasid_set_slptr(&tmp_pte, ptdev->pgt->root_pa); ++ aw = (ptdev->pgt->level == 4) ? 
2 : 3; ++ pasid_set_address_width(&tmp_pte, aw); ++ pasid_set_ssade(&tmp_pte, 0); ++ pasid_set_ssee(&tmp_pte, 0); ++ ++ return pasid_copy_entry(shadow_pte, &tmp_pte); ++} ++ ++static bool iommu_id_sync_entry(struct id_sync_data *sdata) ++{ ++ bool ret = false; ++ struct pkvm_pgtable *shadow_id = sdata->shadow_id; ++ ++ if (ecap_smts(sdata->iommu_ecap)) { ++ switch (sdata->level) { ++ case IOMMU_PASID_TABLE: ++ ret = sync_shadow_pasid_table_entry(sdata); ++ break; ++ case IOMMU_PASID_DIR: ++ ret = sync_shadow_pasid_dir_entry(sdata); ++ break; ++ case IOMMU_SM_CONTEXT: ++ ret = sync_shadow_context_entry(sdata); ++ break; ++ case IOMMU_SM_ROOT: ++ ret = sync_root_entry(sdata); ++ break; ++ default: ++ break; ++ } ++ } else { ++ switch (sdata->level) { ++ case IOMMU_LM_CONTEXT: ++ ret = sync_shadow_context_entry(sdata); ++ break; ++ case IOMMU_LM_ROOT: ++ ret = sync_root_entry(sdata); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ if (ret) { ++ int entry_size = shadow_id->pgt_ops->pgt_level_entry_size(sdata->level); ++ ++ if (entry_size && shadow_id->mm_ops->flush_cache) ++ shadow_id->mm_ops->flush_cache(sdata->shadow_ptep, entry_size); ++ } ++ ++ return ret; ++} ++ ++static int initialize_iommu_pgt(struct pkvm_iommu *iommu) ++{ ++ struct pkvm_pgtable *pgt = &iommu->pgt; ++ struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; ++ static struct pkvm_mm_ops *iommu_mm_ops; ++ struct pkvm_pgtable_ops *iommu_ops; ++ struct pkvm_pgtable_cap cap; ++ u64 grt_pa = readq(iommu->iommu.reg + DMAR_RTADDR_REG) & VTD_PAGE_MASK; ++ int ret; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ cap.level = IOMMU_SM_ROOT; ++ iommu_ops = &iommu_sm_id_ops; ++ } else { ++ cap.level = IOMMU_LM_ROOT; ++ iommu_ops = &iommu_lm_id_ops; ++ } ++ ++ vpgt->root_pa = grt_pa; ++ ret = pkvm_pgtable_init(vpgt, &viommu_mm_ops, iommu_ops, &cap, false); ++ if (ret) ++ return ret; ++ ++ /* ++ * For the IOMMU without Page-Walk Coherency, should use ++ * iommu_pw_noncoherency_mm_ops to flush CPU cache when ++ * modifying any remapping structure entry. ++ * ++ * For the IOMMU with Page-Walk Coherency, can use ++ * iommu_pw_coherency_mm_ops to skip the CPU cache flushing. 
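++ * (iommu_pw_coherency_mm_ops simply leaves .flush_cache unset, so no explicit cache flush is issued when its shadow entries are updated.)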
++ */ ++ if (!ecap_coherent(iommu->iommu.ecap)) ++ iommu_mm_ops = &iommu_pw_noncoherency_mm_ops; ++ else ++ iommu_mm_ops = &iommu_pw_coherency_mm_ops; ++ ++ ret = pkvm_pgtable_init(pgt, iommu_mm_ops, iommu_ops, &cap, true); ++ if (!ret) { ++ /* ++ * Hold additional reference count to make ++ * sure root page won't be freed ++ */ ++ void *root = pgt->mm_ops->phys_to_virt(pgt->root_pa); ++ ++ pgt->mm_ops->get_page(root); ++ } ++ return ret; ++} ++ ++int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages) ++{ ++ struct pkvm_iommu_info *info = &pkvm_hyp->iommu_infos[0]; ++ struct pkvm_iommu *piommu = &iommus[0]; ++ int i, ret = pkvm_pool_init(&iommu_pool, mem_base >> PAGE_SHIFT, nr_pages, 0); ++ ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < PKVM_MAX_IOMMU_NUM; piommu++, info++, i++) { ++ if (!info->reg_phys) ++ break; ++ ++ INIT_LIST_HEAD(&piommu->ptdev_head); ++ ++ pkvm_spinlock_init(&piommu->lock); ++ piommu->iommu.reg_phys = info->reg_phys; ++ piommu->iommu.reg_size = info->reg_size; ++ piommu->iommu.reg = pkvm_iophys_to_virt(info->reg_phys); ++ if ((unsigned long)piommu->iommu.reg == INVALID_ADDR) ++ return -ENOMEM; ++ piommu->iommu.seq_id = i; ++ ++ ret = pkvm_mmu_map((unsigned long)piommu->iommu.reg, ++ (unsigned long)info->reg_phys, ++ info->reg_size, 1 << PG_LEVEL_4K, ++ PKVM_PAGE_IO_NOCACHE); ++ if (ret) ++ return ret; ++ ++ piommu->iommu.cap = readq(piommu->iommu.reg + DMAR_CAP_REG); ++ piommu->iommu.ecap = readq(piommu->iommu.reg + DMAR_ECAP_REG); ++ /* cache the enabled features from Global Status register */ ++ piommu->iommu.gcmd = readl(piommu->iommu.reg + DMAR_GSTS_REG) & ++ DMAR_GSTS_EN_BITS; ++ ++ ret = pkvm_host_ept_unmap((unsigned long)info->reg_phys, ++ (unsigned long)info->reg_phys, ++ info->reg_size); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int free_shadow_id_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ struct id_sync_data sync_data = {0}; ++ struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(pgt); ++ void *child_ptep; ++ ++ /* Doesn't need to do anything if the shadow entry is not present */ ++ if (!pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ sync_data.shadow_ptep = ptep; ++ sync_data.level = level; ++ sync_data.shadow_id = pgt; ++ sync_data.iommu_ecap = iommu->iommu.ecap; ++ sync_data.vaddr = vaddr; ++ ++ /* Un-present a present PASID Table entry */ ++ if (LAST_LEVEL(level)) { ++ if (iommu_id_sync_entry(&sync_data)) ++ mm_ops->put_page(ptep); ++ return 0; ++ } ++ ++ /* ++ * it's a present entry for PASID DIR, context or root. ++ * its child ptep shall already be freed (the refcnt == 1), if so, we ++ * can un-present itself as well now. 
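++ * (The walker runs with PKVM_PGTABLE_WALK_TABLE_POST, so child levels are always visited before this parent entry.)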
++ */ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ if (mm_ops->page_count(child_ptep) == 1) { ++ if (iommu_id_sync_entry(&sync_data)) { ++ mm_ops->put_page(ptep); ++ mm_ops->put_page(child_ptep); ++ } ++ } ++ ++ return 0; ++} ++ ++/* sync_data != NULL, data != NULL */ ++static int init_sync_id_data(struct id_sync_data *sync_data, ++ struct id_sync_walk_data *data, ++ struct pkvm_iommu *iommu, void *guest_ptep, ++ unsigned long vaddr, int level) ++{ ++ struct pkvm_pgtable *shadow_id = &iommu->pgt; ++ int idx = shadow_id->pgt_ops->pgt_entry_to_index(vaddr, level); ++ int entry_size = shadow_id->pgt_ops->pgt_level_entry_size(level); ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ switch (level) { ++ case IOMMU_PASID_TABLE: ++ sync_data->p_entry = *((struct pasid_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->p_entry; ++ break; ++ case IOMMU_PASID_DIR: ++ sync_data->pd_entry = *((struct pasid_dir_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->pd_entry; ++ break; ++ case IOMMU_SM_CONTEXT: ++ sync_data->ct_entry = *((struct context_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->ct_entry; ++ break; ++ case IOMMU_SM_ROOT: ++ sync_data->root_entry = *((u64 *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->root_entry; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } else { ++ switch (level) { ++ case IOMMU_LM_CONTEXT: ++ sync_data->ct_entry = *((struct context_entry *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->ct_entry; ++ break; ++ case IOMMU_LM_ROOT: ++ sync_data->root_entry = *((u64 *)guest_ptep); ++ sync_data->guest_ptep = &sync_data->root_entry; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ /* shadow_pa of current level must be there */ ++ if (!data->shadow_pa[level]) ++ return -EINVAL; ++ ++ /* get current shadow_ptep */ ++ sync_data->shadow_ptep = shadow_id->mm_ops->phys_to_virt(data->shadow_pa[level]); ++ sync_data->shadow_ptep += idx * entry_size; ++ ++ sync_data->level = level; ++ sync_data->shadow_id = shadow_id; ++ sync_data->iommu_ecap = iommu->iommu.ecap; ++ sync_data->shadow_pa = 0; ++ sync_data->vaddr = vaddr; ++ sync_data->spgt_data = data->spgt_data; ++ ++ return 0; ++} ++ ++static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end); ++static int sync_shadow_id_cb(struct pkvm_pgtable *vpgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *vpgt_ops = vpgt->pgt_ops; ++ struct id_sync_walk_data *data = arg; ++ struct pkvm_iommu *iommu = data->iommu; ++ struct pkvm_pgtable *shadow_id = &iommu->pgt; ++ struct id_sync_data sync_data; ++ void *shadow_ptep, *guest_ptep; ++ bool shadow_p, guest_p; ++ int ret = init_sync_id_data(&sync_data, data, iommu, ptep, vaddr, level); ++ ++ if (ret < 0) ++ return ret; ++ ++ guest_ptep = sync_data.guest_ptep; ++ shadow_ptep = sync_data.shadow_ptep; ++ ++ /* ++ * WALK_TABLE_PRE is for non leaf, WALK_LEAF is for leaf ++ * if not match, it means guest changed it, return -EAGAIN ++ * to re-walk the page table. 
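++ * The caller, sync_shadow_id(), bounds this re-walk to a few retries.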
++ */ ++ if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE && ++ vpgt_ops->pgt_entry_is_leaf(guest_ptep, level)) || ++ (flags == PKVM_PGTABLE_WALK_LEAF && ++ !vpgt_ops->pgt_entry_is_leaf(guest_ptep, level))) ++ return -EAGAIN; ++ ++ shadow_p = shadow_id->pgt_ops->pgt_entry_present(shadow_ptep); ++ guest_p = vpgt_ops->pgt_entry_present(guest_ptep); ++ if (!guest_p) { ++ if (shadow_p) { ++ /* ++ * For the case that guest not present but shadow present, just ++ * simply free the shadow to make them consistent. ++ */ ++ unsigned long new_vaddr_end = shadow_id->pgt_ops->pgt_level_to_size(level) + ++ vaddr; ++ /* ++ * Get a reference count before free to make sure the current page ++ * of this level and the pages of its parent levels won't be freed. ++ * As here we only want to free its specific sub-level. ++ */ ++ shadow_id->mm_ops->get_page(shadow_ptep); ++ free_shadow_id(iommu, vaddr, new_vaddr_end); ++ shadow_id->mm_ops->put_page(shadow_ptep); ++ } ++ /* ++ * As now both guest and shadow are not ++ * present, don't need to do anything more. ++ */ ++ return ret; ++ } ++ ++ if (LAST_LEVEL(level)) { ++ /* ++ * Cache invalidation may want to sync specific PASID entries ++ * (in scalable mode) or context entries (in legacy mode) with ++ * DID matched. In such case we only need to sync the entries ++ * with the matching DID. ++ * ++ * According to vt-d spec 6.2.2.1 and 6.2.3.1, software must ++ * not use domain-id value of 0 when programming entries on ++ * implementations reporting CM=1 in the Capability register. ++ * So non-zero DID means a real DID from host software. ++ */ ++ if (data->did) { ++ u16 did = ecap_smts(iommu->iommu.ecap) ++ ? pasid_get_domain_id(guest_ptep) ++ : context_lm_get_did(guest_ptep); ++ ++ if (did != data->did) ++ return ret; ++ } ++ ++ /* ++ * For a leaf entry, the physical address of its child level ++ * is determined by the pgt used by the corresponding ptdev. ++ * So no need to set sync_data.shadow_pa. ++ */ ++ } else if (!shadow_p) { ++ /* ++ * For a non-present non-leaf (which may be root/context/pasid ++ * dir) entry, needs to allocate a new page to make this entry ++ * present. Root and context page are always one page with 4K ++ * size. As we fixed the pasid to only support 15 bits, ++ * the pasid dir is also one page with 4K size. ++ */ ++ void *shadow = shadow_id->mm_ops->zalloc_page(); ++ ++ if (!shadow) ++ return -ENOMEM; ++ /* Get the shadow id physical address of the child level */ ++ sync_data.shadow_pa = shadow_id->mm_ops->virt_to_phys(shadow); ++ } else ++ /* ++ * For a present non-leaf (which is probably root/context/pasid dir) ++ * entry, get the shadow id physical address of its child level. ++ */ ++ sync_data.shadow_pa = shadow_id->pgt_ops->pgt_entry_to_phys(shadow_ptep); ++ ++ if (iommu_id_sync_entry(&sync_data)) { ++ if (!shadow_p) ++ /* ++ * A non-present to present changing needs to get ++ * a new reference count for the shadow id page. ++ */ ++ shadow_id->mm_ops->get_page(shadow_ptep); ++ } ++ ++ if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE) && (!LAST_LEVEL(level))) { ++ /* ++ * As guest page table walking will go to the child level, pass ++ * the shadow id physical address of the child level to sync. 
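++ * init_sync_id_data() reads it back from shadow_pa[level] at the next level down.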
++ */ ++ data->shadow_pa[level - 1] = sync_data.shadow_pa; ++ } ++ ++ return ret; ++} ++ ++static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end) ++{ ++ struct pkvm_pgtable_walker walker = { ++ .cb = free_shadow_id_cb, ++ .flags = PKVM_PGTABLE_WALK_LEAF | ++ PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ /* ++ * To free the shadow IOMMU page table, walk the shadow IOMMU ++ * page table. ++ */ ++ if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) ++ return 0; ++ ++ return iommu_pgtable_walk(&iommu->pgt, vaddr, vaddr_end, &walker); ++} ++ ++static int sync_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, ++ unsigned long vaddr_end, u16 did, ++ struct shadow_pgt_sync_data *spgt_data) ++{ ++ DEFINE_ID_SYNC_WALK_DATA(arg, iommu, did, spgt_data); ++ struct pkvm_pgtable_walker walker = { ++ .cb = sync_shadow_id_cb, ++ .flags = PKVM_PGTABLE_WALK_TABLE_PRE | ++ PKVM_PGTABLE_WALK_LEAF, ++ .arg = &arg, ++ }; ++ int ret, retry_cnt = 0; ++ ++ if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) ++ return 0; ++ ++retry: ++ if (ecap_smts(iommu->iommu.ecap)) ++ arg.shadow_pa[IOMMU_SM_ROOT] = iommu->pgt.root_pa; ++ else ++ arg.shadow_pa[IOMMU_LM_ROOT] = iommu->pgt.root_pa; ++ /* ++ * To sync the shadow IOMMU page table, walks the guest IOMMU ++ * page table ++ */ ++ ret = iommu_pgtable_walk(&iommu->viommu.pgt, vaddr, vaddr_end, &walker); ++ if ((ret == -EAGAIN) && (retry_cnt++ < 5)) ++ goto retry; ++ ++ return ret; ++} ++ ++static void enable_qi(struct pkvm_iommu *iommu) ++{ ++ void *desc = iommu->qi.desc; ++ int dw, qs; ++ u32 sts; ++ ++ dw = !!ecap_smts(iommu->iommu.ecap); ++ qs = fls(iommu->qi.free_cnt >> (7 + !dw)) - 1; ++ ++ /* Disable QI */ ++ sts = readl(iommu->iommu.reg + DMAR_GSTS_REG); ++ if (sts & DMA_GSTS_QIES) { ++ iommu->iommu.gcmd &= ~DMA_GCMD_QIE; ++ writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, !(sts & DMA_GSTS_QIES), sts); ++ } ++ ++ /* Set tail to 0 */ ++ writel(0, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ /* Set IQA */ ++ iommu->piommu_iqa = pkvm_virt_to_phys(desc) | (dw << 11) | qs; ++ writeq(iommu->piommu_iqa, iommu->iommu.reg + DMAR_IQA_REG); ++ ++ /* Enable QI */ ++ iommu->iommu.gcmd |= DMA_GCMD_QIE; ++ writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & DMA_GSTS_QIES), sts); ++} ++ ++static int create_qi_desc(struct pkvm_iommu *iommu) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ struct q_inval *qi = &iommu->qi; ++ void __iomem *reg = iommu->iommu.reg; ++ ++ pkvm_spinlock_init(&iommu->qi_lock); ++ /* ++ * Before switching the descriptor, need to wait for any pending ++ * invalidation descriptor completed. According to spec 6.5.2, ++ * The invalidation queue is considered quiesced when the queue ++ * is empty (head and tail registers equal) and the last ++ * descriptor completed is an Invalidation Wait Descriptor ++ * (which indicates no invalidation requests are pending in hardware). 
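++ * Only after that is it safe to repoint IQA at the pkvm-owned descriptor queue.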
++ */ ++ while (readq(reg + DMAR_IQH_REG) != ++ readq(reg + DMAR_IQT_REG)) ++ cpu_relax(); ++ ++ viommu->vreg.iqa = viommu->iqa = readq(reg + DMAR_IQA_REG); ++ viommu->vreg.iq_head = readq(reg + DMAR_IQH_REG); ++ viommu->vreg.iq_tail = readq(reg + DMAR_IQT_REG); ++ ++ if (viommu->vreg.gsts & DMA_GSTS_QIES) { ++ struct qi_desc *wait_desc; ++ u64 iqa = viommu->iqa; ++ int shift = IQ_DESC_SHIFT(iqa); ++ int offset = ((viommu->vreg.iq_head >> shift) + ++ IQ_DESC_LEN(iqa) - 1) % IQ_DESC_LEN(iqa); ++ int *desc_status; ++ ++ /* Find out the last descriptor */ ++ wait_desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(iqa)) + (offset << shift); ++ ++ pkvm_dbg("pkvm: viommu iqa 0x%llx head 0x%llx tail 0x%llx qw0 0x%llx qw1 0x%llx", ++ viommu->vreg.iqa, viommu->vreg.iq_head, viommu->vreg.iq_tail, ++ wait_desc->qw0, wait_desc->qw1); ++ ++ if (QI_DESC_TYPE(wait_desc->qw0) != QI_IWD_TYPE) { ++ pkvm_err("pkvm: %s: expect wait desc but 0x%llx\n", ++ __func__, wait_desc->qw0); ++ return -EINVAL; ++ } ++ ++ desc_status = pkvm_phys_to_virt(wait_desc->qw1); ++ /* ++ * Wait until the wait descriptor is completed. ++ * ++ * The desc_status is from host. Checking this in pkvm ++ * relies on host IOMMU driver not to release the ++ * desc_status after it is completed, and this is guaranteed ++ * by the current Linux IOMMU driver. ++ */ ++ while (READ_ONCE(*desc_status) == QI_IN_USE) ++ cpu_relax(); ++ } ++ ++ qi->free_cnt = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc); ++ qi->desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE); ++ if (!qi->desc) ++ return -ENOMEM; ++ ++ qi->desc_status = iommu_zalloc_pages(PKVM_QI_DESC_STATUS_ALIGNED_SIZE); ++ if (!qi->desc_status) { ++ iommu_put_page(qi->desc); ++ return -ENOMEM; ++ } ++ ++ enable_qi(iommu); ++ return 0; ++} ++ ++static int qi_check_fault(struct pkvm_iommu *iommu, int wait_index) ++{ ++ u32 fault; ++ struct q_inval *qi = &iommu->qi; ++ ++ if (qi->desc_status[wait_index] == QI_ABORT) ++ return -EAGAIN; ++ ++ fault = readl(iommu->iommu.reg + DMAR_FSTS_REG); ++ ++ /* ++ * If IQE happens, the head points to the descriptor associated ++ * with the error. No new descriptors are fetched until the IQE ++ * is cleared. ++ */ ++ if (fault & DMA_FSTS_IQE) { ++ writel(DMA_FSTS_IQE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Queue Error (IQE) cleared\n"); ++ } ++ ++ /* ++ * If ITE happens, all pending wait_desc commands are aborted. ++ * No new descriptors are fetched until the ITE is cleared. 
++ */ ++ if (fault & DMA_FSTS_ITE) { ++ writel(DMA_FSTS_ITE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Time-out Error (ITE) cleared\n"); ++ } ++ ++ if (fault & DMA_FSTS_ICE) { ++ writel(DMA_FSTS_ICE, iommu->iommu.reg + DMAR_FSTS_REG); ++ pkvm_dbg("pkvm: Invalidation Completion Error (ICE) cleared\n"); ++ } ++ ++ return 0; ++} ++ ++static void __submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count) ++{ ++ int len = IQ_DESC_LEN(iommu->piommu_iqa), i, wait_index; ++ int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); ++ struct q_inval *qi = &iommu->qi; ++ struct qi_desc *to, *from; ++ int required_cnt = count + 2; ++ void *desc = qi->desc; ++ int *desc_status, rc; ++ ++ pkvm_spin_lock(&iommu->qi_lock); ++ /* ++ * Detect if the free descriptor count is enough or not ++ */ ++ while (qi->free_cnt < required_cnt) { ++ u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; ++ int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; ++ int free_cnt = len - busy_cnt; ++ ++ if (free_cnt >= required_cnt) { ++ qi->free_cnt = free_cnt; ++ break; ++ } ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ for (i = 0; i < count; i++) { ++ from = base + i; ++ to = qi->desc + (((qi->free_head + i) % len) << shift); ++ to->qw0 = from->qw0; ++ to->qw1 = from->qw1; ++ } ++ ++ wait_index = (qi->free_head + count) % len; ++ /* setup wait descriptor */ ++ to = desc + (wait_index << shift); ++ to->qw0 = QI_IWD_STATUS_DATA(QI_DONE) | ++ QI_IWD_STATUS_WRITE | QI_IWD_TYPE; ++ ++ desc_status = &qi->desc_status[wait_index]; ++ WRITE_ONCE(*desc_status, QI_IN_USE); ++ to->qw1 = pkvm_virt_to_phys(desc_status); ++ ++ /* submit to hardware with wait descriptor */ ++ qi->free_cnt -= count + 1; ++ qi->free_head = (qi->free_head + count + 1) % len; ++ writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ while (READ_ONCE(*desc_status) != QI_DONE) { ++ rc = qi_check_fault(iommu, wait_index); ++ if (rc) ++ break; ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ if (*desc_status != QI_DONE) ++ pkvm_err("pkvm: %s: failed with status %d\n", ++ __func__, *desc_status); ++ ++ /* release the free_cnt */ ++ qi->free_cnt += count + 1; ++ ++ pkvm_spin_unlock(&iommu->qi_lock); ++} ++ ++static void submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count) ++{ ++ int max_len = IQ_DESC_LEN(iommu->piommu_iqa) - 2; ++ int submit_count; ++ ++ do { ++ submit_count = count > max_len ? 
max_len : count; ++ __submit_qi(iommu, base, submit_count); ++ ++ count -= submit_count; ++ base += submit_count; ++ } while (count > 0); ++} ++ ++static void flush_context_cache(struct pkvm_iommu *iommu, u16 did, ++ u16 sid, u8 fm, u64 type) ++{ ++ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; ++ ++ desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | ++ QI_CC_GRAN(type) | QI_CC_TYPE; ++ ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void flush_pasid_cache(struct pkvm_iommu *iommu, u16 did, ++ u64 granu, u32 pasid) ++{ ++ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; ++ ++ desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | ++ QI_PC_GRAN(granu) | QI_PC_TYPE; ++ ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void setup_iotlb_qi_desc(struct pkvm_iommu *iommu, ++ struct qi_desc *desc, u16 did, ++ u64 addr, unsigned int size_order, ++ u64 type) ++{ ++ u8 dw = 0, dr = 0; ++ ++ if (cap_write_drain(iommu->iommu.cap)) ++ dw = 1; ++ ++ if (cap_read_drain(iommu->iommu.cap)) ++ dr = 1; ++ ++ desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) | ++ QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; ++ desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_AM(size_order); ++ desc->qw2 = 0; ++ desc->qw3 = 0; ++} ++ ++static void flush_iotlb(struct pkvm_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type) ++{ ++ struct qi_desc desc; ++ ++ setup_iotlb_qi_desc(iommu, &desc, did, addr, size_order, type); ++ submit_qi(iommu, &desc, 1); ++} ++ ++static void set_root_table(struct pkvm_iommu *iommu) ++{ ++ u64 val = iommu->pgt.root_pa; ++ void __iomem *reg = iommu->iommu.reg; ++ u32 sts; ++ ++ /* Set scalable mode */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ val |= DMA_RTADDR_SMT; ++ ++ writeq(val, reg + DMAR_RTADDR_REG); ++ ++ /* ++ * The shadow root table provides identical remapping results comparing ++ * with the previous guest root table, so it is allowed to switch if ++ * Translation Enable Status is still 1 according to IOMMU spec 6.6: ++ * ++ * " ++ * If software sets the root-table pointer while remapping hardware is ++ * active (TES=1 in Global Status register), software must ensure the ++ * structures referenced by the new root-table pointer provide identical ++ * remapping results as the structures referenced by the previous root-table ++ * pointer so that inflight requests are properly translated. ++ * " ++ * ++ * So don't need to turn off TE first before switching. 
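++ * The global context/PASID/IOTLB flushes below then drop anything cached from the old root table.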
++ */ ++ writel(iommu->iommu.gcmd | DMA_GCMD_SRTP, reg + DMAR_GCMD_REG); ++ ++ PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts); ++ ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++} ++ ++static void enable_translation(struct pkvm_iommu *iommu) ++{ ++ void __iomem *reg = iommu->iommu.reg; ++ u32 sts; ++ ++ if (iommu->iommu.gcmd & DMA_GCMD_TE) ++ return; ++ ++ iommu->iommu.gcmd |= DMA_GCMD_TE; ++ ++ writel(iommu->iommu.gcmd, reg + DMAR_GCMD_REG); ++ ++ PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts); ++} ++ ++static void initialize_viommu_reg(struct pkvm_iommu *iommu) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ void __iomem *reg_base = iommu->iommu.reg; ++ ++ vreg->cap = readq(reg_base + DMAR_CAP_REG); ++ vreg->ecap = readq(reg_base + DMAR_ECAP_REG); ++ pkvm_update_iommu_virtual_caps(&vreg->cap, &vreg->ecap); ++ ++ vreg->gsts = readl(reg_base + DMAR_GSTS_REG); ++ vreg->rta = readq(reg_base + DMAR_RTADDR_REG); ++ ++ pkvm_dbg("%s: iommu phys reg 0x%llx cap 0x%llx ecap 0x%llx gsts 0x%x rta 0x%llx\n", ++ __func__, iommu->iommu.reg_phys, vreg->cap, vreg->ecap, vreg->gsts, vreg->rta); ++ ++ /* Invalidate Queue regs are updated when create descriptor */ ++} ++ ++static int activate_iommu(struct pkvm_iommu *iommu) ++{ ++ unsigned long vaddr = 0, vaddr_end = IOMMU_MAX_VADDR; ++ int ret; ++ ++ pkvm_dbg("%s: iommu%d\n", __func__, iommu->iommu.seq_id); ++ ++ pkvm_spin_lock(&iommu->lock); ++ ++ ret = initialize_iommu_pgt(iommu); ++ if (ret) ++ goto out; ++ ++ initialize_viommu_reg(iommu); ++ ++ ret = sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL); ++ if (ret) ++ goto out; ++ ++ ret = create_qi_desc(iommu); ++ if (ret) ++ goto free_shadow; ++ ++ set_root_table(iommu); ++ ++ /* ++ * It is possible that some IOMMU devices do not have memory ++ * remapping translation enabled by the host IOMMU driver during boot ++ * time, so pkvm IOMMU driver needs to make sure this enabled to ++ * guarantee the IO isolation from the devices behind this IOMMU. ++ * ++ */ ++ enable_translation(iommu); ++ ++ iommu->activated = true; ++ root_tbl_walk(iommu); ++ ++ pkvm_spin_unlock(&iommu->lock); ++ return 0; ++ ++free_shadow: ++ free_shadow_id(iommu, vaddr, vaddr_end); ++out: ++ pkvm_spin_unlock(&iommu->lock); ++ return ret; ++} ++ ++static int context_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ u16 sid = QI_DESC_CC_SID(desc->qw0); ++ u16 did = ecap_smts(iommu->iommu.ecap) ? 
0 : QI_DESC_CC_DID(desc->qw0); ++ u64 granu = QI_DESC_CC_GRANU(desc->qw0) << DMA_CCMD_INVL_GRANU_OFFSET; ++ unsigned long start, end; ++ int ret; ++ ++ switch (granu) { ++ case DMA_CCMD_GLOBAL_INVL: ++ start = 0; ++ end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, 0, NULL); ++ break; ++ case DMA_CCMD_DOMAIN_INVL: ++ /* ++ * Domain selective invalidation which is processed by ++ * hardware as global invalidations for scalable mode ++ * according to spec 6.5.2.1 ++ */ ++ start = 0; ++ end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ pkvm_dbg("pkvm: %s: iommu%d: domain selective\n", ++ __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ case DMA_CCMD_DEVICE_INVL: ++ if (ecap_smts(iommu->iommu.ecap)) { ++ start = (unsigned long)sid << DEVFN_SHIFT; ++ end = ((unsigned long)sid + 1) << DEVFN_SHIFT; ++ } else { ++ start = (unsigned long)sid << LM_DEVFN_SHIFT; ++ end = ((unsigned long)sid + 1) << LM_DEVFN_SHIFT; ++ } ++ pkvm_dbg("pkvm: %s: iommu%d: device selective sid 0x%x\n", ++ __func__, iommu->iommu.seq_id, sid); ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalidate granu %lld\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET, ret); ++ return ret; ++} ++ ++static int pasid_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ int pasid = QI_DESC_PC_PASID(desc->qw0); ++ u16 did = QI_DESC_PC_DID(desc->qw0); ++ int granu = QI_DESC_PC_GRANU(desc->qw0); ++ unsigned long start, end; ++ int ret; ++ ++ switch (granu) { ++ case QI_PC_ALL_PASIDS: ++ /* ++ * This is more like a global invalidation but to check ++ * if matching with a specific DID. 
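++ * sync_shadow_id_cb() skips leaf entries whose domain-id does not match.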
++ */ ++ pkvm_dbg("pkvm: %s: iommu%d: ALL_PASID did %d\n", ++ __func__, iommu->iommu.seq_id, did); ++ start = 0; ++ end = IOMMU_MAX_VADDR; ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ break; ++ case QI_PC_PASID_SEL: { ++ /* ++ * Sync specific PASID entry for all contexts ++ */ ++ u64 bdf, end_bdf = 0x10000; ++ ++ pkvm_dbg("pkvm: %s: iommu%d: PASID_SEL did %d pasid 0x%x\n", ++ __func__, iommu->iommu.seq_id, did, pasid); ++ for (bdf = 0; bdf < end_bdf; bdf++) { ++ start = (bdf << DEVFN_SHIFT) + pasid; ++ end = start + 1; ++ ret = sync_shadow_id(iommu, start, end, did, NULL); ++ if (ret) ++ break; ++ } ++ break; ++ } ++ case QI_PC_GLOBAL: ++ start = 0; ++ end = IOMMU_MAX_VADDR; ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, start, end, 0, NULL); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid granularity %d 0x%llx\n", ++ __func__, iommu->iommu.seq_id, granu, desc->qw0); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %d failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu, ret); ++ ++ return ret; ++} ++ ++static int iotlb_lm_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ u16 did = QI_DESC_IOTLB_DID(desc->qw0); ++ u64 granu = QI_DESC_IOTLB_GRANU(desc->qw0) << DMA_TLB_FLUSH_GRANU_OFFSET; ++ u64 addr = QI_DESC_IOTLB_ADDR(desc->qw1); ++ u64 mask = ((u64)-1) << (VTD_PAGE_SHIFT + QI_DESC_IOTLB_AM(desc->qw1)); ++ struct shadow_pgt_sync_data data; ++ struct pkvm_ptdev *p; ++ int ret; ++ ++ switch (granu) { ++ case DMA_TLB_GLOBAL_FLUSH: ++ pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); ++ ret = sync_shadow_id(iommu, 0, IOMMU_LM_MAX_VADDR, 0, NULL); ++ break; ++ case DMA_TLB_DSI_FLUSH: ++ pkvm_dbg("pkvm: %s: iommu%d: domain selective did %u\n", ++ __func__, iommu->iommu.seq_id, did); ++ ++ /* optimization: walk just the needed devices, not the entire bdf space */ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) ++ if (p->did == did) ++ ret = sync_shadow_id(iommu, p->bdf, p->bdf + 1, did, NULL); ++ break; ++ case DMA_TLB_PSI_FLUSH: ++ data.vaddr = addr & mask; ++ data.vaddr_end = (addr | ~mask) + 1; ++ pkvm_dbg("pkvm: %s: iommu%d: page selective did %u start 0x%lx end 0x%lx\n", ++ __func__, iommu->iommu.seq_id, did, data.vaddr, data.vaddr_end); ++ ++ /* optimization: walk just the needed devices, not the entire bdf space */ ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) ++ if (p->did == did) ++ ret = sync_shadow_id(iommu, p->bdf, p->bdf + 1, did, &data); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid granularity %lld\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_TLB_FLUSH_GRANU_OFFSET); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n", ++ __func__, iommu->iommu.seq_id, granu >> DMA_TLB_FLUSH_GRANU_OFFSET, ret); ++ ++ return ret; ++} ++ ++static int handle_descriptor(struct pkvm_iommu *iommu, struct qi_desc *desc) ++{ ++ int type = QI_DESC_TYPE(desc->qw0); ++ int ret = 0; ++ ++ switch (type) { ++ /* ++ * TODO: is it necessary to intercept the ++ * PGRP_RESP & PSTRM_RESP? 
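++ * For now these descriptor types are passed through to hardware unmodified.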
++ */ ++ case QI_PGRP_RESP_TYPE: ++ case QI_PSTRM_RESP_TYPE: ++ case QI_DIOTLB_TYPE: ++ case QI_DEIOTLB_TYPE: ++ case QI_IEC_TYPE: ++ case QI_IWD_TYPE: ++ case QI_EIOTLB_TYPE: ++ break; ++ case QI_CC_TYPE: ++ ret = context_cache_invalidate(iommu, desc); ++ break; ++ case QI_PC_TYPE: ++ ret = pasid_cache_invalidate(iommu, desc); ++ break; ++ case QI_IOTLB_TYPE: ++ if (!ecap_smts(iommu->iommu.ecap)) ++ ret = iotlb_lm_invalidate(iommu, desc); ++ break; ++ default: ++ pkvm_err("pkvm: %s: iommu%d: invalid type %d desc addr 0x%llx val 0x%llx\n", ++ __func__, iommu->iommu.seq_id, type, (u64)desc, desc->qw0); ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static void handle_qi_submit(struct pkvm_iommu *iommu, void *vdesc, int vhead, int count) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ int vlen = IQ_DESC_LEN(viommu->iqa); ++ int vshift = IQ_DESC_SHIFT(viommu->iqa); ++ int len = IQ_DESC_LEN(iommu->piommu_iqa); ++ int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); ++ struct q_inval *qi = &iommu->qi; ++ struct qi_desc *to, *from; ++ int required_cnt = count + 1, i; ++ ++ pkvm_spin_lock(&iommu->qi_lock); ++ /* ++ * Detect if the free descriptor count is enough or not ++ */ ++ while (qi->free_cnt < required_cnt) { ++ u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; ++ int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; ++ int free_cnt = len - busy_cnt; ++ ++ if (free_cnt >= required_cnt) { ++ qi->free_cnt = free_cnt; ++ break; ++ } ++ pkvm_spin_unlock(&iommu->qi_lock); ++ cpu_relax(); ++ pkvm_spin_lock(&iommu->qi_lock); ++ } ++ ++ for (i = 0; i < count; i++) { ++ from = vdesc + (((vhead + i) % vlen) << vshift); ++ to = qi->desc + (((qi->free_head + i) % len) << shift); ++ ++ to->qw0 = from->qw0; ++ to->qw1 = from->qw1; ++ } ++ ++ /* ++ * Reuse the desc_status from host so that host can poll ++ * the desc_status itself instead of waiting in pkvm. ++ */ ++ qi->free_cnt -= count; ++ qi->free_head = (qi->free_head + count) % len; ++ writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); ++ ++ pkvm_spin_unlock(&iommu->qi_lock); ++} ++ ++static int handle_qi_invalidation(struct pkvm_iommu *iommu, unsigned long val) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ u64 viommu_iqa = viommu->iqa; ++ struct qi_desc *wait_desc; ++ int len = IQ_DESC_LEN(viommu_iqa); ++ int shift = IQ_DESC_SHIFT(viommu_iqa); ++ int head = viommu->vreg.iq_head >> shift; ++ int count, i, ret = 0; ++ int *desc_status; ++ void *desc; ++ ++ viommu->vreg.iq_tail = val; ++ desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(viommu_iqa)); ++ count = ((val >> shift) + len - head) % len; ++ ++ for (i = 0; i < count; i++) { ++ viommu->vreg.iq_head = ((head + i) % len) << shift; ++ ret = handle_descriptor(iommu, desc + viommu->vreg.iq_head); ++ if (ret) ++ break; ++ } ++ /* update iq_head */ ++ viommu->vreg.iq_head = val; ++ ++ if (likely(!ret)) { ++ /* ++ * Submit the descriptor to hardware. The desc_status ++ * will be taken care by hardware. ++ */ ++ handle_qi_submit(iommu, desc, head, count); ++ } else { ++ pkvm_err("pkvm: %s: failed with ret %d\n", __func__, ret); ++ /* ++ * The descriptor seems invalid. Mark the desc_status as ++ * QI_ABORT to make sure host driver won't be blocked. 
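++ * (The host polls this status word itself; see handle_qi_submit().)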
++ */ ++ wait_desc = desc + (((head + count - 1) % len) << shift); ++ if (QI_DESC_TYPE(wait_desc->qw0) == QI_IWD_TYPE) { ++ desc_status = pkvm_phys_to_virt(wait_desc->qw1); ++ WRITE_ONCE(*desc_status, QI_ABORT); ++ } ++ } ++ ++ return ret; ++} ++ ++static void handle_gcmd_te(struct pkvm_iommu *iommu, bool en) ++{ ++ unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ ++ if (en) { ++ viommu->vreg.gsts |= DMA_GSTS_TES; ++ /* ++ * Sync shadow id table to emulate Translation enable. ++ */ ++ if (sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL)) ++ return; ++ pkvm_dbg("pkvm: %s: enable TE\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Free shadow to emulate Translation disable. ++ * ++ * Not really disable translation as still ++ * need to protect against the device. ++ */ ++ free_shadow_id(iommu, vaddr, vaddr_end); ++ viommu->vreg.gsts &= ~DMA_GSTS_TES; ++ pkvm_dbg("pkvm: %s: disable TE\n", __func__); ++out: ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ ++ root_tbl_walk(iommu); ++} ++ ++static void handle_gcmd_srtp(struct pkvm_iommu *iommu) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; ++ ++ vreg->gsts &= ~DMA_GSTS_RTPS; ++ ++ /* Set the root table phys address from vreg */ ++ vpgt->root_pa = vreg->rta & VTD_PAGE_MASK; ++ ++ pkvm_dbg("pkvm: %s: set SRTP val 0x%llx\n", __func__, vreg->rta); ++ ++ if (vreg->gsts & DMA_GSTS_TES) { ++ unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); ++ ++ /* TE is already enabled, sync shadow */ ++ if (sync_shadow_id(iommu, vaddr, vaddr_end, 0, NULL)) ++ return; ++ ++ flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ } ++ ++ vreg->gsts |= DMA_GSTS_RTPS; ++ ++ root_tbl_walk(iommu); ++} ++ ++static void handle_gcmd_qie(struct pkvm_iommu *iommu, bool en) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ ++ if (en) { ++ if (vreg->iq_tail != 0) { ++ pkvm_err("pkvm: Queue invalidation descriptor tail is not zero\n"); ++ return; ++ } ++ ++ /* Update the iqa from vreg */ ++ iommu->viommu.iqa = vreg->iqa; ++ vreg->iq_head = 0; ++ vreg->gsts |= DMA_GSTS_QIES; ++ pkvm_dbg("pkvm: %s: enabled QI\n", __func__); ++ return; ++ } ++ ++ if (vreg->iq_head != vreg->iq_tail) { ++ pkvm_err("pkvm: Queue invalidation descriptor is not empty yet\n"); ++ return; ++ } ++ ++ vreg->iq_head = 0; ++ vreg->gsts &= ~DMA_GSTS_QIES; ++ pkvm_dbg("pkvm: %s: disabled QI\n", __func__); ++} ++ ++static void handle_gcmd_direct(struct pkvm_iommu *iommu, u32 val) ++{ ++ struct viommu_reg *vreg = &iommu->viommu.vreg; ++ unsigned long changed = ((vreg->gsts ^ val) & DMAR_GCMD_DIRECT) & ++ DMAR_GSTS_EN_BITS; ++ unsigned long set = (val & DMAR_GCMD_DIRECT) & ~DMAR_GSTS_EN_BITS; ++ u32 cmd, gcmd, sts; ++ int bit; ++ ++ if ((changed | set) & DMAR_GCMD_PROTECTED) { ++ pkvm_dbg("pkvm:%s touching protected bits changed 0x%lx set 0x%lx\n", ++ __func__, changed, set); ++ return; ++ } ++ ++ if (changed) { ++ pkvm_dbg("pkvm: %s: changed 0x%lx\n", __func__, changed); ++ gcmd = READ_ONCE(iommu->iommu.gcmd); ++ for_each_set_bit(bit, &changed, BITS_PER_BYTE * sizeof(vreg->gsts)) { ++ cmd = 1 << bit; ++ if (val & cmd) { ++ /* enable */ ++ gcmd |= cmd; ++ 
writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & cmd), sts); ++ vreg->gsts |= cmd; ++ pkvm_dbg("pkvm: %s: enable cmd bit %d\n", __func__, bit); ++ } else { ++ /* disable */ ++ gcmd &= ~cmd; ++ writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, !(sts & cmd), sts); ++ vreg->gsts &= ~cmd; ++ pkvm_dbg("pkvm: %s: disable cmd bit %d\n", __func__, bit); ++ } ++ } ++ WRITE_ONCE(iommu->iommu.gcmd, gcmd); ++ } ++ ++ if (set) { ++ pkvm_dbg("pkvm: %s: set 0x%lx\n", __func__, set); ++ gcmd = READ_ONCE(iommu->iommu.gcmd); ++ for_each_set_bit(bit, &set, BITS_PER_BYTE * sizeof(vreg->gsts)) { ++ cmd = 1 << bit; ++ vreg->gsts &= ~cmd; ++ writel(gcmd | cmd, iommu->iommu.reg + DMAR_GCMD_REG); ++ PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, ++ readl, (sts & cmd), sts); ++ vreg->gsts |= cmd; ++ pkvm_dbg("pkvm: %s: set cmd bit %d\n", __func__, bit); ++ } ++ } ++} ++ ++static void handle_global_cmd(struct pkvm_iommu *iommu, u32 val) ++{ ++ u32 changed = iommu->viommu.vreg.gsts ^ val; ++ ++ pkvm_dbg("pkvm: iommu%d: handle gcmd val 0x%x gsts 0x%x changed 0x%x\n", ++ iommu->iommu.seq_id, val, iommu->viommu.vreg.gsts, changed); ++ ++ if (changed & DMA_GCMD_TE) ++ handle_gcmd_te(iommu, !!(val & DMA_GCMD_TE)); ++ ++ if (val & DMA_GCMD_SRTP) ++ handle_gcmd_srtp(iommu); ++ ++ if (changed & DMA_GCMD_QIE) ++ handle_gcmd_qie(iommu, !!(val & DMA_GCMD_QIE)); ++ ++ handle_gcmd_direct(iommu, val); ++} ++ ++static struct pkvm_iommu *find_iommu_by_reg_phys(unsigned long phys) ++{ ++ struct pkvm_iommu *iommu; ++ ++ for_each_valid_iommu(iommu) { ++ if ((phys >= iommu->iommu.reg_phys) && ++ (phys < (iommu->iommu.reg_phys + iommu->iommu.reg_size))) ++ return iommu; ++ } ++ ++ return NULL; ++} ++ ++static unsigned long direct_access_iommu_mmio(struct pkvm_iommu *iommu, ++ bool is_read, int len, ++ unsigned long phys, ++ unsigned long val) ++{ ++ unsigned long offset = phys - iommu->iommu.reg_phys; ++ void *reg = iommu->iommu.reg + offset; ++ unsigned long ret = 0; ++ ++ switch (len) { ++ case 4: ++ if (is_read) ++ ret = (unsigned long)readl(reg); ++ else ++ writel((u32)val, reg); ++ break; ++ case 8: ++ if (is_read) ++ ret = (unsigned long)readq(reg); ++ else ++ writeq((u64)val, reg); ++ break; ++ default: ++ pkvm_err("%s: %s: unsupported len %d\n", __func__, ++ is_read ? 
"read" : "write", len); ++ break; ++ } ++ ++ return ret; ++} ++ ++static unsigned long access_iommu_mmio(struct pkvm_iommu *iommu, bool is_read, ++ int len, unsigned long phys, ++ unsigned long val) ++{ ++ struct pkvm_viommu *viommu = &iommu->viommu; ++ unsigned long offset = phys - iommu->iommu.reg_phys; ++ unsigned long ret = 0; ++ ++ /* pkvm IOMMU driver is not activated yet, so directly access MMIO */ ++ if (unlikely(!iommu->activated)) ++ return direct_access_iommu_mmio(iommu, is_read, len, phys, val); ++ ++ /* Only need to emulate part of the MMIO */ ++ switch (offset) { ++ case DMAR_CAP_REG: ++ if (is_read) ++ ret = viommu->vreg.cap; ++ break; ++ case DMAR_ECAP_REG: ++ if (is_read) ++ ret = viommu->vreg.ecap; ++ break; ++ case DMAR_GCMD_REG: ++ if (is_read) ++ ret = 0; ++ else ++ handle_global_cmd(iommu, val); ++ break; ++ case DMAR_GSTS_REG: ++ if (is_read) ++ ret = viommu->vreg.gsts; ++ break; ++ case DMAR_RTADDR_REG: ++ if (is_read) ++ ret = viommu->vreg.rta; ++ else ++ viommu->vreg.rta = val; ++ break; ++ case DMAR_IQA_REG: ++ if (is_read) ++ ret = viommu->vreg.iqa; ++ else ++ viommu->vreg.iqa = val; ++ break; ++ case DMAR_IQH_REG: ++ if (is_read) ++ ret = viommu->vreg.iq_head; ++ break; ++ case DMAR_IQT_REG: ++ if (is_read) ++ ret = viommu->vreg.iq_tail; ++ else { ++ if (viommu->vreg.gsts & DMA_GSTS_QIES) ++ ret = handle_qi_invalidation(iommu, val); ++ else ++ viommu->vreg.iq_tail = val; ++ } ++ break; ++ default: ++ /* Not emulated MMIO can directly goes to hardware */ ++ ret = direct_access_iommu_mmio(iommu, is_read, len, phys, val); ++ break; ++ } ++ ++ return ret; ++} ++ ++unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long phys, unsigned long val) ++{ ++ struct pkvm_iommu *pkvm_iommu = find_iommu_by_reg_phys(phys); ++ unsigned long ret; ++ ++ if (!pkvm_iommu) { ++ pkvm_err("%s: cannot find pkvm iommu for reg 0x%lx\n", ++ __func__, phys); ++ return 0; ++ } ++ ++ pkvm_spin_lock(&pkvm_iommu->lock); ++ ret = access_iommu_mmio(pkvm_iommu, is_read, len, phys, val); ++ pkvm_spin_unlock(&pkvm_iommu->lock); ++ ++ return ret; ++} ++ ++int pkvm_activate_iommu(void) ++{ ++ struct pkvm_iommu *iommu; ++ int ret = 0; ++ ++ for_each_valid_iommu(iommu) { ++ ret = activate_iommu(iommu); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end) ++{ ++ struct pkvm_iommu *iommu; ++ ++ for_each_valid_iommu(iommu) { ++ if (end < iommu->iommu.reg_phys || ++ start > (iommu->iommu.reg_phys + iommu->iommu.reg_size - 1)) ++ continue; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * TODO: ++ * Currently assume that the bdf/pasid has ever been synced ++ * so that the IOMMU can be found. If not synced, then cannot ++ * get a valid IOMMU by calling this function. ++ * ++ * To handle this case, pKVM IOMMU driver needs to check the ++ * DMAR to know which IOMMU should be used for this bdf/pasid. ++ */ ++static struct pkvm_iommu *bdf_pasid_to_iommu(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu, *find = NULL; ++ struct pkvm_ptdev *p; ++ ++ for_each_valid_iommu(iommu) { ++ pkvm_spin_lock(&iommu->lock); ++ list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { ++ if (match_ptdev(p, bdf, pasid)) { ++ find = iommu; ++ break; ++ } ++ } ++ pkvm_spin_unlock(&iommu->lock); ++ if (find) ++ break; ++ } ++ ++ return find; ++} ++ ++/* ++ * pkvm_iommu_sync() - Sync IOMMU context/pasid entry according to a ptdev ++ * ++ * @bdf/pasid: The corresponding IOMMU page table entry needs to sync. 
++ */ ++int pkvm_iommu_sync(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid); ++ unsigned long id_addr, id_addr_end; ++ struct pkvm_ptdev *ptdev; ++ u16 old_did; ++ int ret; ++ ++ if (!iommu) ++ return -ENODEV; ++ ++ ptdev = pkvm_get_ptdev(bdf, pasid); ++ if (!ptdev) ++ return -ENODEV; ++ ++ old_did = ptdev->did; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ id_addr = ((unsigned long)bdf << DEVFN_SHIFT) | ++ ((unsigned long)pasid & ((1UL << MAX_NR_PASID_BITS) - 1)); ++ id_addr_end = id_addr + 1; ++ } else { ++ id_addr = (unsigned long)bdf << LM_DEVFN_SHIFT; ++ id_addr_end = ((unsigned long)bdf + 1) << LM_DEVFN_SHIFT; ++ } ++ ++ pkvm_spin_lock(&iommu->lock); ++ ret = sync_shadow_id(iommu, id_addr, id_addr_end, 0, NULL); ++ if (!ret) { ++ if (old_did != ptdev->did) { ++ /* Flush pasid cache and IOTLB for the valid old_did */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, old_did, QI_PC_PASID_SEL, pasid); ++ else ++ flush_context_cache(iommu, old_did, 0, 0, DMA_CCMD_DOMAIN_INVL); ++ flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH); ++ } ++ ++ /* Flush pasid cache and IOTLB to make sure no stale TLB for the new did */ ++ if (ecap_smts(iommu->iommu.ecap)) ++ flush_pasid_cache(iommu, ptdev->did, QI_PC_PASID_SEL, pasid); ++ else ++ flush_context_cache(iommu, ptdev->did, 0, 0, DMA_CCMD_DOMAIN_INVL); ++ flush_iotlb(iommu, ptdev->did, 0, 0, DMA_TLB_DSI_FLUSH); ++ } ++ pkvm_spin_unlock(&iommu->lock); ++ ++ pkvm_put_ptdev(ptdev); ++ return ret; ++} ++ ++bool pkvm_iommu_coherency(u16 bdf, u32 pasid) ++{ ++ struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid); ++ ++ /* ++ * If cannot find a valid IOMMU by bdf/pasid, return ++ * false to present noncoherent, so that can guarantee ++ * the coherency through flushing cache by pkvm itself. ++ */ ++ if (!iommu) ++ return false; ++ ++ return iommu_coherency(iommu->iommu.ecap); ++} ++ ++struct iotlb_flush_data { ++ unsigned long desired_root_pa; ++ unsigned long addr; ++ int size_order; ++ struct qi_desc *desc; ++ int desc_max_index; ++}; ++ ++static void iommu_flush_iotlb(struct pkvm_iommu *iommu, struct iotlb_flush_data *data) ++{ ++ struct pkvm_ptdev *ptdev; ++ struct qi_desc *desc = data->desc; ++ int qi_desc_index = 0; ++ ++ pkvm_spin_lock(&iommu->lock); ++ ++ /* No need to flush IOTLB if there is no device on this IOMMU */ ++ if (list_empty(&iommu->ptdev_head)) ++ goto out; ++ ++ /* ++ * If the descriptor buffer is NULL, pKVM has to submit the QI ++ * request one by one which may be slow if there are a lot of ++ * devices connected to this IOMMU unit. So in this case, choose ++ * to submit one single global flush request to flush the IOTLB ++ * for all the devices. ++ */ ++ if (!desc) { ++ flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); ++ goto out; ++ } ++ ++ /* Flush per domain */ ++ list_for_each_entry(ptdev, &iommu->ptdev_head, iommu_node) { ++ struct qi_desc *tmp = desc; ++ bool did_exist = false; ++ int i; ++ ++ if (!ptdev->pgt || ptdev->pgt->root_pa != data->desired_root_pa) ++ continue; ++ ++ for (i = 0; i < qi_desc_index; i++, tmp++) { ++ /* The same did is already in descriptor page */ ++ if (ptdev->did == QI_DESC_IOTLB_DID(tmp->qw0)) { ++ did_exist = true; ++ break; ++ } ++ } ++ ++ if (did_exist) ++ continue; ++ /* ++ * Setup the page-selective or domain-selective qi descriptor ++ * based on IOMMU capability, and submit to HW when qi descriptor ++ * number reaches to the maximum count. 
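++ * Duplicate DIDs were filtered out above, so each domain is flushed at most once per call.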
++ */ ++ if (cap_pgsel_inv(iommu->iommu.cap) && ++ data->size_order <= cap_max_amask_val(iommu->iommu.cap)) ++ setup_iotlb_qi_desc(iommu, desc + qi_desc_index++, ++ ptdev->did, data->addr, data->size_order, ++ DMA_TLB_PSI_FLUSH); ++ else ++ setup_iotlb_qi_desc(iommu, desc + qi_desc_index++, ++ ptdev->did, 0, 0, ++ DMA_TLB_DSI_FLUSH); ++ ++ if (qi_desc_index == data->desc_max_index) { ++ submit_qi(iommu, desc, qi_desc_index); ++ qi_desc_index = 0; ++ } ++ } ++ ++ if (qi_desc_index) ++ submit_qi(iommu, desc, qi_desc_index); ++out: ++ pkvm_spin_unlock(&iommu->lock); ++} ++ ++void pkvm_iommu_flush_iotlb(struct pkvm_pgtable *pgt, unsigned long addr, unsigned long size) ++{ ++ int size_order = ilog2(__roundup_pow_of_two(size >> VTD_PAGE_SHIFT)); ++ struct iotlb_flush_data data = { ++ .desired_root_pa = pgt->root_pa, ++ .addr = ALIGN_DOWN(addr, (1ULL << (VTD_PAGE_SHIFT + size_order))), ++ .size_order = size_order, ++ }; ++ struct pkvm_iommu *iommu; ++ ++ data.desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE); ++ if (data.desc) ++ /* Reserve space for one wait desc and one desc between head and tail */ ++ data.desc_max_index = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc) - 2; ++ ++ for_each_valid_iommu(iommu) ++ iommu_flush_iotlb(iommu, &data); ++ ++ if (data.desc) ++ iommu_put_page(data.desc); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu.h +new file mode 100644 +index 000000000000..dd7fc31373e0 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu.h +@@ -0,0 +1,16 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_IOMMU_H_ ++#define _PKVM_IOMMU_H_ ++ ++int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages); ++unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long reg, unsigned long val); ++bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end); ++int pkvm_activate_iommu(void); ++int pkvm_iommu_sync(u16 bdf, u32 pasid); ++bool pkvm_iommu_coherency(u16 bdf, u32 pasid); ++void pkvm_iommu_flush_iotlb(struct pkvm_pgtable *pgt, unsigned long addr, unsigned long size); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c +new file mode 100644 +index 000000000000..9dfadadc2b74 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_debug.c +@@ -0,0 +1,199 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include "debug.h" ++#include "memory.h" ++#include "pgtable.h" ++#include "ept.h" ++#include "pkvm_hyp.h" ++#include "iommu_internal.h" ++ ++struct tbl_walk { ++ u16 bus; ++ u16 devfn; ++ u32 pasid; ++ struct root_entry *rt_entry; ++ struct context_entry *ctx_entry; ++ struct pasid_entry *pasid_tbl_entry; ++}; ++ ++#define PASID_PDE_SHIFT 6 ++#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT) ++#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) ++ ++static inline struct pasid_dir_entry *context_entry_present(struct context_entry *ce) ++{ ++ if (!(READ_ONCE(ce->lo) & 1)) ++ return NULL; ++ ++ return pkvm_phys_to_virt(READ_ONCE(ce->lo) & VTD_PAGE_MASK); ++} ++ ++/* Get PRESENT bit of a PASID directory entry. */ ++static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde) ++{ ++ return READ_ONCE(pde->val) & 1; ++} ++ ++/* Get PASID table from a PASID directory entry. 
*/ ++static inline struct pasid_entry * ++get_pasid_table_from_pde(struct pasid_dir_entry *pde) ++{ ++ if (!pasid_pde_is_present(pde)) ++ return NULL; ++ ++ return pkvm_phys_to_virt(READ_ONCE(pde->val) & VTD_PAGE_MASK); ++} ++ ++static struct context_entry *context_addr(struct pkvm_iommu *iommu, u8 bus, u8 devfn) ++{ ++ struct root_entry *root_entry = pkvm_phys_to_virt(iommu->pgt.root_pa); ++ struct root_entry *root = &root_entry[bus]; ++ struct context_entry *context; ++ u64 *entry; ++ ++ entry = &root->lo; ++ if (ecap_smts(iommu->iommu.ecap)) { ++ if (devfn >= 0x80) { ++ devfn -= 0x80; ++ entry = &root->hi; ++ } ++ devfn *= 2; ++ } ++ ++ if (*entry & 1) ++ context = pkvm_phys_to_virt(*entry & VTD_PAGE_MASK); ++ else ++ return NULL; ++ ++ return &context[devfn]; ++} ++ ++static inline void print_tbl_walk(struct tbl_walk *tbl_wlk) ++{ ++ /* ++ * A legacy mode DMAR doesn't support PASID, hence default it to -1 ++ * indicating that it's invalid. Also, default all PASID related fields ++ * to 0. ++ */ ++ if (!tbl_wlk->pasid_tbl_entry) ++ pkvm_dbg("%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t%-6d\t0x%016llx:0x%016llx:0x%016llx\n", ++ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn), ++ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi, ++ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi, ++ tbl_wlk->ctx_entry->lo, -1, ++ (u64)0, (u64)0, (u64)0); ++ else ++ pkvm_dbg("%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t%-6d\t0x%016llx:0x%016llx:0x%016llx\n", ++ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn), ++ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi, ++ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi, ++ tbl_wlk->ctx_entry->lo, tbl_wlk->pasid, ++ tbl_wlk->pasid_tbl_entry->val[2], ++ tbl_wlk->pasid_tbl_entry->val[1], ++ tbl_wlk->pasid_tbl_entry->val[0]); ++} ++ ++static void pasid_tbl_walk(struct tbl_walk *tbl_wlk, struct pasid_entry *tbl_entry, u16 dir_idx) ++{ ++ u8 tbl_idx; ++ ++ for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) { ++ if (pasid_pte_is_present(tbl_entry)) { ++ tbl_wlk->pasid_tbl_entry = tbl_entry; ++ tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx; ++ print_tbl_walk(tbl_wlk); ++ } ++ ++ tbl_entry++; ++ } ++} ++ ++static void pasid_dir_walk(struct tbl_walk *tbl_wlk, u64 pasid_dir_ptr, ++ u16 pasid_dir_size) ++{ ++ struct pasid_dir_entry *dir_entry = pkvm_phys_to_virt(pasid_dir_ptr); ++ struct pasid_entry *pasid_tbl; ++ u16 dir_idx; ++ ++ for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) { ++ pasid_tbl = get_pasid_table_from_pde(dir_entry); ++ if (pasid_tbl) ++ pasid_tbl_walk(tbl_wlk, pasid_tbl, dir_idx); ++ ++ dir_entry++; ++ } ++} ++ ++static void ctx_tbl_walk(struct pkvm_iommu *iommu, u16 bus) ++{ ++ struct root_entry *root_entry = pkvm_phys_to_virt(iommu->pgt.root_pa); ++ struct context_entry *context; ++ u16 devfn, pasid_dir_size; ++ u64 pasid_dir_ptr; ++ ++ for (devfn = 0; devfn < 256; devfn++) { ++ struct tbl_walk tbl_wlk = {0}; ++ ++ /* ++ * Scalable mode root entry points to upper scalable mode ++ * context table and lower scalable mode context table. Each ++ * scalable mode context table has 128 context entries whereas ++ * legacy mode context table has 256 context entries. So in ++ * scalable mode, the context entries for the former 128 devices are ++ * in the lower scalable mode context table, while the latter ++ * 128 devices are in the upper scalable mode context table. 
++ * In scalable mode, when devfn > 127, iommu_context_addr() ++ * automatically refers to the upper scalable mode context table and ++ * hence the caller doesn't have to worry about differences ++ * between scalable mode and non scalable mode. ++ */ ++ context = context_addr(iommu, bus, devfn); ++ if (!context) ++ return; ++ ++ if (!context_entry_present(context)) ++ continue; ++ ++ tbl_wlk.bus = bus; ++ tbl_wlk.devfn = devfn; ++ tbl_wlk.rt_entry = &root_entry[bus]; ++ tbl_wlk.ctx_entry = context; ++ ++ if (ecap_smts(iommu->iommu.ecap)) { ++ pasid_dir_ptr = context->lo & VTD_PAGE_MASK; ++ pasid_dir_size = get_pasid_dir_size(context); ++ pasid_dir_walk(&tbl_wlk, pasid_dir_ptr, pasid_dir_size); ++ continue; ++ } ++ ++ print_tbl_walk(&tbl_wlk); ++ } ++} ++ ++void root_tbl_walk(struct pkvm_iommu *iommu) ++{ ++ u16 bus; ++ ++ pkvm_dbg("IOMMU %d: Root Table Address: 0x%llx\n", ++ iommu->iommu.seq_id, (u64)iommu->pgt.root_pa); ++ pkvm_dbg("B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n"); ++ ++ /* ++ * No need to check if the root entry is present or not because ++ * iommu_context_addr() performs the same check before returning ++ * context entry. ++ */ ++ for (bus = 0; bus < 256; bus++) ++ ctx_tbl_walk(iommu, bus); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h +new file mode 100644 +index 000000000000..35b78fe21d48 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_internal.h +@@ -0,0 +1,347 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_IOMMU_INTERNAL_H ++#define __PKVM_IOMMU_INTERNAL_H ++ ++#include ++#include ++#include ++#include "pgtable.h" ++ ++#define PKVM_QI_DESC_ALIGNED_SIZE ALIGN(QI_LENGTH * sizeof(struct qi_desc), PAGE_SIZE) ++#define PKVM_QI_DESC_STATUS_ALIGNED_SIZE ALIGN(QI_LENGTH * sizeof(int), PAGE_SIZE) ++ ++struct viommu_reg { ++ u64 cap; ++ u64 ecap; ++ u32 gsts; ++ u64 rta; ++ u64 iq_head; ++ u64 iq_tail; ++ u64 iqa; ++}; ++ ++struct pkvm_viommu { ++ struct pkvm_pgtable pgt; ++ struct viommu_reg vreg; ++ u64 iqa; ++}; ++ ++struct pkvm_iommu { ++ struct intel_iommu iommu; ++ pkvm_spinlock_t lock; ++ bool activated; ++ struct pkvm_pgtable pgt; ++ struct pkvm_viommu viommu; ++ ++ struct q_inval qi; ++ pkvm_spinlock_t qi_lock; ++ u64 piommu_iqa; ++ ++ /* Link ptdev information of this IOMMU */ ++ struct list_head ptdev_head; ++}; ++ ++enum lm_level { ++ IOMMU_LM_CONTEXT = 1, ++ IOMMU_LM_ROOT, ++}; ++ ++enum sm_level { ++ IOMMU_PASID_TABLE = 1, ++ IOMMU_PASID_DIR, ++ IOMMU_SM_CONTEXT, ++ IOMMU_SM_ROOT, ++ IOMMU_SM_LEVEL_NUM, ++}; ++ ++#define LAST_LEVEL(level) \ ++ (((level) == 1) ? 
true : false) ++ ++#define LM_DEVFN_BITS 8 ++#define LM_DEVFN_SHIFT 0 ++ ++#define LM_BUS_BITS 8 ++#define LM_BUS_SHIFT 8 ++#define IOMMU_LM_MAX_VADDR BIT(16) ++ ++#define PASID_PTE_PRESENT 1 ++#define PASID_PTE_FPD 2 ++#define MAX_NR_PASID_BITS PKVM_MAX_PASID_BITS ++ ++#define PASIDTAB_BITS 6 ++#define PASIDTAB_SHIFT 0 ++ ++#define PASIDDIR_BITS (MAX_NR_PASID_BITS - PASIDTAB_BITS) ++#define PASIDDIR_SHIFT PASIDTAB_BITS ++ ++#define DEVFN_BITS 8 ++#define DEVFN_SHIFT (PASIDDIR_SHIFT + PASIDDIR_BITS) ++ ++#define BUS_BITS 8 ++#define BUS_SHIFT (DEVFN_SHIFT + DEVFN_BITS) ++ ++/* Used to calculate the level-to-index */ ++#define SM_DEVFN_BITS 7 ++#define SM_BUS_BITS 9 ++#define SM_BUS_SHIFT (DEVFN_SHIFT + SM_DEVFN_BITS) ++ ++#define IOMMU_MAX_VADDR_LEN (BUS_SHIFT + BUS_BITS) ++#define IOMMU_MAX_VADDR BIT(IOMMU_MAX_VADDR_LEN) ++ ++#define MAX_NUM_OF_ADDRESS_SPACE(_iommu) \ ++ (ecap_smts((_iommu)->iommu.ecap) ? \ ++ IOMMU_MAX_VADDR : IOMMU_LM_MAX_VADDR) ++ ++#define DMAR_GSTS_EN_BITS (DMA_GCMD_TE | DMA_GCMD_EAFL | \ ++ DMA_GCMD_QIE | DMA_GCMD_IRE | \ ++ DMA_GCMD_CFI) ++#define DMAR_GCMD_PROTECTED (DMA_GCMD_TE | DMA_GCMD_SRTP | \ ++ DMA_GCMD_QIE) ++#define DMAR_GCMD_DIRECT (DMA_GCMD_SFL | DMA_GCMD_EAFL | \ ++ DMA_GCMD_WBF | DMA_GCMD_IRE | \ ++ DMA_GCMD_SIRTP | DMA_GCMD_CFI) ++ ++#define PKVM_IOMMU_WAIT_OP(offset, op, cond, sts) \ ++do { \ ++ while (1) { \ ++ (sts) = op(offset); \ ++ if (cond) \ ++ break; \ ++ cpu_relax(); \ ++ } \ ++} while (0) ++ ++#define IQ_DESC_BASE_PHYS(reg) ((reg) & ~0xfff) ++#define IQ_DESC_DW(reg) (((reg) >> 11) & 1) ++#define IQ_DESC_QS(reg) ((reg) & GENMASK_ULL(2, 0)) ++#define IQ_DESC_LEN(reg) (1 << (7 + IQ_DESC_QS(reg) + !IQ_DESC_DW(reg))) ++#define IQ_DESC_SHIFT(reg) (4 + IQ_DESC_DW(reg)) ++ ++#define QI_DESC_TYPE(qw) ((qw) & GENMASK_ULL(3, 0)) ++#define QI_DESC_CC_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_CC_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_CC_SID(qw) (((qw) & GENMASK_ULL(47, 32)) >> 32) ++ ++#define QI_DESC_PC_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_PC_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_PC_PASID(qw) (((qw) & GENMASK_ULL(51, 32)) >> 32) ++ ++#define QI_DESC_IOTLB_GRANU(qw) (((qw) & GENMASK_ULL(5, 4)) >> 4) ++#define QI_DESC_IOTLB_DID(qw) (((qw) & GENMASK_ULL(31, 16)) >> 16) ++#define QI_DESC_IOTLB_ADDR(qw) ((qw) & VTD_PAGE_MASK) ++#define QI_DESC_IOTLB_AM(qw) ((qw) & GENMASK_ULL(5, 0)) ++ ++#define pgt_to_pkvm_iommu(_pgt) container_of(_pgt, struct pkvm_iommu, pgt) ++ ++struct pasid_dir_entry { ++ u64 val; ++}; ++ ++struct pasid_entry { ++ u64 val[8]; ++}; ++ ++static inline void entry_set_bits(u64 *ptr, u64 mask, u64 bits) ++{ ++ u64 old; ++ ++ old = READ_ONCE(*ptr); ++ WRITE_ONCE(*ptr, (old & ~mask) | bits); ++} ++ ++static inline void context_sm_clear_dte(struct context_entry *ce) ++{ ++ entry_set_bits(&ce->lo, 1 << 2, 0); ++} ++ ++static inline bool context_lm_is_present(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->lo) & 1; ++} ++ ++static inline u8 context_lm_get_tt(struct context_entry *ce) ++{ ++ return (READ_ONCE(ce->lo) >> 2) & 3; ++} ++ ++static inline u64 context_lm_get_slptr(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->lo) & VTD_PAGE_MASK; ++} ++ ++static inline u8 context_lm_get_aw(struct context_entry *ce) ++{ ++ return READ_ONCE(ce->hi) & 0x7; ++} ++ ++static inline u16 context_lm_get_did(struct context_entry *ce) ++{ ++ return (READ_ONCE(ce->hi) >> 8) & 0xffff; ++} ++ ++static inline void context_lm_set_tt(struct context_entry *ce, u8 
value) ++{ ++ entry_set_bits(&ce->lo, 3 << 2, value << 2); ++} ++ ++static inline void context_lm_set_slptr(struct context_entry *ce, u64 value) ++{ ++ entry_set_bits(&ce->lo, VTD_PAGE_MASK, value); ++} ++ ++static inline void context_lm_set_aw(struct context_entry *ce, u8 value) ++{ ++ entry_set_bits(&ce->hi, 0x7, value); ++} ++ ++/* Get PRESENT bit of a PASID table entry. */ ++static inline bool pasid_pte_is_present(struct pasid_entry *pte) ++{ ++ return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; ++} ++ ++/* Get PGTT field of a PASID table entry */ ++static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) ++{ ++ return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); ++} ++ ++/* ++ * Interfaces for PASID table entry manipulation: ++ */ ++static inline void pasid_clear_entry(struct pasid_entry *pe) ++{ ++ WRITE_ONCE(pe->val[0], 0); ++ WRITE_ONCE(pe->val[1], 0); ++ WRITE_ONCE(pe->val[2], 0); ++ WRITE_ONCE(pe->val[3], 0); ++ WRITE_ONCE(pe->val[4], 0); ++ WRITE_ONCE(pe->val[5], 0); ++ WRITE_ONCE(pe->val[6], 0); ++ WRITE_ONCE(pe->val[7], 0); ++} ++ ++/* ++ * Get domain ID value of a scalable mode PASID entry. ++ */ ++static inline u16 ++pasid_get_domain_id(struct pasid_entry *pe) ++{ ++ return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); ++} ++ ++/* ++ * Get the FLPTPTR(First Level Page Table Pointer) field (Bit 140 ~ 191) ++ * of a scalable mode PASID entry. ++ */ ++static inline u64 ++pasid_get_flptr(struct pasid_entry *pe) ++{ ++ return (u64)(READ_ONCE(pe->val[2]) & VTD_PAGE_MASK); ++} ++ ++/* ++ * Get the First Level Paging Mode field (Bit 130~131) of a ++ * scalable mode PASID entry. ++ */ ++static inline u8 ++pasid_get_flpm(struct pasid_entry *pe) ++{ ++ return (u8)((READ_ONCE(pe->val[2]) & GENMASK_ULL(3, 2)) >> 2); ++} ++ ++/* ++ * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) ++ * of a scalable mode PASID entry. ++ */ ++static inline void ++pasid_set_slptr(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], VTD_PAGE_MASK, value); ++} ++ ++/* ++ * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID ++ * entry. ++ */ ++static inline void ++pasid_set_address_width(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); ++} ++ ++/* ++ * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) ++ * of a scalable mode PASID entry. ++ */ ++static inline void ++pasid_set_translation_type(struct pasid_entry *pe, u64 value) ++{ ++ entry_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); ++} ++ ++/* ++ * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID ++ * entry. ++ */ ++static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[1], 1 << 23, value << 23); ++} ++ ++/* ++ * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode ++ * PASID entry. ++ */ ++static inline void ++pasid_set_pgsnp(struct pasid_entry *pe) ++{ ++ entry_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); ++} ++ ++#define PASID_ENTRY_PGTT_FL_ONLY (1) ++#define PASID_ENTRY_PGTT_SL_ONLY (2) ++#define PASID_ENTRY_PGTT_NESTED (3) ++#define PASID_ENTRY_PGTT_PT (4) ++ ++/* ++ * Set the Second Stage Execute Enable field (Bit 5) of a scalable mode ++ * PASID entry. ++ */ ++static inline void pasid_set_ssee(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[0], 1 << 5, value << 5); ++} ++ ++/* ++ * Set the Second Stage Access/Dirty bit Enable field (Bit 9) of a scalable mode ++ * PASID entry. 
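++ * When enabled, the hardware may set the accessed/dirty bits in the
++ * second-stage paging entries referenced through this PASID entry.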
++ */ ++static inline void pasid_set_ssade(struct pasid_entry *pe, bool value) ++{ ++ entry_set_bits(&pe->val[0], 1 << 9, value << 9); ++} ++ ++static inline bool pasid_copy_entry(struct pasid_entry *to, struct pasid_entry *from) ++{ ++ bool updated = false; ++ int i; ++ ++ for (i = 0; i < 8; i++) { ++ u64 new = READ_ONCE(from->val[i]); ++ ++ if (READ_ONCE(to->val[i]) != new) { ++ WRITE_ONCE(to->val[i], new); ++ updated = true; ++ } ++ } ++ ++ return updated; ++} ++ ++extern void root_tbl_walk(struct pkvm_iommu *iommu); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c +new file mode 100644 +index 000000000000..1da2fca89e5d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.c +@@ -0,0 +1,106 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright(c) 2022 Intel Corporation. ++ * Copyright(c) 2023 Semihalf. ++ */ ++ ++#include ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "iommu_spgt.h" ++#include "ept.h" ++#include "bug.h" ++ ++static DEFINE_HASHTABLE(iommu_spgt_hasht, 8); ++static DECLARE_BITMAP(iommu_spgt_bitmap, PKVM_MAX_PDEV_NUM); ++static struct pkvm_iommu_spgt pkvm_iommu_spgt[PKVM_MAX_PDEV_NUM]; ++static pkvm_spinlock_t iommu_spgt_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_pgtable *pkvm_get_host_iommu_spgt(unsigned long root_gpa, bool coherency) ++{ ++ struct pkvm_iommu_spgt *spgt = NULL, *tmp; ++ unsigned long index; ++ int ret; ++ ++ pkvm_spin_lock(&iommu_spgt_lock); ++ ++ hash_for_each_possible(iommu_spgt_hasht, tmp, hnode, root_gpa) { ++ if (tmp->root_gpa == root_gpa) { ++ if (tmp->refcount > 0) { ++ spgt = tmp; ++ break; ++ } ++ } ++ } ++ ++ if (spgt) { ++ spgt->refcount++; ++ spgt->noncoherent_count += !coherency; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&spgt->pgt, ++ !spgt->noncoherent_count); ++ goto out; ++ } ++ ++ index = find_first_zero_bit(iommu_spgt_bitmap, PKVM_MAX_PDEV_NUM); ++ if (index < PKVM_MAX_PDEV_NUM) { ++ spgt = &pkvm_iommu_spgt[index]; ++ ++ ret = pkvm_pgtable_init(&spgt->pgt, ++ pkvm_shadow_sl_iommu_pgt_get_mm_ops(coherency), ++ &ept_ops, &pkvm_hyp->ept_cap, true); ++ if (ret) { ++ pkvm_err("%s: pgtable init failed err=%d\n", __func__, ret); ++ spgt = NULL; ++ goto out; ++ } ++ ++ __set_bit(index, iommu_spgt_bitmap); ++ spgt->root_gpa = root_gpa; ++ spgt->index = index; ++ spgt->refcount = 1; ++ spgt->noncoherent_count = !coherency; ++ hash_add(iommu_spgt_hasht, &spgt->hnode, root_gpa); ++ } ++out: ++ pkvm_spin_unlock(&iommu_spgt_lock); ++ ++ return spgt ? 
&spgt->pgt : NULL; ++} ++ ++void pkvm_put_host_iommu_spgt(struct pkvm_pgtable *pgt, bool coherency) ++{ ++ struct pkvm_iommu_spgt *spgt = NULL, *tmp; ++ int bkt; ++ ++ pkvm_spin_lock(&iommu_spgt_lock); ++ ++ hash_for_each(iommu_spgt_hasht, bkt, tmp, hnode) { ++ if (&tmp->pgt == pgt) { ++ spgt = tmp; ++ break; ++ } ++ } ++ PKVM_ASSERT(spgt); ++ PKVM_ASSERT(spgt->refcount > 0); ++ ++ if (--spgt->refcount > 0) { ++ spgt->noncoherent_count -= !coherency; ++ PKVM_ASSERT(spgt->noncoherent_count >= 0); ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&spgt->pgt, ++ !spgt->noncoherent_count); ++ goto out; ++ } ++ ++ hash_del(&spgt->hnode); ++ ++ __clear_bit(spgt->index, iommu_spgt_bitmap); ++ ++ pkvm_pgtable_destroy(&spgt->pgt, NULL); ++ ++ memset(spgt, 0, sizeof(struct pkvm_iommu_spgt)); ++ ++out: ++ pkvm_spin_unlock(&iommu_spgt_lock); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h +new file mode 100644 +index 000000000000..9fb4667318b3 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/iommu_spgt.h +@@ -0,0 +1,19 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright(c) 2022 Intel Corporation. ++ * Copyright(c) 2023 Semihalf. ++ */ ++ ++#include "pgtable.h" ++ ++struct pkvm_iommu_spgt { ++ int refcount; ++ int noncoherent_count; ++ struct hlist_node hnode; ++ unsigned long root_gpa; ++ unsigned long index; ++ struct pkvm_pgtable pgt; ++}; ++ ++struct pkvm_pgtable *pkvm_get_host_iommu_spgt(unsigned long root_gpa, bool coherency); ++void pkvm_put_host_iommu_spgt(struct pkvm_pgtable *spgt, bool coherency); +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/irq.c b/arch/x86/kvm/vmx/pkvm/hyp/irq.c +new file mode 100644 +index 000000000000..0580edb21313 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/irq.c +@@ -0,0 +1,60 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include "cpu.h" ++#include "pkvm_hyp.h" ++#include "debug.h" ++ ++void handle_noop(void) ++{ ++ pkvm_err("%s: unexpected exception\n", __func__); ++} ++ ++void handle_nmi(void) ++{ ++ int cpu_id = get_pcpu_id(); ++ struct pkvm_host_vcpu *pkvm_host_vcpu = ++ pkvm_hyp->host_vm.host_vcpus[cpu_id]; ++ struct vcpu_vmx *vmx = &pkvm_host_vcpu->vmx; ++ ++ if (!pkvm_host_vcpu || !vmx) ++ return; ++ ++ if (pkvm_host_vcpu->pending_nmi) { ++ pkvm_dbg("%s: CPU%d already has a pending NMI\n", ++ __func__, cpu_id); ++ return; ++ } ++ ++ /* load host vcpu vmcs for sure */ ++ vmcs_load(vmx->loaded_vmcs->vmcs); ++ ++ /* ++ * This NMI could happen either before executing ++ * the injection code or after. ++ * For the before case, should record a pending NMI. ++ * For the after case, if no NMI is injected in guest ++ * we also need to record a pending NMI. If NMI is ++ * injected already, it is not necessary to inject ++ * again but injecting it in the next round should also ++ * be fine. So simply record a pending NMI here. ++ */ ++ pkvm_host_vcpu->pending_nmi = true; ++ ++ pkvm_dbg("%s: CPU%d pending NMI\n", __func__, cpu_id); ++ ++ /* For case that when NMI happens the injection code is ++ * already executed, open the NMI window. For the case ++ * happens before, opening NMI window doesn't cause trouble. 
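++ * The resulting NMI-window VM exit gives pkvm another chance to
++ * inject the pending NMI recorded above.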
++ */ ++ _vmx_enable_nmi_window(vmx, false); ++ ++ /* switch if the current one is not host vcpu vmcs */ ++ if (pkvm_host_vcpu->current_vmcs && ++ (pkvm_host_vcpu->current_vmcs != vmx->loaded_vmcs->vmcs)) ++ vmcs_load(pkvm_host_vcpu->current_vmcs); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lapic.c b/arch/x86/kvm/vmx/pkvm/hyp/lapic.c +new file mode 100644 +index 000000000000..19bd45f2d394 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lapic.c +@@ -0,0 +1,222 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include "pkvm.h" ++#include "cpu.h" ++#include "memory.h" ++#include "mmu.h" ++#include "pgtable.h" ++#include "bug.h" ++#include "pkvm_hyp.h" ++ ++struct pkvm_lapic { ++ bool x2apic; ++ u32 apic_id; ++ unsigned long apic_base_phys; ++ void *apic_base_va; ++}; ++ ++static struct pkvm_lapic pkvm_lapic[CONFIG_NR_CPUS]; ++ ++#define APIC_BASE_PHYS_MASK GENMASK_ULL(get_max_physaddr_bits(), 12) ++ ++static u32 __pkvm_lapic_read(struct pkvm_lapic *lapic, u32 reg) ++{ ++ u64 val; ++ ++ if (lapic->x2apic) ++ pkvm_rdmsrl(APIC_BASE_MSR + (reg >> 4), val); ++ else ++ val = readl(lapic->apic_base_va + reg); ++ ++ return (u32)val; ++} ++ ++static u64 __pkvm_lapic_icr_read(struct pkvm_lapic *lapic) ++{ ++ u64 val; ++ ++ if (lapic->x2apic) ++ pkvm_rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); ++ else { ++ u64 icr2; ++ ++ icr2 = readl(lapic->apic_base_va + APIC_ICR2); ++ val = readl(lapic->apic_base_va + APIC_ICR); ++ val |= icr2 << 32; ++ } ++ ++ return val; ++} ++ ++static void __pkvm_wait_icr_idle(struct pkvm_lapic *lapic) ++{ ++ /* x2apic mode doesn't have delivery status bit */ ++ if (lapic->x2apic) ++ return; ++ ++ while (__pkvm_lapic_icr_read(lapic) & APIC_ICR_BUSY) ++ cpu_relax(); ++} ++ ++static void __pkvm_lapic_icr_write(struct pkvm_lapic *lapic, u32 low, u32 id) ++{ ++ if (lapic->x2apic) ++ pkvm_wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ++ low | ((u64)id << 32)); ++ else { ++ writel(id, lapic->apic_base_va + APIC_ICR2); ++ writel(low, lapic->apic_base_va + APIC_ICR); ++ __pkvm_wait_icr_idle(lapic); ++ } ++} ++ ++static int __pkvm_setup_lapic(struct pkvm_lapic *lapic, u64 apicbase) ++{ ++ /* Not allow lapic to be disabled as it will be used for kick */ ++ PKVM_ASSERT(apicbase & (X2APIC_ENABLE | XAPIC_ENABLE)); ++ ++ if (!(apicbase & X2APIC_ENABLE)) { ++ unsigned long base_phys = apicbase & APIC_BASE_PHYS_MASK; ++ void *vaddr = pkvm_iophys_to_virt(base_phys); ++ ++ if ((unsigned long)vaddr == INVALID_ADDR) ++ return -EINVAL; ++ ++ if ((lapic->apic_base_phys == base_phys) && ++ (lapic->apic_base_va == vaddr)) ++ goto done; ++ ++ /* unmap the previous MMIO mapping then map the new one */ ++ if (lapic->apic_base_va) { ++ pkvm_mmu_unmap((unsigned long)lapic->apic_base_va, ++ PAGE_SIZE); ++ lapic->apic_base_phys = 0; ++ lapic->apic_base_va = NULL; ++ } ++ ++ if (pkvm_mmu_map((unsigned long)vaddr, base_phys, PAGE_SIZE, ++ 0, PKVM_PAGE_IO_NOCACHE)) ++ return -ENOMEM; ++ ++ lapic->apic_base_phys = base_phys; ++ lapic->apic_base_va = vaddr; ++ lapic->x2apic = false; ++ } else ++ lapic->x2apic = true; ++done: ++ /* ++ * APIC_ID reg is writable for primary VM so it is ++ * possible for primary VM to change the APIC_ID. ++ * So pkvm should have a way to intercept the APIC_ID ++ * changing. For x2apic mode, this can be done through ++ * intercepting the APIC_ID msr write. ++ * ++ * TODO: handling the APIC_ID changing for xapic mode. 
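++ * The apic_id cached below is what pkvm_lapic_send_init() uses as the
++ * IPI destination; pkvm_x2apic_msr_write() asserts that primary VM
++ * writes do not change it.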
++ */ ++ lapic->apic_id = __pkvm_lapic_read(lapic, APIC_ID); ++ ++ return 0; ++} ++ ++static inline bool is_lapic_setup(struct pkvm_pcpu *pcpu) ++{ ++ return !!pcpu->lapic; ++} ++ ++int pkvm_setup_lapic(struct pkvm_pcpu *pcpu, int cpu) ++{ ++ struct pkvm_lapic *lapic = &pkvm_lapic[cpu]; ++ u64 apicbase; ++ ++ /* Nothing needs to be done if already setup */ ++ if (is_lapic_setup(pcpu)) ++ return 0; ++ ++ pkvm_rdmsrl(MSR_IA32_APICBASE, apicbase); ++ ++ pcpu->lapic = lapic; ++ ++ return __pkvm_setup_lapic(lapic, apicbase); ++} ++ ++void pkvm_apic_base_msr_write(struct kvm_vcpu *vcpu, u64 apicbase) ++{ ++ struct pkvm_pcpu *pcpu = to_pkvm_hvcpu(vcpu)->pcpu; ++ struct pkvm_lapic *lapic = pcpu->lapic; ++ ++ /* ++ * MSR is accessed before the init finalizing phase ++ * that pkvm has not setup lapic yet. In this case, let the ++ * wrmsr directly go to the hardware. ++ */ ++ if (!is_lapic_setup(pcpu)) { ++ pkvm_wrmsrl(MSR_IA32_APICBASE, apicbase); ++ return; ++ } ++ ++ /* A fatal error when is running at runtime */ ++ PKVM_ASSERT(__pkvm_setup_lapic(lapic, apicbase) == 0); ++ ++ pkvm_wrmsrl(MSR_IA32_APICBASE, apicbase); ++} ++ ++int pkvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 val) ++{ ++ struct pkvm_pcpu *pcpu = to_pkvm_hvcpu(vcpu)->pcpu; ++ struct pkvm_lapic *lapic = pcpu->lapic; ++ u32 reg = (msr - APIC_BASE_MSR) << 4; ++ ++ /* ++ * MSR is accessed before the init finalizing phase ++ * that pkvm has not setup lapic yet. In this case, let the ++ * wrmsr directly go to the hardware. ++ */ ++ if (!is_lapic_setup(pcpu)) { ++ pkvm_wrmsrl(msr, val); ++ return 0; ++ } ++ ++ /* Ensure lapic is in x2apic mode */ ++ if (!lapic->x2apic) ++ return -EINVAL; ++ ++ switch (reg) { ++ case APIC_ID: ++ /* ++ * Not allow primary VM to modify the lapic ID which ++ * can result in the failure of pkvm to kick. ++ */ ++ PKVM_ASSERT(lapic->apic_id == (u32)val); ++ break; ++ default: ++ break; ++ } ++ ++ pkvm_wrmsrl(msr, val); ++ return 0; ++} ++ ++void pkvm_lapic_send_init(struct pkvm_pcpu *dst_pcpu) ++{ ++ u32 icrlow = APIC_INT_ASSERT | APIC_DM_INIT; ++ int cpu_id = get_pcpu_id(); ++ struct pkvm_pcpu *pcpu = pkvm_hyp->pcpus[cpu_id]; ++ struct pkvm_lapic *dst_lapic = dst_pcpu->lapic; ++ ++ /* Not to send INIT to self */ ++ if (pcpu == dst_pcpu) ++ return; ++ /* ++ * If the lapic is not setup yet, which is during the finalizing ++ * phase, cannot send INIT. Also not necessary to use INIT for tlb ++ * shoot down as when isolating some memory from the primary VM in ++ * the finalizing phase, as we can flush ept tlbs at the end of ++ * finalizing for each CPU. 
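++ * __pkvm_lapic_icr_write() below handles both xAPIC (ICR/ICR2 MMIO)
++ * and x2APIC (ICR MSR) destinations.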
++ */ ++ if (unlikely(!is_lapic_setup(pcpu) || !is_lapic_setup(dst_pcpu))) ++ return; ++ ++ __pkvm_lapic_icr_write(pcpu->lapic, icrlow, dst_lapic->apic_id); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lapic.h b/arch/x86/kvm/vmx/pkvm/hyp/lapic.h +new file mode 100644 +index 000000000000..d4513afe5c80 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lapic.h +@@ -0,0 +1,12 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_LAPIC_H_ ++#define _PKVM_LAPIC_H_ ++ ++int pkvm_setup_lapic(struct pkvm_pcpu *pcpu, int cpu); ++void pkvm_apic_base_msr_write(struct kvm_vcpu *vcpu, u64 apicbase); ++int pkvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 val); ++void pkvm_lapic_send_init(struct pkvm_pcpu *dst_pcpu); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c b/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c +new file mode 100644 +index 000000000000..67f295a4668b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/list_debug.c +@@ -0,0 +1,16 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++ ++bool __list_add_valid(struct list_head *new, struct list_head *prev, ++ struct list_head *next) ++{ ++ return true; ++} ++ ++bool __list_del_entry_valid(struct list_head *entry) ++{ ++ return true; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S +new file mode 100644 +index 000000000000..b976f646d352 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/memcpy_64.S +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* Copyright 2002 Andi Kleen */ ++ ++#include ++ ++/* ++ * memcpy - Copy a memory block. ++ * ++ * Input: ++ * rdi destination ++ * rsi source ++ * rdx count ++ * ++ * Output: ++ * rax original destination ++ * ++ * This is enhanced fast string memcpy. It is faster and ++ * simpler than old memcpy. ++ */ ++ ++SYM_FUNC_START(memcpy) ++ movq %rdi, %rax ++ movq %rdx, %rcx ++ rep movsb ++ RET ++SYM_FUNC_END(memcpy) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S +new file mode 100644 +index 000000000000..8c30d2f5f925 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/memset_64.S +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright 2002 Andi Kleen, SuSE Labs */ ++ ++#include ++ ++/* ++ * ISO C memset - set a memory block to a byte value. This function uses ++ * enhanced rep stosb to override the fast string function. ++ * The code is simpler and shorter than the fast string function as well. 
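++ * Clobbers rdi and rcx; the original destination pointer is saved in
++ * r9 and returned in rax.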
++ * ++ * rdi destination ++ * rsi value (char) ++ * rdx count (bytes) ++ * ++ * rax original destination ++ */ ++SYM_FUNC_START(memset) ++ movq %rdi,%r9 ++ movb %sil,%al ++ movq %rdx,%rcx ++ rep stosb ++ movq %r9,%rax ++ RET ++SYM_FUNC_END(memset) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S b/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S +new file mode 100644 +index 000000000000..7758ec40fe7c +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/lib/retpoline.S +@@ -0,0 +1,115 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ .section .text.__x86.indirect_thunk ++ ++.macro RETPOLINE reg ++ ANNOTATE_INTRA_FUNCTION_CALL ++ call .Ldo_rop_\@ ++.Lspec_trap_\@: ++ UNWIND_HINT_EMPTY ++ pause ++ lfence ++ jmp .Lspec_trap_\@ ++.Ldo_rop_\@: ++ mov %\reg, (%_ASM_SP) ++ UNWIND_HINT_FUNC ++ RET ++.endm ++ ++.macro THUNK reg ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ++ UNWIND_HINT_EMPTY ++ ++ RETPOLINE \reg ++ ++.endm ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_CODE_START(__x86_indirect_thunk_array) ++ ++#define GEN(reg) THUNK reg ++#include ++#undef GEN ++ ++ .align RETPOLINE_THUNK_SIZE ++SYM_CODE_END(__x86_indirect_thunk_array) ++ ++/* ++ * This function name is magical and is used by -mfunction-return=thunk-extern ++ * for the compiler to generate JMPs to it. ++ */ ++#ifdef CONFIG_RETHUNK ++ ++ .section .text.__x86.return_thunk ++ ++/* ++ * Safety details here pertain to the AMD Zen{1,2} microarchitecture: ++ * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for ++ * alignment within the BTB. ++ * 2) The instruction at zen_untrain_ret must contain, and not ++ * end with, the 0xc3 byte of the RET. ++ * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread ++ * from re-poisoning the BTB prediction. ++ */ ++ .align 64 ++ .skip 63, 0xcc ++SYM_FUNC_START_NOALIGN(zen_untrain_ret); ++ ++ /* ++ * As executed from zen_untrain_ret, this is: ++ * ++ * TEST $0xcc, %bl ++ * LFENCE ++ * JMP __x86_return_thunk ++ * ++ * Executing the TEST instruction has a side effect of evicting any BTB ++ * prediction (potentially attacker controlled) attached to the RET, as ++ * __x86_return_thunk + 1 isn't an instruction boundary at the moment. ++ */ ++ .byte 0xf6 ++ ++ /* ++ * As executed from __x86_return_thunk, this is a plain RET. ++ * ++ * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8. ++ * ++ * We subsequently jump backwards and architecturally execute the RET. ++ * This creates a correct BTB prediction (type=ret), but in the ++ * meantime we suffer Straight Line Speculation (because the type was ++ * no branch) which is halted by the INT3. ++ * ++ * With SMT enabled and STIBP active, a sibling thread cannot poison ++ * RET's prediction to a type of its choice, but can evict the ++ * prediction due to competitive sharing. If the prediction is ++ * evicted, __x86_return_thunk will suffer Straight Line Speculation ++ * which will be contained safely by the INT3. ++ */ ++SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) ++ ret ++ int3 ++SYM_CODE_END(__x86_return_thunk) ++ ++ /* ++ * Ensure the TEST decoding / BTB invalidation is complete. ++ */ ++ lfence ++ ++ /* ++ * Jump back and execute the RET in the middle of the TEST instruction. ++ * INT3 is for SLS protection. 
++ */ ++ jmp __x86_return_thunk ++ int3 ++SYM_FUNC_END(zen_untrain_ret) ++ ++#endif /* CONFIG_RETHUNK */ +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c +new file mode 100644 +index 000000000000..5e6ee262fbe8 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.c +@@ -0,0 +1,1013 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "mem_protect.h" ++#include "pgtable.h" ++#include "ept.h" ++ ++struct check_walk_data { ++ int nstate; ++ enum pkvm_page_state *desired; ++}; ++ ++enum pkvm_component_id { ++ PKVM_ID_HOST, ++ PKVM_ID_HYP, ++ PKVM_ID_GUEST, ++}; ++ ++struct pkvm_mem_trans_desc { ++ enum pkvm_component_id id; ++ union { ++ struct { ++ struct pkvm_pgtable *pgt_override; ++ u64 addr; ++ } host; ++ ++ struct { ++ u64 addr; ++ } hyp; ++ ++ struct { ++ struct pkvm_pgtable *pgt; ++ u64 addr; ++ u64 phys; ++ } guest; ++ }; ++ u64 prot; ++}; ++ ++struct pkvm_mem_transition { ++ u64 size; ++ struct pkvm_mem_trans_desc initiator; ++ struct pkvm_mem_trans_desc completer; ++}; ++ ++static void guest_pgstate_pgt_lock(struct pkvm_pgtable *pgt) ++{ ++ pkvm_spin_lock(&pgstate_pgt_to_shadow_vm(pgt)->lock); ++} ++ ++static void guest_pgstate_pgt_unlock(struct pkvm_pgtable *pgt) ++{ ++ pkvm_spin_unlock(&pgstate_pgt_to_shadow_vm(pgt)->lock); ++} ++ ++static u64 pkvm_init_invalid_leaf_owner(pkvm_id owner_id) ++{ ++ /* the page owned by others also means NOPAGE in page state */ ++ return FIELD_PREP(PKVM_INVALID_PTE_OWNER_MASK, owner_id) | ++ FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, PKVM_NOPAGE); ++} ++ ++static int host_ept_set_owner_locked(struct pkvm_pgtable *pgt_override, phys_addr_t addr, ++ u64 size, pkvm_id owner_id) ++{ ++ u64 annotation = pkvm_init_invalid_leaf_owner(owner_id); ++ ++ /* ++ * The memory [addr, addr + size) will be unmapped from host ept. At the ++ * same time, the annotation with a NOPAGE flag will be put in the ++ * invalid pte that has been unmapped. And the information shows that ++ * the page has been used by some guest and its id can be read from ++ * annotation. Also when later these pages are back to host, the annotation ++ * will be helpful to check the right page transition. ++ */ ++ return pkvm_pgtable_annotate(pgt_override ? pgt_override : pkvm_hyp->host_vm.ept, ++ addr, size, annotation); ++} ++ ++static int host_ept_create_idmap_locked(struct pkvm_pgtable *pgt_override, u64 addr, ++ u64 size, int pgsz_mask, u64 prot) ++{ ++ return pkvm_pgtable_map(pgt_override ? 
pgt_override : pkvm_hyp->host_vm.ept, ++ addr, addr, size, pgsz_mask, prot, NULL); ++} ++ ++static int ++__check_page_state_walker(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct check_walk_data *data = arg; ++ int i; ++ ++ for (i = 0; i < data->nstate; i++) ++ if (pkvm_getstate(*(u64 *)ptep) == data->desired[i]) ++ return 0; ++ ++ return -EPERM; ++} ++ ++static int check_page_state_range(struct pkvm_pgtable *pgt, u64 addr, u64 size, ++ enum pkvm_page_state *states, int nstate) ++{ ++ struct check_walk_data data = { ++ .nstate = nstate, ++ .desired = states, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = __check_page_state_walker, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ .arg = &data, ++ }; ++ ++ return pgtable_walk(pgt, addr, size, true, &walker); ++} ++ ++static int __host_check_page_state_range(struct pkvm_pgtable *pgt_override, u64 addr, ++ u64 size, enum pkvm_page_state state) ++{ ++ struct pkvm_pgtable *host_ept = pgt_override ? pgt_override : pkvm_hyp->host_vm.ept; ++ ++ return check_page_state_range(host_ept, addr, size, &state, 1); ++} ++ ++static int __guest_check_page_state_range(struct pkvm_pgtable *pgt, ++ u64 addr, u64 size, ++ enum pkvm_page_state state) ++{ ++ return check_page_state_range(pgt, addr, size, &state, 1); ++} ++ ++static pkvm_id pkvm_guest_id(struct pkvm_pgtable *pgt) ++{ ++ /* Using the shadow_vm_handle as guest_id. */ ++ return pgstate_pgt_to_shadow_vm(pgt)->shadow_vm_handle; ++} ++ ++static pkvm_id __pkvm_owner_id(const struct pkvm_mem_trans_desc *desc) ++{ ++ switch (desc->id) { ++ case PKVM_ID_HYP: ++ return pkvm_hyp_id; ++ case PKVM_ID_GUEST: ++ return pkvm_guest_id(desc->guest.pgt); ++ default: ++ WARN_ON(1); ++ return -1; ++ } ++} ++ ++static pkvm_id initiator_owner_id(const struct pkvm_mem_transition *tx) ++{ ++ return __pkvm_owner_id(&tx->initiator); ++} ++ ++static pkvm_id completer_owner_id(const struct pkvm_mem_transition *tx) ++{ ++ return __pkvm_owner_id(&tx->completer); ++} ++ ++static int host_request_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int guest_request_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ enum pkvm_page_state states[] = { PKVM_PAGE_OWNED, ++ PKVM_PAGE_SHARED_OWNED, ++ }; ++ ++ /* ++ * When destroying vm, there may be multiple page states in the guest ++ * pgstate ept. In such case, both page states are ok to be reclaimed ++ * back by host. ++ */ ++ return check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, states, ARRAY_SIZE(states)); ++} ++ ++static int host_ack_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ enum pkvm_page_state states[] = { PKVM_NOPAGE, ++ PKVM_PAGE_SHARED_BORROWED, ++ }; ++ struct pkvm_pgtable *host_ept = tx->completer.host.pgt_override ? ++ tx->completer.host.pgt_override : ++ pkvm_hyp->host_vm.ept; ++ ++ /* Same as guest_request_donation. 
*/ ++ return check_page_state_range(host_ept, addr, size, states, ARRAY_SIZE(states)); ++} ++ ++static int guest_ack_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_NOPAGE); ++} ++ ++static int check_donation(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_donation(const struct pkvm_mem_transition *tx) ++{ ++ pkvm_id owner_id = completer_owner_id(tx); ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return host_ept_set_owner_locked(tx->initiator.host.pgt_override, addr, size, owner_id); ++} ++ ++static int guest_initiate_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ ++ return pkvm_pgtable_unmap_safe(tx->initiator.guest.pgt, addr, phys, size, NULL); ++} ++ ++static int host_complete_donation(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->completer.prot, PKVM_PAGE_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->completer.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_complete_donation(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ u64 phys = tx->completer.guest.phys; ++ u64 prot = tx->completer.prot; ++ ++ prot = pkvm_mkstate(prot, PKVM_PAGE_OWNED); ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int __do_donate(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_donation(tx); ++ break; ++ case PKVM_ID_HYP: ++ ret = 0; ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_donation(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_donate - the page owner transfer ownership to another component. ++ * ++ * Initiator: OWNED => NO_PAGE ++ * Completer: NO_APGE => OWNED ++ * ++ * The special component is pkvm_hyp. Since pkvm_hyp can access all the ++ * memory, nothing needs to be done if the page owner is transferred to hyp or ++ * hyp transfers the ownership to other entities. 
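++ *
++ * check_donation() validates the page state on both ends before
++ * __do_donate() touches any mapping, so a failed check leaves both
++ * page tables unchanged.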
++ */ ++static int do_donate(const struct pkvm_mem_transition *donation) ++{ ++ int ret; ++ ++ ret = check_donation(donation); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_donate(donation)); ++} ++ ++int __pkvm_host_donate_hyp(u64 hpa, u64 size) ++{ ++ int ret; ++ u64 hyp_addr = (u64)__pkvm_va(hpa); ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HYP, ++ .hyp = { ++ .addr = hyp_addr, ++ }, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_hyp_donate_host(u64 hpa, u64 size) ++{ ++ int ret; ++ u64 hyp_addr = (u64)__pkvm_va(hpa); ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HYP, ++ .hyp = { ++ .addr = hyp_addr, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_host_donate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++/* ++ * Fastpath interface will use the host EPT instance without doing tlbflushing ++ * to have a better performance. It is usually used in the scenario that caller ++ * needs to change a bunch of pages' state without having the TLB flushing ++ * overhead in the each iteration, but caller still needs to do TLB flushing ++ * after completing all the iterations. 
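++ *
++ * The only difference from __pkvm_host_donate_guest() is that the
++ * initiator overrides the host EPT with the ept_notlbflush instance.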
++ */ ++int __pkvm_host_donate_guest_fastpath(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .pgt_override = pkvm_hyp->host_vm.ept_notlbflush, ++ .addr = hpa, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++int __pkvm_host_undonate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ int ret; ++ struct pkvm_mem_transition donation = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .addr = gpa, ++ .phys = hpa, ++ .pgt = guest_pgt, ++ }, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_donate(&donation); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int host_request_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int guest_request_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, PKVM_PAGE_OWNED); ++} ++ ++static int host_ack_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->completer.host.pgt_override, ++ addr, size, PKVM_NOPAGE); ++} ++ ++static int guest_ack_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_NOPAGE); ++} ++ ++static int check_share(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_SHARED_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->initiator.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_initiate_share(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->initiator.guest.pgt; ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_SHARED_OWNED); ++ ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int host_complete_share(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->completer.prot, PKVM_PAGE_SHARED_BORROWED); ++ ++ return 
host_ept_create_idmap_locked(tx->completer.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_complete_share(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ u64 phys = tx->completer.guest.phys; ++ u64 prot = tx->completer.prot; ++ ++ prot = pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED); ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int __do_share(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_share(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_share(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_share() - The page owner grants access to another component with a given ++ * set of permissions. ++ * ++ * Initiator: OWNED => SHARED_OWNED ++ * Completer: NOPAGE => SHARED_BORROWED ++ */ ++static int do_share(const struct pkvm_mem_transition *share) ++{ ++ int ret; ++ ++ ret = check_share(share); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_share(share)); ++} ++ ++int __pkvm_host_share_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size, u64 prot) ++{ ++ int ret; ++ struct pkvm_mem_transition share = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = prot, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_share(&share); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int __pkvm_guest_share_host_page(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 hpa, u64 guest_prot) ++{ ++ struct pkvm_mem_transition share = { ++ .size = PAGE_SIZE, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = guest_prot, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ }; ++ ++ return do_share(&share); ++} ++ ++int __pkvm_guest_share_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ unsigned long hpa; ++ u64 prot; ++ int ret = 0; ++ ++ if (!PAGE_ALIGNED(size)) ++ return -EINVAL; ++ ++ guest_pgstate_pgt_lock(guest_pgt); ++ host_ept_lock(); ++ ++ while (size) { ++ pkvm_pgtable_lookup(guest_pgt, gpa, &hpa, &prot, NULL); ++ if (hpa == INVALID_ADDR) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = __pkvm_guest_share_host_page(guest_pgt, gpa, hpa, prot); ++ if (ret) ++ break; ++ ++ size -= PAGE_SIZE; ++ gpa += PAGE_SIZE; ++ } ++ ++ host_ept_unlock(); ++ guest_pgstate_pgt_unlock(guest_pgt); ++ ++ ++ return ret; ++} ++ ++static int host_request_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->initiator.host.pgt_override, addr, ++ size, PKVM_PAGE_SHARED_OWNED); ++} ++ ++static int guest_request_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->initiator.guest.pgt, ++ addr, size, 
PKVM_PAGE_SHARED_OWNED); ++} ++ ++static int host_ack_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ ++ return __host_check_page_state_range(tx->completer.host.pgt_override, addr, ++ size, PKVM_PAGE_SHARED_BORROWED); ++} ++ ++static int guest_ack_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.guest.addr; ++ u64 size = tx->size; ++ ++ return __guest_check_page_state_range(tx->completer.guest.pgt, addr, ++ size, PKVM_PAGE_SHARED_BORROWED); ++} ++ ++int check_unshare(const struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_request_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_request_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_ack_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_ack_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int host_initiate_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->initiator.host.addr; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_OWNED); ++ ++ return host_ept_create_idmap_locked(tx->initiator.host.pgt_override, addr, size, 0, prot); ++} ++ ++static int guest_initiate_unshare(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->initiator.guest.pgt; ++ u64 addr = tx->initiator.guest.addr; ++ u64 phys = tx->initiator.guest.phys; ++ u64 size = tx->size; ++ u64 prot = pkvm_mkstate(tx->initiator.prot, PKVM_PAGE_OWNED); ++ ++ return pkvm_pgtable_map(pgt, addr, phys, size, 0, prot, NULL); ++} ++ ++static int host_complete_unshare(const struct pkvm_mem_transition *tx) ++{ ++ u64 addr = tx->completer.host.addr; ++ u64 size = tx->size; ++ u64 owner_id = initiator_owner_id(tx); ++ ++ return host_ept_set_owner_locked(tx->completer.host.pgt_override, addr, size, owner_id); ++} ++ ++static int guest_complete_unshare(const struct pkvm_mem_transition *tx) ++{ ++ struct pkvm_pgtable *pgt = tx->completer.guest.pgt; ++ u64 addr = tx->completer.guest.addr; ++ u64 phys = tx->completer.guest.phys; ++ u64 size = tx->size; ++ ++ return pkvm_pgtable_unmap_safe(pgt, addr, phys, size, NULL); ++} ++ ++static int __do_unshare(struct pkvm_mem_transition *tx) ++{ ++ int ret; ++ ++ switch (tx->initiator.id) { ++ case PKVM_ID_HOST: ++ ret = host_initiate_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_initiate_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ return ret; ++ ++ switch (tx->completer.id) { ++ case PKVM_ID_HOST: ++ ret = host_complete_unshare(tx); ++ break; ++ case PKVM_ID_GUEST: ++ ret = guest_complete_unshare(tx); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++/* ++ * do_unshare() - The page owner takes back the page access for another ++ * component. 
++ * ++ * Initiator: SHARED_OWNED => OWNED ++ * Completer: SHARED_BORROWED => NOPAGE ++ */ ++int do_unshare(struct pkvm_mem_transition *share) ++{ ++ int ret; ++ ++ ret = check_unshare(share); ++ if (ret) ++ return ret; ++ ++ return WARN_ON(__do_unshare(share)); ++} ++ ++int __pkvm_host_unshare_guest(u64 hpa, struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ int ret; ++ struct pkvm_mem_transition share = { ++ .size = size, ++ .initiator = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ .prot = HOST_EPT_DEF_MEM_PROT, ++ }, ++ .completer = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ }, ++ }; ++ ++ host_ept_lock(); ++ ++ ret = do_unshare(&share); ++ ++ host_ept_unlock(); ++ ++ return ret; ++} ++ ++static int __pkvm_guest_unshare_host_page(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 hpa, u64 guest_prot) ++{ ++ struct pkvm_mem_transition share = { ++ .size = PAGE_SIZE, ++ .initiator = { ++ .id = PKVM_ID_GUEST, ++ .guest = { ++ .pgt = guest_pgt, ++ .addr = gpa, ++ .phys = hpa, ++ }, ++ .prot = guest_prot, ++ }, ++ .completer = { ++ .id = PKVM_ID_HOST, ++ .host = { ++ .addr = hpa, ++ }, ++ }, ++ }; ++ ++ return do_unshare(&share); ++} ++ ++int __pkvm_guest_unshare_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size) ++{ ++ unsigned long hpa; ++ u64 prot; ++ int ret = 0; ++ ++ guest_pgstate_pgt_lock(guest_pgt); ++ host_ept_lock(); ++ ++ while (size) { ++ pkvm_pgtable_lookup(guest_pgt, gpa, &hpa, &prot, NULL); ++ if (hpa == INVALID_ADDR) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = __pkvm_guest_unshare_host_page(guest_pgt, gpa, hpa, prot); ++ if (ret) ++ break; ++ ++ size -= PAGE_SIZE; ++ gpa += PAGE_SIZE; ++ } ++ ++ host_ept_unlock(); ++ guest_pgstate_pgt_unlock(guest_pgt); ++ ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h +new file mode 100644 +index 000000000000..f71c55c46d3a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mem_protect.h +@@ -0,0 +1,205 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_MEM_PROTECT_H__ ++#define __PKVM_MEM_PROTECT_H__ ++ ++/* ++ * enum pkvm_pgtable_prot - The ignored bits in page-table. ++ * pkvm will use these ignored bits as software bits to ++ * identify the page status. ++ */ ++enum pkvm_pgtable_prot { ++ PKVM_PGTABLE_PROT_SW0 = BIT(56), ++ PKVM_PGTABLE_PROT_SW1 = BIT(57), ++}; ++ ++/* ++ * Using the ignored bits in page-table as SW bits. ++ * SW bits 0-1 are used to track the memory ownership state of each page: ++ * 00: The page has no mapping in page table (also invalid pte). And under ++ * this page state, host ept is using the pte ignored bits to record owner_id. ++ * 01: The page is owned exclusively by the page-table owner. ++ * 10: The page is owned by the page-table owner, but is shared ++ * with another entity. ++ * 11: The page is shared with, but not owned by the page-table owner. 
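++ *
++ * These encodings correspond to PKVM_NOPAGE, PKVM_PAGE_OWNED,
++ * PKVM_PAGE_SHARED_OWNED and PKVM_PAGE_SHARED_BORROWED in
++ * enum pkvm_page_state below.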
++ */
++enum pkvm_page_state {
++ PKVM_NOPAGE = 0ULL,
++ PKVM_PAGE_OWNED = PKVM_PGTABLE_PROT_SW0,
++ PKVM_PAGE_SHARED_OWNED = PKVM_PGTABLE_PROT_SW1,
++ PKVM_PAGE_SHARED_BORROWED = PKVM_PGTABLE_PROT_SW0 |
++ PKVM_PGTABLE_PROT_SW1,
++};
++
++#define PKVM_PAGE_STATE_PROT_MASK (PKVM_PGTABLE_PROT_SW0 | PKVM_PGTABLE_PROT_SW1)
++/* use 20 bits[12~31] - no conflict w/ low 12 bits pte prot */
++#define PKVM_INVALID_PTE_OWNER_MASK GENMASK(31, 12)
++
++static inline u64 pkvm_mkstate(u64 prot, enum pkvm_page_state state)
++{
++ return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
++}
++
++static inline enum pkvm_page_state pkvm_getstate(u64 pte)
++{
++ return pte & PKVM_PAGE_STATE_PROT_MASK;
++}
++
++typedef u32 pkvm_id;
++static const pkvm_id pkvm_hyp_id = 0;
++
++/*
++ * __pkvm_host_donate_hyp() - Donate pages from host to hyp; afterwards the
++ * host cannot access the donated pages.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @size: The size of memory to be donated.
++ *
++ * The range of pages [hpa, hpa + size) is donated from host to hyp: the pages
++ * are unmapped from the host ept and the page owner is set to hyp_id in the
++ * host ept pte. Nothing needs to be done for the hyp mmu, as the hyp mmu can
++ * access all memory by default, but modifying the host ept is necessary
++ * because a page used by pkvm is private and can't be accessed by the host.
++ */
++int __pkvm_host_donate_hyp(u64 hpa, u64 size);
++
++/*
++ * __pkvm_hyp_donate_host() - Donate pages from hyp to host, so the host can
++ * access these pages again.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @size: The size of memory to be donated.
++ *
++ * The range of pages [hpa, hpa + size) is donated from hyp to host: a mapping
++ * for these pages is created in the host ept, and nothing is done for the hyp
++ * mmu. This is paired with __pkvm_host_donate_hyp() and is how the host
++ * reclaims such pages.
++ */
++int __pkvm_hyp_donate_host(u64 hpa, u64 size);
++
++/*
++ * __pkvm_host_share_guest() - Share pages between host and guest. The host
++ * still owns the pages and the guest gets temporary access to them.
++ *
++ * @hpa: Start hpa of the pages being shared, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa that will be used for mapping into the guest ept.
++ * @size: The size of pages to be shared.
++ * @prot: The prot that will be used for creating the mapping in the guest ept.
++ *
++ * For the range of pages [hpa, hpa + size), the page state in the host ept is
++ * changed from PAGE_OWNED to PAGE_SHARED_OWNED. A mapping from gpa to hpa is
++ * created in the guest ept, using @prot together with PAGE_SHARED_BORROWED.
++ */
++int __pkvm_host_share_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_unshare_guest() - Host unshares pages that were previously
++ * shared with the guest. The guest will no longer be able to access them.
++ *
++ * @hpa: Start hpa of the shared pages, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa at which the shared pages are mapped in the guest ept.
++ * @size: The size of pages to be unshared.
++ *
++ * Unmap the range [gpa, gpa + size) from the guest ept pagetable, and change
++ * the page state from PAGE_SHARED_OWNED to PAGE_OWNED in the host ept.
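++ *
++ * Hypothetical usage sketch (hpa, gpa and guest_pgt are placeholder values,
++ * not taken from this patch), undoing a one-page __pkvm_host_share_guest():
++ *
++ *  if (__pkvm_host_unshare_guest(hpa, guest_pgt, gpa, PAGE_SIZE))
++ *      pkvm_err("unshare failed, page is still shared with the guest");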
++ */
++int __pkvm_host_unshare_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++
++/*
++ * __pkvm_host_donate_guest() - Host donates pages to the guest. Afterwards the
++ * host can't access these pages and the guest can.
++ *
++ * @hpa: Start hpa of the pages being donated, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa at which the donated pages will be mapped in the guest ept.
++ * @size: The size of pages being donated.
++ * @prot: The prot that will be used for creating the mapping in the guest ept.
++ *
++ * The range of pages [hpa, hpa + size) is donated from host to guest: the
++ * pages are unmapped from the host ept and the page owner is set to guest_id
++ * in the host ept pte. The guest_id is equal to the vm's shadow_handle+1. At
++ * the same time, a mapping gpa -> hpa of @size is created in the guest ept
++ * with @prot.
++ */
++int __pkvm_host_donate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_donate_guest_fastpath() - Similar to __pkvm_host_donate_guest() but
++ * uses the fastpath that sets an annotation in the host EPT to donate a page.
++ * The fastpath does not flush the TLB when it unmaps from the host EPT. It is
++ * meant for callers that donate a batch of pages and issue a single TLB flush
++ * afterwards, which improves performance. The caller must guarantee that
++ * deferring the TLB flush until after the donations does not open a window in
++ * which the host can steal data from the donated pages.
++ */
++int __pkvm_host_donate_guest_fastpath(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size, u64 prot);
++
++/*
++ * __pkvm_host_undonate_guest() - Host reclaims pages previously donated to the
++ * guest. Afterwards the guest can't access these pages and the host can.
++ *
++ * @hpa: Start hpa of the donated pages, must be contiguous.
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa of the donated pages that will be unmapped from the guest ept.
++ * @size: The size of pages to be reclaimed.
++ *
++ * The range of pages [hpa, hpa + size) is given back from guest to host: the
++ * pages [gpa, gpa + size) are unmapped from the guest ept and, at the same
++ * time, an identity mapping for hpa is created in the host ept.
++ */
++int __pkvm_host_undonate_guest(u64 hpa, struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++/*
++ * __pkvm_guest_share_host() - Guest shares pages with the host. The guest
++ * still owns the pages and the host gets temporary access to them.
++ *
++ * @guest_pgt: The guest ept pagetable.
++ * @gpa: Start gpa of the pages being shared, must be contiguous.
++ * @size: The size of pages to be shared, should be PAGE_ALIGNED.
++ *
++ * There is no hpa parameter because the caller does not know it; the hpa is
++ * found by looking it up in the guest ept.
++ *
++ * Currently the function shares one PAGE at a time: if the size is larger than
++ * PAGE_SIZE, the range is split into PAGE_SIZE chunks that are shared in a
++ * loop.
++ *
++ * For the range of pages [gpa, gpa + size), the page state in the guest ept is
++ * changed from PAGE_OWNED to PAGE_SHARED_OWNED. A mapping for the corresponding
++ * hpa is created in the host ept with page state PAGE_SHARED_BORROWED.
++ */
++int __pkvm_guest_share_host(struct pkvm_pgtable *guest_pgt,
++ u64 gpa, u64 size);
++
++/*
++ * __pkvm_guest_unshare_host() - Guest reclaims pages previously shared with the host.
++ * Then host can't access these pages and guest still ownes it. ++ * ++ * @guest_pgt: The guest ept pagetable. ++ * @gpa: Start gpa of being unshared pages, must be continuous. ++ * @size: The size of pages to be unshared, should be PAGE_ALIGNED. ++ * ++ * The parameter does not have hpa, as the caller does not know it. The hpa ++ * depends on looking up the guest ept to get it. ++ * ++ * Now the function will unshare one PAGE at a time. If the size is larger than ++ * PAGE_SIZE, it will split it into multiple PAGE_SIZE pages and unshare them ++ * using a loop. ++ * ++ * A range of pages [gpa, gpa + size) in guest ept that its page state will be ++ * modified from PAGE_SHARED_OWNED to PAGE_OWNED. The mapping for these ++ * pages in host ept will be unmapped and the owner_id will be set to guest_id. ++ */ ++int __pkvm_guest_unshare_host(struct pkvm_pgtable *guest_pgt, ++ u64 gpa, u64 size); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/memory.c b/arch/x86/kvm/vmx/pkvm/hyp/memory.c +new file mode 100644 +index 000000000000..94e458cf8d1d +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/memory.c +@@ -0,0 +1,363 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include ++#include "memory.h" ++#include "pgtable.h" ++#include "pkvm_hyp.h" ++#include "cpu.h" ++ ++unsigned long __page_base_offset; ++unsigned long __symbol_base_offset; ++unsigned long __x86_clflush_size; ++static u8 max_physaddr_bits; ++ ++unsigned int pkvm_memblock_nr; ++struct memblock_region pkvm_memory[PKVM_MEMBLOCK_REGIONS]; ++ ++void *pkvm_iophys_to_virt(unsigned long phys) ++{ ++ unsigned long iova = PKVM_IOVA_OFFSET + phys; ++ ++ if (iova >= __page_base_offset) ++ return (void *)INVALID_ADDR; ++ ++ return (void *)iova; ++} ++ ++void *pkvm_phys_to_virt(unsigned long phys) ++{ ++ return (void *)__page_base_offset + phys; ++} ++ ++unsigned long pkvm_virt_to_phys(void *virt) ++{ ++ /* this api only take care direct & io mapping */ ++ if ((unsigned long)virt < PKVM_IOVA_OFFSET) ++ return INVALID_ADDR; ++ ++ return ((unsigned long)virt >= __page_base_offset) ? ++ (unsigned long)virt - __page_base_offset : ++ (unsigned long)virt - PKVM_IOVA_OFFSET; ++} ++ ++unsigned long pkvm_virt_to_symbol_phys(void *virt) ++{ ++ return (unsigned long)virt - __symbol_base_offset; ++} ++ ++void *host_gpa2hva(unsigned long gpa) ++{ ++ /* host gpa = hpa */ ++ return pkvm_phys_to_virt(gpa); ++} ++ ++unsigned long host_gpa2hpa(unsigned long gpa) ++{ ++ /* Host VM is using identity mapping so GPA == HPA */ ++ return gpa; ++} ++ ++void *host_mmio2hva(unsigned long gpa) ++{ ++ return pkvm_iophys_to_virt(gpa); ++} ++ ++extern struct pkvm_pgtable_ops mmu_ops; ++static struct pkvm_mm_ops mm_ops = { ++ .phys_to_virt = host_gpa2hva, ++}; ++ ++static int check_translation(struct kvm_vcpu *vcpu, gva_t gva, gpa_t gpa, ++ u64 prot, u32 access, struct x86_exception *exception) ++{ ++ u16 errcode = 0; ++ bool page_rw_flags_on = true; ++ bool user_mode_addr = true; ++ const int user_mode_access = access & PFERR_USER_MASK; ++ const int write_access = access & PFERR_WRITE_MASK; ++ bool cr4_smap = vmcs_readl(GUEST_CR4) & X86_CR4_SMAP; ++ bool cr0_wp = vmcs_readl(GUEST_CR0) & X86_CR0_WP; ++ ++ /* ++ * As pkvm hypervisor will not do instruction emulation, here we do not ++ * expect guest memory access for instruction fetch. 
++ */ ++ WARN_ON(access & PFERR_FETCH_MASK); ++ ++ /* pte is not present */ ++ if (gpa == INVALID_ADDR) { ++ goto check_fault; ++ } else { ++ errcode |= PFERR_PRESENT_MASK; ++ ++ /*TODO: check reserved bits and PK */ ++ ++ /* check for R/W */ ++ if ((prot & _PAGE_RW) == 0) { ++ if (write_access && (user_mode_access || cr0_wp)) ++ /* ++ * case 1: Supermode and wp is 1 ++ * case 2: Usermode ++ */ ++ goto check_fault; ++ page_rw_flags_on = false; ++ } ++ ++ /* check for U/S */ ++ if ((prot & _PAGE_USER) == 0) { ++ user_mode_addr = false; ++ if (user_mode_access) ++ goto check_fault; ++ } ++ ++ /* ++ * When SMAP is on, we only need to apply check when address is ++ * user-mode address. ++ * ++ * Also SMAP only impacts the supervisor-mode access. ++ */ ++ /* if SMAP is enabled and supervisor-mode access */ ++ if (cr4_smap && (!user_mode_access) && user_mode_addr) { ++ bool acflag = vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_AC; ++ ++ /* read from user mode address, eflags.ac = 0 */ ++ if ((!write_access) && (!acflag)) { ++ goto check_fault; ++ } else if (write_access) { ++ /* write to user mode address */ ++ ++ /* cr0.wp = 0, eflags.ac = 0 */ ++ if ((!cr0_wp) && (!acflag)) ++ goto check_fault; ++ ++ /* ++ * cr0.wp = 1, eflags.ac = 1, r/w flag is 0 ++ * on any paging structure entry ++ */ ++ if (cr0_wp && acflag && (!page_rw_flags_on)) ++ goto check_fault; ++ ++ /* cr0.wp = 1, eflags.ac = 0 */ ++ if (cr0_wp && (!acflag)) ++ goto check_fault; ++ } else { ++ /* do nothing */ ++ } ++ } ++ } ++ ++ return 0; ++ ++check_fault: ++ errcode |= write_access | user_mode_access; ++ exception->error_code = errcode; ++ exception->vector = PF_VECTOR; ++ exception->error_code_valid = true; ++ exception->address = gva; ++ exception->nested_page_fault = false; ++ exception->async_page_fault = false; ++ return -EFAULT; ++ ++} ++ ++int gva2gpa(struct kvm_vcpu *vcpu, gva_t gva, gpa_t *gpa, ++ u32 access, struct x86_exception *exception) ++{ ++ struct pkvm_pgtable guest_mmu; ++ gpa_t _gpa; ++ u64 prot; ++ int pg_level; ++ ++ /* caller should ensure exception is not NULL */ ++ WARN_ON(exception == NULL); ++ ++ memset(exception, 0, sizeof(*exception)); ++ ++ /*TODO: support other paging mode beside long mode */ ++ guest_mmu.root_pa = vcpu->arch.cr3 & PAGE_MASK; ++ pkvm_pgtable_init(&guest_mmu, &mm_ops, &mmu_ops, &pkvm_hyp->mmu_cap, false); ++ pkvm_pgtable_lookup(&guest_mmu, (unsigned long)gva, ++ (unsigned long *)&_gpa, &prot, &pg_level); ++ *gpa = _gpa; ++ ++ return check_translation(vcpu, gva, _gpa, prot, access, exception); ++} ++ ++static inline int __copy_gpa(struct kvm_vcpu *vcpu, void *addr, gpa_t gpa, ++ unsigned int size, unsigned int pg_size, ++ bool from_guest) ++{ ++ unsigned int len, offset_in_pg; ++ void *hva; ++ ++ offset_in_pg = (unsigned int)gpa & (pg_size - 1); ++ len = (size > (pg_size - offset_in_pg)) ? (pg_size - offset_in_pg) : size; ++ ++ hva = host_gpa2hva(gpa); ++ if (from_guest) ++ memcpy(addr, hva, len); ++ else ++ memcpy(hva, addr, len); ++ ++ return len; ++} ++ ++/* only support host VM now */ ++static int copy_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception, bool from_guest) ++{ ++ u32 access = VMX_AR_DPL(vmcs_read32(GUEST_SS_AR_BYTES)) == 3 ? 
PFERR_USER_MASK : 0; ++ gpa_t gpa; ++ unsigned int len; ++ int ret = 0; ++ ++ if (!from_guest) ++ access |= PFERR_WRITE_MASK; ++ ++ while ((bytes > 0) && (ret == 0)) { ++ ret = gva2gpa(vcpu, gva, &gpa, access, exception); ++ if (ret >= 0) { ++ len = __copy_gpa(vcpu, addr, gpa, bytes, PAGE_SIZE, from_guest); ++ if (len == 0) ++ return -EINVAL; ++ gva += len; ++ addr += len; ++ bytes -= len; ++ } ++ } ++ ++ return ret; ++} ++ ++int read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception) ++{ ++ return copy_gva(vcpu, gva, addr, bytes, exception, true); ++} ++ ++int write_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception) ++{ ++ return copy_gva(vcpu, gva, addr, bytes, exception, false); ++} ++ ++/* only support host VM now */ ++static int copy_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, ++ unsigned int bytes, bool from_guest) ++{ ++ unsigned int len; ++ ++ while (bytes > 0) { ++ len = __copy_gpa(vcpu, addr, gpa, bytes, PAGE_SIZE, from_guest); ++ if (len == 0) ++ return -EINVAL; ++ gpa += len; ++ addr += len; ++ bytes -= len; ++ } ++ ++ return 0; ++} ++ ++int read_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes) ++{ ++ return copy_gpa(vcpu, gpa, addr, bytes, true); ++} ++ ++int write_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes) ++{ ++ return copy_gpa(vcpu, gpa, addr, bytes, false); ++} ++ ++bool find_mem_range(unsigned long addr, struct mem_range *range) ++{ ++ int cur, left = 0, right = pkvm_memblock_nr; ++ struct memblock_region *reg; ++ unsigned long end; ++ ++ range->start = 0; ++ range->end = ULONG_MAX; ++ ++ /* The list of memblock regions is sorted, binary search it */ ++ while (left < right) { ++ cur = (left + right) >> 1; ++ reg = &pkvm_memory[cur]; ++ end = reg->base + reg->size; ++ if (addr < reg->base) { ++ right = cur; ++ range->end = reg->base; ++ } else if (addr >= end) { ++ left = cur + 1; ++ range->start = end; ++ } else { ++ range->start = reg->base; ++ range->end = end; ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++bool mem_range_included(struct mem_range *child, struct mem_range *parent) ++{ ++ return parent->start <= child->start && child->end <= parent->end; ++} ++ ++static void pkvm_clflush_cache_range_opt(void *vaddr, unsigned int size) ++{ ++ const unsigned long clflush_size = __x86_clflush_size; ++ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); ++ void *vend = vaddr + size; ++ ++ if (p >= vend) ++ return; ++ ++ for (; p < vend; p += clflush_size) ++ clflushopt(p); ++} ++ ++/** ++ * pkvm_clflush_cache_range - flush a cache range with clflush ++ * which is implemented by referring to clflush_cache_range() in kernel. ++ * ++ * @vaddr: virtual start address ++ * @size: number of bytes to flush ++ */ ++void pkvm_clflush_cache_range(void *vaddr, unsigned int size) ++{ ++ /* ++ * clflush is an unordered instruction which needs fencing ++ * with MFENCE or SFENCE to avoid ordering issue. Put a mb() ++ * before the clflush. ++ */ ++ mb(); ++ pkvm_clflush_cache_range_opt(vaddr, size); ++ /* And also put another one after. 
*/ ++ mb(); ++} ++ ++u64 get_max_physaddr_bits(void) ++{ ++ u32 eax, ebx, ecx, edx; ++ ++ if (max_physaddr_bits) ++ return max_physaddr_bits; ++ ++ eax = 0x80000000; ++ ecx = 0; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ if (eax >= 0x80000008) { ++ eax = 0x80000008; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ max_physaddr_bits = (u8)eax & 0xff; ++ } ++ ++ return max_physaddr_bits; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/memory.h b/arch/x86/kvm/vmx/pkvm/hyp/memory.h +new file mode 100644 +index 000000000000..ba6608ec6800 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/memory.h +@@ -0,0 +1,51 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_MEMORY_H_ ++#define _PKVM_MEMORY_H_ ++ ++#include ++ ++#define INVALID_ADDR (~(unsigned long)0) ++ ++/* ++ * simply define IOVA offset from bit 43 to avoid ++ * canonical addressing check for the linear address ++ * as max linear address bits usually >= 47 ++ */ ++#define PKVM_IOVA_OFFSET 0x0000080000000000 ++ ++/* MMU entry property bits for UC. Can be used to map MMIO. */ ++#define PKVM_PAGE_IO_NOCACHE ((u64)(__PAGE_KERNEL | _PAGE_PWT | _PAGE_PCD)) ++ ++unsigned long pkvm_virt_to_symbol_phys(void *virt); ++#define __pkvm_pa_symbol(x) pkvm_virt_to_symbol_phys((void *)x) ++ ++void *pkvm_iophys_to_virt(unsigned long phys); ++ ++#include ++void *host_gpa2hva(unsigned long gpa); ++unsigned long host_gpa2hpa(unsigned long gpa); ++void *host_mmio2hva(unsigned long gpa); ++int gva2gpa(struct kvm_vcpu *vcpu, gva_t gva, gpa_t *gpa, ++ u32 access, struct x86_exception *exception); ++int read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception); ++int write_gva(struct kvm_vcpu *vcpu, gva_t gva, void *addr, ++ unsigned int bytes, struct x86_exception *exception); ++int read_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes); ++int write_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, void *addr, unsigned int bytes); ++ ++struct mem_range { ++ unsigned long start; ++ unsigned long end; ++}; ++ ++bool find_mem_range(unsigned long addr, struct mem_range *range); ++bool mem_range_included(struct mem_range *child, struct mem_range *parent); ++ ++void pkvm_clflush_cache_range(void *vaddr, unsigned int size); ++ ++u64 get_max_physaddr_bits(void); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mmu.c b/arch/x86/kvm/vmx/pkvm/hyp/mmu.c +new file mode 100644 +index 000000000000..5cf5c784e501 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mmu.c +@@ -0,0 +1,258 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include "pkvm_hyp.h" ++#include "early_alloc.h" ++#include "pgtable.h" ++#include "mmu.h" ++#include "debug.h" ++ ++static struct pkvm_pool mmu_pool; ++static struct pkvm_pgtable hyp_mmu; ++static pkvm_spinlock_t _hyp_mmu_lock = __PKVM_SPINLOCK_UNLOCKED; ++ ++static void *mmu_zalloc_page(void) ++{ ++ return pkvm_alloc_pages(&mmu_pool, 0); ++} ++ ++static void mmu_get_page(void *vaddr) ++{ ++ pkvm_get_page(&mmu_pool, vaddr); ++} ++ ++static void mmu_put_page(void *vaddr) ++{ ++ pkvm_put_page(&mmu_pool, vaddr); ++} ++ ++static void flush_tlb_noop(struct pkvm_pgtable *pgt, ++ unsigned long addr, unsigned long size) ++{ ++} ++ ++static struct pkvm_mm_ops mmu_mm_ops = { ++ .phys_to_virt = pkvm_phys_to_virt, ++ .virt_to_phys = pkvm_virt_to_phys, ++ .zalloc_page = mmu_zalloc_page, ++ .get_page = mmu_get_page, ++ .put_page = 
mmu_put_page, ++ .page_count = pkvm_page_count, ++ .flush_tlb = flush_tlb_noop, ++}; ++ ++static bool mmu_entry_present(void *ptep) ++{ ++ return pte_present(*(pte_t *)ptep); ++} ++ ++static bool mmu_entry_huge(void *ptep) ++{ ++ return pte_huge(*(pte_t *)ptep); ++} ++ ++static void mmu_entry_mkhuge(void *ptep) ++{ ++ pte_t *ptep_ptr = (pte_t *)ptep; ++ ++ *ptep_ptr = pte_mkhuge(*ptep_ptr); ++} ++ ++static unsigned long mmu_entry_to_phys(void *ptep) ++{ ++ return native_pte_val(*(pte_t *)ptep) & PTE_PFN_MASK; ++} ++ ++static u64 mmu_entry_to_prot(void *ptep) ++{ ++ return (u64)pte_flags(pte_clear_flags(*(pte_t *)ptep, _PAGE_PSE)); ++} ++ ++static int mmu_entry_to_index(unsigned long vaddr, int level) ++{ ++ return PT_LEVEL_INDEX(vaddr, level); ++} ++ ++static bool mmu_entry_is_leaf(void *ptep, int level) ++{ ++ if (level == PG_LEVEL_4K || ++ !mmu_entry_present(ptep) || ++ mmu_entry_huge(ptep)) ++ return true; ++ ++ return false; ++} ++ ++static int mmu_level_entry_size(int level) ++{ ++ return PAGE_SIZE / PTRS_PER_PTE; ++} ++ ++static int mmu_level_to_entries(int level) ++{ ++ return PTRS_PER_PTE; ++} ++ ++static unsigned long mmu_level_to_size(int level) ++{ ++ return page_level_size(level); ++} ++ ++static void mmu_set_entry(void *ptep, u64 pte) ++{ ++ native_set_pte((pte_t *)ptep, native_make_pte(pte)); ++} ++ ++static u64 mmu_level_page_mask(int level) ++{ ++ return (~((1UL << PT64_LEVEL_SHIFT(level)) - 1)); ++} ++ ++struct pkvm_pgtable_ops mmu_ops = { ++ .pgt_entry_present = mmu_entry_present, ++ .pgt_entry_mapped = mmu_entry_present, ++ .pgt_entry_huge = mmu_entry_huge, ++ .pgt_entry_mkhuge = mmu_entry_mkhuge, ++ .pgt_entry_to_phys = mmu_entry_to_phys, ++ .pgt_entry_to_prot = mmu_entry_to_prot, ++ .pgt_entry_to_index = mmu_entry_to_index, ++ .pgt_level_page_mask = mmu_level_page_mask, ++ .pgt_entry_is_leaf = mmu_entry_is_leaf, ++ .pgt_level_entry_size = mmu_level_entry_size, ++ .pgt_level_to_entries = mmu_level_to_entries, ++ .pgt_level_to_size = mmu_level_to_size, ++ .pgt_set_entry = mmu_set_entry, ++ .default_prot = MMU_PROT_DEF, ++}; ++ ++static int finalize_host_mappings_walker(struct pkvm_pgtable *mmu, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_mm_ops *mm_ops = arg; ++ struct pkvm_pgtable_ops *pgt_ops = mmu->pgt_ops; ++ ++ if (!pgt_ops->pgt_entry_present(ptep)) ++ return 0; ++ ++ /* ++ * Fix-up the refcount for the page-table pages as the early allocator ++ * was unable to access the pkvm_vmemmap and so the buddy allocator has ++ * initialized the refcount to '1'. ++ */ ++ mm_ops->get_page(ptep); ++ ++ return 0; ++} ++ ++static int fix_pgtable_refcnt(void) ++{ ++ unsigned long size; ++ struct pkvm_pgtable_ops *pgt_ops; ++ struct pkvm_pgtable_walker walker = { ++ .cb = finalize_host_mappings_walker, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ .arg = hyp_mmu.mm_ops, ++ }; ++ ++ pgt_ops = hyp_mmu.pgt_ops; ++ /* ++ * Calculate the max address space, then walk the [0, size) address ++ * range to fixup refcount of every used page. 
++ */ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ /* ++ * only fix vmmemap range for debug mode, now for 64T memory, ++ * could be extended if physical memory is bigger than 64T ++ */ ++ size = (SZ_64T / PAGE_SIZE) * sizeof(struct pkvm_page); ++#else ++ size = pgt_ops->pgt_level_to_size(hyp_mmu.level + 1); ++#endif ++ ++ return pgtable_walk(&hyp_mmu, 0, size, true, &walker); ++} ++ ++int pkvm_mmu_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&_hyp_mmu_lock); ++ ret = pkvm_pgtable_map(&hyp_mmu, vaddr_start, phys_start, ++ size, pgsz_mask, prot, NULL); ++ pkvm_spin_unlock(&_hyp_mmu_lock); ++ return ret; ++} ++ ++int pkvm_mmu_unmap(unsigned long vaddr_start, unsigned long size) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&_hyp_mmu_lock); ++ ret = pkvm_pgtable_unmap(&hyp_mmu, vaddr_start, size, NULL); ++ pkvm_spin_unlock(&_hyp_mmu_lock); ++ return ret; ++} ++ ++/* early mmu init before vmemmap ready, use early allocator first */ ++int pkvm_early_mmu_init(struct pkvm_pgtable_cap *cap, ++ void *mmu_pool_base, unsigned long mmu_pool_pages) ++{ ++ pkvm_early_alloc_init(mmu_pool_base, mmu_pool_pages << PAGE_SHIFT); ++ pkvm_hyp->mmu = &hyp_mmu; ++ return pkvm_pgtable_init(&hyp_mmu, &pkvm_early_alloc_mm_ops, &mmu_ops, cap, true); ++} ++ ++/* later mmu init after vmemmap ready, switch to buddy allocator */ ++int pkvm_later_mmu_init(void *mmu_pool_base, unsigned long mmu_pool_pages) ++{ ++ unsigned long reserved_pages, pfn; ++ int ret; ++ ++ /* Enable buddy allocator */ ++ pfn = __pkvm_pa(mmu_pool_base) >> PAGE_SHIFT; ++ reserved_pages = pkvm_early_alloc_nr_used_pages(); ++ ret = pkvm_pool_init(&mmu_pool, pfn, mmu_pool_pages, reserved_pages); ++ if (ret) { ++ pkvm_err("fail to init mmu_pool"); ++ return ret; ++ } ++ ++ /* The ops should alloc memory from mmu_pool now */ ++ hyp_mmu.mm_ops = &mmu_mm_ops; ++ ++ /* ++ * as we used early alloc mm_ops to create early pgtable mapping for mmu, ++ * the refcount was not maintained at that time, we need fix it by re-walk ++ * the pgtable ++ */ ++ return fix_pgtable_refcnt(); ++} ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++void pkvm_mmu_clone_host(int level, unsigned long start_vaddr) ++{ ++ int i = mmu_entry_to_index(start_vaddr, level); ++ u64 *ptep = __va(hyp_mmu.root_pa); ++ u64 *host_cr3 = __va(__read_cr3() & PAGE_MASK); ++ ++ for (; i < PTRS_PER_PTE; i++) ++ ptep[i] = host_cr3[i]; ++ ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/mmu.h b/arch/x86/kvm/vmx/pkvm/hyp/mmu.h +new file mode 100644 +index 000000000000..ea2df00e1a5b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/mmu.h +@@ -0,0 +1,28 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_MMU_H_ ++#define _PKVM_MMU_H_ ++ ++#define MMU_PROT_DEF 0 ++ ++int pkvm_mmu_map(unsigned long vaddr_start, unsigned long phys_start, ++ unsigned long size, int pgsz_mask, u64 prot); ++ ++int pkvm_mmu_unmap(unsigned long vaddr_start, unsigned long size); ++ ++int pkvm_early_mmu_init(struct pkvm_pgtable_cap *cap, ++ void *mmu_pool_base, unsigned long mmu_pool_pages); ++ ++int pkvm_later_mmu_init(void *mmu_pool_base, unsigned long mmu_pool_pages); ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++void pkvm_mmu_clone_host(int level, unsigned long start_vaddr); ++#else ++static inline void pkvm_mmu_clone_host(int level, unsigned long start_vaddr) {} ++#endif ++ ++extern struct pkvm_pgtable_ops mmu_ops; ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.c b/arch/x86/kvm/vmx/pkvm/hyp/nested.c 
+new file mode 100644 +index 000000000000..ab4b4e40baf2 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.c +@@ -0,0 +1,1485 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "nested.h" ++#include "cpu.h" ++#include "vmx.h" ++#include "ept.h" ++#include "debug.h" ++#include "mem_protect.h" ++ ++/* ++ * Not support shadow vmcs & vmfunc; ++ * Not support descriptor-table exiting ++ * as it requires guest memory access ++ * to decode and emulate instructions ++ * which is not supported for protected VM. ++ */ ++#define NESTED_UNSUPPORTED_2NDEXEC \ ++ (SECONDARY_EXEC_SHADOW_VMCS | \ ++ SECONDARY_EXEC_ENABLE_VMFUNC | \ ++ SECONDARY_EXEC_DESC) ++ ++static const unsigned int vmx_msrs[] = { ++ LIST_OF_VMX_MSRS ++}; ++ ++bool is_vmx_msr(unsigned long msr) ++{ ++ bool found = false; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(vmx_msrs); i++) { ++ if (msr == vmx_msrs[i]) { ++ found = true; ++ break; ++ } ++ } ++ ++ return found; ++} ++ ++int read_vmx_msr(struct kvm_vcpu *vcpu, unsigned long msr, u64 *val) ++{ ++ u32 low, high; ++ int err = 0; ++ ++ pkvm_rdmsr(msr, low, high); ++ ++ switch (msr) { ++ case MSR_IA32_VMX_PROCBASED_CTLS2: ++ high &= ~NESTED_UNSUPPORTED_2NDEXEC; ++ break; ++ case MSR_IA32_VMX_MISC: ++ /* not support PT, SMM, Shadowing */ ++ low &= ~(MSR_IA32_VMX_MISC_INTEL_PT | BIT(14) | BIT(28) ++ | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS); ++ break; ++ case MSR_IA32_VMX_VMFUNC: ++ /* not support vmfunc */ ++ low = high = 0; ++ break; ++ case MSR_IA32_VMX_EPT_VPID_CAP: ++ low &= ~VMX_EPT_AD_BIT; ++ break; ++ default: ++ err = -EACCES; ++ break; ++ } ++ ++ *val = (u64)high << 32 | (u64)low; ++ ++ return err; ++} ++ ++/** ++ * According to SDM Appendix B Field Encoding in VMCS, some fields only ++ * exist on processors that support the 1-setting of the corresponding ++ * fields in the control regs. 
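++ *
++ * For example (mirroring the switch below): MSR_BITMAP only exists when
++ * CPU_BASED_USE_MSR_BITMAPS can be 1 in the primary processor-based controls,
++ * and EPT_POINTER only when SECONDARY_EXEC_ENABLE_EPT can be 1 in the
++ * secondary controls.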
++ */ ++static bool has_vmcs_field(u16 encoding) ++{ ++ struct nested_vmx_msrs *msrs = &pkvm_hyp->vmcs_config.nested; ++ ++ switch (encoding) { ++ case MSR_BITMAP: ++ return msrs->procbased_ctls_high & CPU_BASED_USE_MSR_BITMAPS; ++ case VIRTUAL_APIC_PAGE_ADDR: ++ case VIRTUAL_APIC_PAGE_ADDR_HIGH: ++ case TPR_THRESHOLD: ++ return msrs->procbased_ctls_high & CPU_BASED_TPR_SHADOW; ++ case SECONDARY_VM_EXEC_CONTROL: ++ return msrs->procbased_ctls_high & ++ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; ++ case VIRTUAL_PROCESSOR_ID: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID; ++ case XSS_EXIT_BITMAP: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_XSAVES; ++ case PML_ADDRESS: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_PML; ++ case VM_FUNCTION_CONTROL: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_VMFUNC; ++ case EPT_POINTER: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT; ++ case EOI_EXIT_BITMAP0: ++ case EOI_EXIT_BITMAP1: ++ case EOI_EXIT_BITMAP2: ++ case EOI_EXIT_BITMAP3: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; ++ case VMREAD_BITMAP: ++ case VMWRITE_BITMAP: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_SHADOW_VMCS; ++ case ENCLS_EXITING_BITMAP: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_ENCLS_EXITING; ++ case GUEST_INTR_STATUS: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; ++ case GUEST_PML_INDEX: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_PML; ++ case APIC_ACCESS_ADDR: ++ case APIC_ACCESS_ADDR_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; ++ case TSC_MULTIPLIER: ++ case TSC_MULTIPLIER_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_TSC_SCALING; ++ case GUEST_PHYSICAL_ADDRESS: ++ case GUEST_PHYSICAL_ADDRESS_HIGH: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_ENABLE_EPT; ++ case GUEST_PDPTR0: ++ case GUEST_PDPTR0_HIGH: ++ case GUEST_PDPTR1: ++ case GUEST_PDPTR1_HIGH: ++ case GUEST_PDPTR2: ++ case GUEST_PDPTR2_HIGH: ++ case GUEST_PDPTR3: ++ case GUEST_PDPTR3_HIGH: ++ return msrs->secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT; ++ case PLE_GAP: ++ case PLE_WINDOW: ++ return msrs->secondary_ctls_high & ++ SECONDARY_EXEC_PAUSE_LOOP_EXITING; ++ case VMX_PREEMPTION_TIMER_VALUE: ++ return msrs->pinbased_ctls_high & ++ PIN_BASED_VMX_PREEMPTION_TIMER; ++ case POSTED_INTR_DESC_ADDR: ++ return msrs->pinbased_ctls_high & PIN_BASED_POSTED_INTR; ++ case POSTED_INTR_NV: ++ return msrs->pinbased_ctls_high & PIN_BASED_POSTED_INTR; ++ case GUEST_IA32_PAT: ++ case GUEST_IA32_PAT_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_PAT) || ++ (msrs->exit_ctls_high & VM_EXIT_SAVE_IA32_PAT); ++ case GUEST_IA32_EFER: ++ case GUEST_IA32_EFER_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_EFER) || ++ (msrs->exit_ctls_high & VM_EXIT_SAVE_IA32_EFER); ++ case GUEST_IA32_PERF_GLOBAL_CTRL: ++ case GUEST_IA32_PERF_GLOBAL_CTRL_HIGH: ++ return msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ case GUEST_BNDCFGS: ++ case GUEST_BNDCFGS_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || ++ (msrs->exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS); ++ case GUEST_IA32_RTIT_CTL: ++ case GUEST_IA32_RTIT_CTL_HIGH: ++ return (msrs->entry_ctls_high & VM_ENTRY_LOAD_IA32_RTIT_CTL) || ++ (msrs->exit_ctls_high & VM_EXIT_CLEAR_IA32_RTIT_CTL); ++ case HOST_IA32_PAT: ++ case HOST_IA32_PAT_HIGH: ++ return msrs->exit_ctls_high & VM_EXIT_LOAD_IA32_PAT; ++ case 
HOST_IA32_EFER: ++ case HOST_IA32_EFER_HIGH: ++ return msrs->exit_ctls_high & VM_EXIT_LOAD_IA32_EFER; ++ case HOST_IA32_PERF_GLOBAL_CTRL: ++ case HOST_IA32_PERF_GLOBAL_CTRL_HIGH: ++ return msrs->exit_ctls_high & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ case EPTP_LIST_ADDRESS: ++ return msrs->vmfunc_controls & VMX_VMFUNC_EPTP_SWITCHING; ++ default: ++ return true; ++ } ++} ++ ++enum VMXResult { ++ VMsucceed, ++ VMfailValid, ++ VMfailInvalid, ++}; ++ ++struct shadow_vmcs_field { ++ u16 encoding; ++ u16 offset; ++}; ++ ++static u8 vmx_vmread_bitmap[PAGE_SIZE] __aligned(PAGE_SIZE); ++static u8 vmx_vmwrite_bitmap[PAGE_SIZE] __aligned(PAGE_SIZE); ++ ++static struct shadow_vmcs_field shadow_read_only_fields[] = { ++#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_shadow_read_only_fields = ++ ARRAY_SIZE(shadow_read_only_fields); ++static struct shadow_vmcs_field shadow_read_write_fields[] = { ++#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_shadow_read_write_fields = ++ ARRAY_SIZE(shadow_read_write_fields); ++static struct shadow_vmcs_field emulated_fields[] = { ++#define EMULATED_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, ++#include "pkvm_nested_vmcs_fields.h" ++}; ++static int max_emulated_fields = ++ ARRAY_SIZE(emulated_fields); ++ ++static void init_vmcs_shadow_fields(void) ++{ ++ int i, j; ++ ++ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); ++ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); ++ ++ for (i = j = 0; i < max_shadow_read_only_fields; i++) { ++ struct shadow_vmcs_field entry = shadow_read_only_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && ++ (i + 1 == max_shadow_read_only_fields || ++ shadow_read_only_fields[i + 1].encoding != field + 1)) { ++ pkvm_err("Missing field from shadow_read_only_field %x\n", ++ field + 1); ++ } ++ ++ clear_bit(field, (unsigned long *)vmx_vmread_bitmap); ++ if (field & 1) ++ continue; ++ shadow_read_only_fields[j++] = entry; ++ } ++ max_shadow_read_only_fields = j; ++ ++ for (i = j = 0; i < max_shadow_read_write_fields; i++) { ++ struct shadow_vmcs_field entry = shadow_read_write_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && ++ (i + 1 == max_shadow_read_write_fields || ++ shadow_read_write_fields[i + 1].encoding != field + 1)) { ++ pkvm_err("Missing field from shadow_read_write_field %x\n", ++ field + 1); ++ } ++ ++ clear_bit(field, (unsigned long *)vmx_vmwrite_bitmap); ++ clear_bit(field, (unsigned long *)vmx_vmread_bitmap); ++ if (field & 1) ++ continue; ++ shadow_read_write_fields[j++] = entry; ++ } ++ max_shadow_read_write_fields = j; ++} ++ ++static void init_emulated_vmcs_fields(void) ++{ ++ int i, j; ++ ++ for (i = j = 0; i < max_emulated_fields; i++) { ++ struct shadow_vmcs_field entry = emulated_fields[i]; ++ u16 field = entry.encoding; ++ ++ if (!has_vmcs_field(field)) ++ continue; ++ ++ emulated_fields[j++] = entry; ++ } ++ max_emulated_fields = j; ++} ++ ++static bool is_host_fields(unsigned long field) ++{ ++ return (((field) >> 10U) & 0x3U) == 3U; ++} ++ ++static bool is_emulated_fields(unsigned long field_encoding) ++{ ++ int i; ++ ++ for (i = 0; i < max_emulated_fields; i++) { ++ if ((unsigned long)emulated_fields[i].encoding == field_encoding) ++ return true; ++ } ++ ++ return 
false; ++} ++ ++static void nested_vmx_result(enum VMXResult result, int error_number) ++{ ++ u64 rflags = vmcs_readl(GUEST_RFLAGS); ++ ++ rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | ++ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF); ++ ++ if (result == VMfailValid) { ++ rflags |= X86_EFLAGS_ZF; ++ vmcs_write32(VM_INSTRUCTION_ERROR, error_number); ++ } else if (result == VMfailInvalid) { ++ rflags |= X86_EFLAGS_CF; ++ } else { ++ /* VMsucceed, do nothing */ ++ } ++ ++ if (result != VMsucceed) { ++ pkvm_err("VMX failed: %d/%d", result, error_number); ++ } ++ ++ vmcs_writel(GUEST_RFLAGS, rflags); ++} ++ ++static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, ++ u32 vmx_instruction_info, gva_t *ret) ++{ ++ gva_t off; ++ struct kvm_segment s; ++ ++ /* ++ * According to Vol. 3B, "Information for VM Exits Due to Instruction ++ * Execution", on an exit, vmx_instruction_info holds most of the ++ * addressing components of the operand. Only the displacement part ++ * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). ++ * For how an actual address is calculated from all these components, ++ * refer to Vol. 1, "Operand Addressing". ++ */ ++ int scaling = vmx_instruction_info & 3; ++ int addr_size = (vmx_instruction_info >> 7) & 7; ++ bool is_reg = vmx_instruction_info & (1u << 10); ++ int seg_reg = (vmx_instruction_info >> 15) & 7; ++ int index_reg = (vmx_instruction_info >> 18) & 0xf; ++ bool index_is_valid = !(vmx_instruction_info & (1u << 22)); ++ int base_reg = (vmx_instruction_info >> 23) & 0xf; ++ bool base_is_valid = !(vmx_instruction_info & (1u << 27)); ++ ++ if (is_reg) { ++ /* TODO: inject #UD */ ++ return 1; ++ } ++ ++ /* Addr = segment_base + offset */ ++ /* offset = base + [index * scale] + displacement */ ++ off = exit_qualification; /* holds the displacement */ ++ if (addr_size == 1) ++ off = (gva_t)sign_extend64(off, 31); ++ else if (addr_size == 0) ++ off = (gva_t)sign_extend64(off, 15); ++ if (base_is_valid) ++ off += vcpu->arch.regs[base_reg]; ++ if (index_is_valid) ++ off += vcpu->arch.regs[index_reg] << scaling; ++ ++ if (seg_reg == VCPU_SREG_FS) { ++ s.base = vmcs_readl(GUEST_FS_BASE); ++ } ++ if (seg_reg == VCPU_SREG_GS) { ++ s.base = vmcs_readl(GUEST_GS_BASE); ++ } ++ ++ /* TODO: support more cpu mode beside long mode */ ++ /* ++ * The effective address, i.e. @off, of a memory operand is truncated ++ * based on the address size of the instruction. Note that this is ++ * the *effective address*, i.e. the address prior to accounting for ++ * the segment's base. ++ */ ++ if (addr_size == 1) /* 32 bit */ ++ off &= 0xffffffff; ++ else if (addr_size == 0) /* 16 bit */ ++ off &= 0xffff; ++ ++ /* ++ * The virtual/linear address is never truncated in 64-bit ++ * mode, e.g. a 32-bit address size can yield a 64-bit virtual ++ * address when using FS/GS with a non-zero base. 
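++ *
++ * Hypothetical worked example: with addr_size == 1 (32-bit), an effective
++ * address of 0x100001000 is truncated to 0x00001000 above, and a non-zero
++ * FS base (say 0xffff888000000000) is then added without truncation, giving
++ * *ret = 0xffff888000001000.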
++ */ ++ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) ++ *ret = s.base + off; ++ else ++ *ret = off; ++ ++ /* TODO: check addr is canonical, otherwise inject #GP/#SS */ ++ ++ return 0; ++} ++ ++static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, ++ int *ret) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gva_t gva; ++ struct x86_exception e; ++ int r; ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) { ++ *ret = 1; ++ return -EINVAL; ++ } ++ ++ r = read_gva(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ *ret = 1; ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int validate_vmcs_revision_id(struct kvm_vcpu *vcpu, gpa_t vmpointer) ++{ ++ struct vmcs_config *vmcs_config = &pkvm_hyp->vmcs_config; ++ u32 rev_id; ++ ++ read_gpa(vcpu, vmpointer, &rev_id, sizeof(rev_id)); ++ ++ return (rev_id == vmcs_config->revision_id); ++} ++ ++static bool check_vmx_permission(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ bool permit = true; ++ ++ /*TODO: check more env (cr, cpl) and inject #UD/#GP */ ++ if (!vmx->nested.vmxon) ++ permit = false; ++ ++ return permit; ++} ++ ++static void clear_shadow_indicator(struct vmcs *vmcs) ++{ ++ vmcs->hdr.shadow_vmcs = 0; ++} ++ ++static void set_shadow_indicator(struct vmcs *vmcs) ++{ ++ vmcs->hdr.shadow_vmcs = 1; ++} ++ ++/* current vmcs is vmcs02 */ ++static void copy_shadow_fields_vmcs02_to_vmcs12(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ const struct shadow_vmcs_field *fields[] = { ++ shadow_read_write_fields, ++ shadow_read_only_fields ++ }; ++ const int max_fields[] = { ++ max_shadow_read_write_fields, ++ max_shadow_read_only_fields ++ }; ++ struct shadow_vmcs_field field; ++ unsigned long val; ++ int i, q; ++ ++ for (q = 0; q < ARRAY_SIZE(fields); q++) { ++ for (i = 0; i < max_fields[q]; i++) { ++ field = fields[q][i]; ++ val = __vmcs_readl(field.encoding); ++ if (is_host_fields((field.encoding))) { ++ pkvm_err("%s: field 0x%x is host field, please remove from shadowing!", ++ __func__, field.encoding); ++ continue; ++ } ++ vmcs12_write_any(vmcs12, field.encoding, field.offset, val); ++ } ++ } ++} ++ ++/* current vmcs is vmcs02 */ ++static void copy_shadow_fields_vmcs12_to_vmcs02(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ const struct shadow_vmcs_field *fields[] = { ++ shadow_read_write_fields, ++ shadow_read_only_fields ++ }; ++ const int max_fields[] = { ++ max_shadow_read_write_fields, ++ max_shadow_read_only_fields ++ }; ++ struct shadow_vmcs_field field; ++ unsigned long val; ++ int i, q; ++ ++ for (q = 0; q < ARRAY_SIZE(fields); q++) { ++ for (i = 0; i < max_fields[q]; i++) { ++ field = fields[q][i]; ++ val = vmcs12_read_any(vmcs12, field.encoding, ++ field.offset); ++ if (is_host_fields((field.encoding))) { ++ pkvm_err("%s: field 0x%x is host field, please remove from shadowing!", ++ __func__, field.encoding); ++ continue; ++ } ++ __vmcs_writel(field.encoding, val); ++ } ++ } ++} ++ ++/* current vmcs is vmcs01*/ ++static void save_vmcs01_fields_for_emulation(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ vcpu->arch.efer = vmcs_read64(GUEST_IA32_EFER); ++ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); ++ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); ++ vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ++} ++ ++/* current vmcs is vmcs02*/ ++static u64 emulate_field_for_vmcs02(struct vcpu_vmx *vmx, u16 field, u64 virt_val) 
++{ ++ u64 val = virt_val; ++ struct kvm_vcpu *vcpu = &vmx->vcpu; ++ ++ switch (field) { ++ case VM_ENTRY_CONTROLS: ++ /* L1 host wishes to use its own MSRs for L2 guest? ++ * vmcs02 shall use such guest states in vmcs01 as its guest states ++ */ ++ if ((val & VM_ENTRY_LOAD_IA32_EFER) != VM_ENTRY_LOAD_IA32_EFER) { ++ val |= VM_ENTRY_LOAD_IA32_EFER; ++ vmcs_write64(GUEST_IA32_EFER, vcpu->arch.efer); ++ } ++ if ((val & VM_ENTRY_LOAD_IA32_PAT) != VM_ENTRY_LOAD_IA32_PAT) { ++ val |= VM_ENTRY_LOAD_IA32_PAT; ++ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); ++ } ++ if ((val & VM_ENTRY_LOAD_DEBUG_CONTROLS) != VM_ENTRY_LOAD_DEBUG_CONTROLS) { ++ val |= VM_ENTRY_LOAD_DEBUG_CONTROLS; ++ vmcs_writel(GUEST_DR7, vcpu->arch.dr7); ++ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); ++ } ++ break; ++ case VM_EXIT_CONTROLS: ++ /* L1 host wishes to keep use MSRs from L2 guest after its VMExit? ++ * vmcs02 shall enable vmexit save for such guest states ++ * then vmcs01 shall take these guest states as its before L1 VMEntry ++ */ ++ if ((val & VM_EXIT_LOAD_IA32_EFER) != VM_EXIT_LOAD_IA32_EFER) ++ val |= VM_EXIT_SAVE_IA32_EFER; ++ if ((val & VM_EXIT_LOAD_IA32_PAT) != VM_EXIT_LOAD_IA32_PAT) ++ val |= VM_EXIT_SAVE_IA32_PAT; ++ /* host always in 64bit mode */ ++ val |= VM_EXIT_HOST_ADDR_SPACE_SIZE; ++ break; ++ case SECONDARY_VM_EXEC_CONTROL: ++ val &= ~NESTED_UNSUPPORTED_2NDEXEC; ++ /* Enable the #VE, but only protected VM will use it. */ ++ val |= SECONDARY_EXEC_EPT_VIOLATION_VE; ++ break; ++ } ++ return val; ++} ++ ++/* current vmcs is vmcs02*/ ++static void sync_vmcs12_dirty_fields_to_vmcs02(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ struct shadow_vmcs_field field; ++ unsigned long val, phys_val; ++ int i; ++ ++ if (vmx->nested.dirty_vmcs12) { ++ for (i = 0; i < max_emulated_fields; i++) { ++ field = emulated_fields[i]; ++ if (field.encoding == EPT_POINTER) ++ /* ++ * EPTP is configured as shadow EPTP when the first ++ * time the vmcs02 is loaded. As shadow EPTP is not ++ * changed at the runtime, also cannot use the virtual ++ * EPT from KVM high, no need to sync to vmcs02 again. ++ */ ++ continue; ++ val = vmcs12_read_any(vmcs12, field.encoding, field.offset); ++ phys_val = emulate_field_for_vmcs02(vmx, field.encoding, val); ++ __vmcs_writel(field.encoding, phys_val); ++ } ++ vmx->nested.dirty_vmcs12 = false; ++ } ++} ++ ++/* current vmcs is vmcs01, set vmcs01 guest state with vmcs02 host state */ ++static void prepare_vmcs01_guest_state(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ++{ ++ vmcs_writel(GUEST_CR0, vmcs12->host_cr0); ++ vmcs_writel(GUEST_CR3, vmcs12->host_cr3); ++ vmcs_writel(GUEST_CR4, vmcs12->host_cr4); ++ ++ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); ++ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); ++ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); ++ ++ /* Both cases want vmcs01 to take EFER/PAT from L2 ++ * 1. L1 host wishes to load its own MSRs on L2 guest VMExit ++ * such vmcs12's host states shall be set as vmcs01's guest states ++ * 2. L1 host wishes to keep use MSRs from L2 guest after its VMExit ++ * such vmcs02's guest state shall be set as vmcs01's guest states ++ * the vmcs02's guest state were recorded in vmcs12 host ++ * ++ * For case 1, IA32_PERF_GLOBAL_CTRL is separately checked. 
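++ *
++ * Concretely, for case 1: if L1 set VM_EXIT_LOAD_IA32_EFER, then
++ * vmcs12->host_ia32_efer is the EFER value L1 expects after its VM-exit, and
++ * it is written to vmcs01's GUEST_IA32_EFER below so that L1 resumes with it.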
++ */ ++ vmcs_write64(GUEST_IA32_EFER, vmcs12->host_ia32_efer); ++ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); ++ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) ++ vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); ++ ++ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); ++ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); ++ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); ++ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); ++ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); ++ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); ++ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); ++ ++ vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); ++ vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); ++ vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); ++ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); ++ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); ++ ++ vmcs_writel(GUEST_RIP, vmcs12->host_rip); ++ vmcs_writel(GUEST_RSP, vmcs12->host_rsp); ++ vmcs_writel(GUEST_RFLAGS, 0x2); ++} ++ ++static void nested_release_vmcs12(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02; ++ struct vmcs12 *vmcs12; ++ ++ if (vmx->nested.current_vmptr == INVALID_GPA) ++ return; ++ ++ /* cur_shadow_vcpu must be valid here */ ++ vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ vmcs_load_track(vmx, vmcs02); ++ copy_shadow_fields_vmcs02_to_vmcs12(vmx, vmcs12); ++ ++ vmcs_clear_track(vmx, vmcs02); ++ clear_shadow_indicator(vmcs02); ++ ++ /*disable shadowing*/ ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); ++ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); ++ ++ write_gpa(vcpu, vmx->nested.current_vmptr, vmcs12, VMCS12_SIZE); ++ vmx->nested.dirty_vmcs12 = false; ++ vmx->nested.current_vmptr = INVALID_GPA; ++ pkvm_hvcpu->current_shadow_vcpu = NULL; ++ ++ WRITE_ONCE(cur_shadow_vcpu->vcpu, NULL); ++ /* ++ * Flush the current used shadow EPT to make sure ++ * nested_flush_shadow_ept() won't miss any flushing due to vmclear. ++ * See comments in nested_flush_shadow_ept(). ++ */ ++ pkvm_flush_shadow_ept(&cur_shadow_vcpu->vm->sept_desc); ++ kvm_clear_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu); ++ ++ put_shadow_vcpu(cur_shadow_vcpu->shadow_vcpu_handle); ++} ++ ++static void nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else if (vmcs12->launch_state == launch) { ++ /* VMLAUNCH_NONCLEAR_VMCS or VMRESUME_NONLAUNCHED_VMCS */ ++ nested_vmx_result(VMfailValid, ++ launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); ++ } else { ++ /* save vmcs01 guest state for possible emulation */ ++ save_vmcs01_fields_for_emulation(vcpu); ++ ++ /* switch to vmcs02 */ ++ vmcs_clear_track(vmx, vmcs02); ++ clear_shadow_indicator(vmcs02); ++ vmcs_load_track(vmx, vmcs02); ++ ++ sync_vmcs12_dirty_fields_to_vmcs02(vmx, vmcs12); ++ ++ /* mark guest mode */ ++ vcpu->arch.hflags |= HF_GUEST_MASK; ++ } ++} ++ ++static void setup_guest_ept(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp) ++{ ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)shadow_vcpu->cached_vmcs12; ++ struct pkvm_shadow_vm *vm = shadow_vcpu->vm; ++ bool invalidate = false; ++ ++ if (!is_valid_eptp(guest_eptp)) ++ pkvm_guest_ept_deinit(shadow_vcpu); ++ else if (vmcs12->ept_pointer != guest_eptp) { ++ pkvm_guest_ept_deinit(shadow_vcpu); ++ pkvm_guest_ept_init(shadow_vcpu, guest_eptp); ++ } ++ ++ pkvm_spin_lock(&vm->lock); ++ if (vm->sept_desc.last_guest_eptp != guest_eptp) { ++ vm->sept_desc.last_guest_eptp = guest_eptp; ++ invalidate = true; ++ } ++ pkvm_spin_unlock(&vm->lock); ++ ++ if (invalidate) ++ pkvm_invalidate_shadow_ept(&vm->sept_desc); ++} ++ ++int handle_vmxon(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gpa_t vmptr; ++ int r; ++ ++ /*TODO: check env error(cr, efer, rflags, cpl) */ ++ if (vmx->nested.vmxon) { ++ nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); ++ } else { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailInvalid, 0); ++ return r; ++ } else if (!validate_vmcs_revision_id(vcpu, vmptr)) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ vmx->nested.current_vmptr = INVALID_GPA; ++ vmx->nested.dirty_vmcs12 = false; ++ vmx->nested.vmxon_ptr = vmptr; ++ vmx->nested.vmxon = true; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmxoff(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ if (check_vmx_permission(vcpu)) { ++ vmx->nested.vmxon = false; ++ vmx->nested.vmxon_ptr = INVALID_GPA; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ ++ return 0; ++} ++ ++int handle_vmptrld(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct vmcs *vmcs02; ++ struct vmcs12 *vmcs12; ++ gpa_t vmptr; ++ int r; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ return r; ++ } else if (vmptr == vmx->nested.vmxon_ptr) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_VMXON_POINTER); ++ } else if (!validate_vmcs_revision_id(vcpu, vmptr)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); ++ } else { ++ if (vmx->nested.current_vmptr != vmptr) { ++ s64 handle; ++ ++ nested_release_vmcs12(vcpu); ++ ++ handle = find_shadow_vcpu_handle_by_vmcs(vmptr); ++ if ((handle > 0) && (shadow_vcpu = get_shadow_vcpu(handle))) { ++ vmcs02 = (struct vmcs *)shadow_vcpu->vmcs02; ++ vmcs12 = (struct vmcs12 *) shadow_vcpu->cached_vmcs12; ++ ++ read_gpa(vcpu, vmptr, vmcs12, VMCS12_SIZE); ++ vmx->nested.dirty_vmcs12 = true; ++ ++ /* ++ * Save vmcs01 guest state for possible emulation when ++ * calling sync_vmcs12_dirty_fields_to_vmcs02. 
++ */ ++ save_vmcs01_fields_for_emulation(vcpu); ++ ++ WRITE_ONCE(shadow_vcpu->vcpu, vcpu); ++ if (!shadow_vcpu->vmcs02_inited) { ++ memset(vmcs02, 0, pkvm_hyp->vmcs_config.size); ++ vmcs02->hdr.revision_id = pkvm_hyp->vmcs_config.revision_id; ++ vmcs_load_track(vmx, vmcs02); ++ init_contant_host_state_area(pkvm_hvcpu->pcpu, vcpu->cpu); ++ vmcs_writel(HOST_RIP, (unsigned long)__pkvm_vmx_vmexit); ++ /* ++ * EPTP is mantained by pKVM and configured with ++ * shadow EPTP from its corresponding shadow VM. ++ * As shadow EPTP is not changed at runtime, set ++ * it to EPTP when the first time this vmcs02 is ++ * loading. ++ */ ++ vmcs_write64(EPT_POINTER, ++ shadow_vcpu->vm->sept_desc.shadow_eptp); ++ /* ++ * Flush the shadow eptp in case there are stale ++ * entries which are not flushed when destroying ++ * this shadow EPTP at last time. ++ */ ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ ++ /* ++ * Write the #VE information physical address. ++ */ ++ if (shadow_vcpu_is_protected(shadow_vcpu)) { ++ memset(&shadow_vcpu->ve_info, 0, sizeof(shadow_vcpu->ve_info)); ++ vmcs_write64(VE_INFO_ADDR, __pkvm_pa(&shadow_vcpu->ve_info)); ++ } ++ ++ shadow_vcpu->last_cpu = vcpu->cpu; ++ shadow_vcpu->vmcs02_inited = true; ++ } else { ++ vmcs_load_track(vmx, vmcs02); ++ if (shadow_vcpu->last_cpu != vcpu->cpu) { ++ init_contant_host_state_area(pkvm_hvcpu->pcpu, vcpu->cpu); ++ shadow_vcpu->last_cpu = vcpu->cpu; ++ } ++ } ++ ++ pkvm_hvcpu->current_shadow_vcpu = shadow_vcpu; ++ ++ copy_shadow_fields_vmcs12_to_vmcs02(vmx, vmcs12); ++ sync_vmcs12_dirty_fields_to_vmcs02(vmx, vmcs12); ++ vmcs_clear_track(vmx, vmcs02); ++ set_shadow_indicator(vmcs02); ++ ++ /* enable shadowing */ ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ vmcs_write64(VMREAD_BITMAP, __pkvm_pa_symbol(vmx_vmread_bitmap)); ++ vmcs_write64(VMWRITE_BITMAP, __pkvm_pa_symbol(vmx_vmwrite_bitmap)); ++ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); ++ vmcs_write64(VMCS_LINK_POINTER, __pkvm_pa(vmcs02)); ++ ++ vmx->nested.current_vmptr = vmptr; ++ ++ nested_vmx_result(VMsucceed, 0); ++ } else { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ } ++ } else { ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmclear(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ gpa_t vmptr; ++ u32 zero = 0; ++ int r; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { ++ nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS); ++ return r; ++ } else if (vmptr == vmx->nested.vmxon_ptr) { ++ nested_vmx_result(VMfailValid, VMXERR_VMCLEAR_VMXON_POINTER); ++ } else { ++ if (vmx->nested.current_vmptr == vmptr) ++ nested_release_vmcs12(vcpu); ++ ++ write_gpa(vcpu, vmptr + offsetof(struct vmcs12, launch_state), ++ &zero, sizeof(zero)); ++ ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmwrite(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ struct x86_exception e; ++ unsigned long field; ++ short offset; ++ gva_t gva; ++ int r, reg; ++ u64 value = 0; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ if (instr_info 
& BIT(10)) { ++ reg = ((instr_info) >> 3) & 0xf; ++ value = vcpu->arch.regs[reg]; ++ } else { ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ instr_info, &gva)) ++ return 1; ++ ++ r = read_gva(vcpu, gva, &value, 8, &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ return r; ++ } ++ } ++ ++ reg = ((instr_info) >> 28) & 0xf; ++ field = vcpu->arch.regs[reg]; ++ ++ offset = get_vmcs12_field_offset(field); ++ if (offset < 0) { ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ return 0; ++ } ++ ++ /*TODO: check vcpu supports "VMWRITE to any supported field in the VMCS"*/ ++ if (vmcs_field_readonly(field)) { ++ nested_vmx_result(VMfailInvalid, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); ++ return 0; ++ } ++ ++ /* ++ * Some Intel CPUs intentionally drop the reserved bits of the AR byte ++ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM ++ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE ++ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD ++ * from L1 will return a different value than VMREAD from L2 (L1 sees ++ * the stripped down value, L2 sees the full value as stored by KVM). ++ */ ++ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) ++ value &= 0x1f0ff; ++ ++ if (field == EPT_POINTER) ++ setup_guest_ept(cur_shadow_vcpu, value); ++ ++ vmcs12_write_any(vmcs12, field, offset, value); ++ ++ if (is_emulated_fields(field)) { ++ vmx->nested.dirty_vmcs12 = true; ++ nested_vmx_result(VMsucceed, 0); ++ } else if (is_host_fields(field)){ ++ nested_vmx_result(VMsucceed, 0); ++ } else { ++ pkvm_err("%s: not include emulated fields 0x%lx, please add!\n", ++ __func__, field); ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmread(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ struct x86_exception e; ++ unsigned long field; ++ short offset; ++ gva_t gva = 0; ++ int r, reg; ++ u64 value; ++ ++ if (check_vmx_permission(vcpu)) { ++ if (vmx->nested.current_vmptr == INVALID_GPA) { ++ nested_vmx_result(VMfailInvalid, 0); ++ } else { ++ /* Decode instruction info and find the field to read */ ++ reg = ((instr_info) >> 28) & 0xf; ++ field = vcpu->arch.regs[reg]; ++ ++ offset = get_vmcs12_field_offset(field); ++ if (offset < 0) { ++ nested_vmx_result(VMfailInvalid, VMXERR_UNSUPPORTED_VMCS_COMPONENT); ++ } else { ++ value = vmcs12_read_any(vmcs12, field, offset); ++ if (instr_info & BIT(10)) { ++ reg = ((instr_info) >> 3) & 0xf; ++ vcpu->arch.regs[reg] = value; ++ } else { ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ instr_info, &gva)) ++ return 1; ++ ++ r = write_gva(vcpu, gva, &value, 8, &e); ++ if (r < 0) { ++ /*TODO: handle memory failure exception */ ++ return r; ++ } ++ } ++ nested_vmx_result(VMsucceed, 0); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int handle_vmresume(struct kvm_vcpu *vcpu) ++{ ++ if (check_vmx_permission(vcpu)) ++ nested_vmx_run(vcpu, false); ++ ++ return 0; ++} ++ ++int handle_vmlaunch(struct kvm_vcpu *vcpu) ++{ ++ if (check_vmx_permission(vcpu)) ++ nested_vmx_run(vcpu, true); ++ ++ return 0; ++} ++ ++int handle_invept(struct kvm_vcpu *vcpu) ++{ ++ struct vmx_capability *vmx_cap = 
&pkvm_hyp->vmx_cap; ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 vmx_instruction_info, types; ++ unsigned long type; ++ int gpr_index; ++ ++ if (!vmx_has_invept()) ++ /* TODO: inject #UD */ ++ return -EINVAL; ++ ++ if (!check_vmx_permission(vcpu)) ++ return 0; ++ ++ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); ++ type = vcpu->arch.regs[gpr_index]; ++ types = (vmx_cap->ept >> VMX_EPT_EXTENT_SHIFT) & 6; ++ ++ if (type >= 32 || !(types & (1 << type))) { ++ nested_vmx_result(VMfailValid, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ /* ++ * Shadow EPT TLB is flushed when doing vmclear for a shadow vcpu, so if ++ * this CPU doesn't have a shadow vcpu loaded, there is no shadow ++ * EPT TLB entries left on this CPU, and no need to execute invept. ++ */ ++ shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ if (!shadow_vcpu) ++ goto out; ++ ++ switch (type) { ++ case VMX_EPT_EXTENT_CONTEXT: { ++ struct vmcs12 *vmcs12; ++ struct x86_exception e; ++ gva_t gva; ++ struct { ++ u64 eptp, gpa; ++ } operand; ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmx_instruction_info, &gva)) ++ /* TODO: handle the decode failure */ ++ return -EINVAL; ++ ++ if (read_gva(vcpu, gva, &operand, sizeof(operand), &e) < 0) ++ /*TODO: handle memory failure exception */ ++ return -EINVAL; ++ ++ /* ++ * For single context invept with a guest eptp, do the invept ++ * if the guest eptp matches the shadow eptp of this ++ * loaded shadow vcpu. ++ */ ++ vmcs12 = (struct vmcs12 *)shadow_vcpu->cached_vmcs12; ++ if (vmcs12->ept_pointer == operand.eptp) ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ break; ++ } ++ case VMX_EPT_EXTENT_GLOBAL: ++ /* ++ * For global context invept, directly do invept with the ++ * shadow eptp of the current shadow vcpu, as there is no ++ * other shadow ept's TLB entries left on this cpu. 
++ */ ++ pkvm_flush_shadow_ept(&shadow_vcpu->vm->sept_desc); ++ break; ++ default: ++ break; ++ } ++ ++out: ++ nested_vmx_result(VMsucceed, 0); ++ return 0; ++} ++ ++void vpid_sync_context(int vpid) ++{ ++ if (vmx_has_invvpid_single()) ++ vpid_sync_vcpu_single(vpid); ++ else if (vpid != 0) ++ vpid_sync_vcpu_global(); ++} ++ ++void vpid_sync_vcpu_addr(int vpid, gva_t addr) ++{ ++ if (vpid == 0) ++ return; ++ ++ if (vmx_has_invvpid_individual_addr()) ++ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); ++ else ++ vpid_sync_context(vpid); ++} ++ ++#define VMX_VPID_EXTENT_SUPPORTED_MASK \ ++ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) ++ ++int handle_invvpid(struct kvm_vcpu *vcpu) ++{ ++ struct vmx_capability *vmx_cap = &pkvm_hyp->vmx_cap; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 vmx_instruction_info, types; ++ struct x86_exception e; ++ unsigned long type; ++ gva_t gva; ++ int gpr_index; ++ ++ struct { ++ u64 vpid : 16; ++ u64 rsvd : 48; ++ u64 gla; ++ } operand; ++ ++ if (!vmx_has_invvpid()) ++ /* TODO: inject #UD */ ++ return -EINVAL; ++ ++ if (!check_vmx_permission(vcpu)) ++ return 0; ++ ++ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); ++ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); ++ type = vcpu->arch.regs[gpr_index]; ++ types = (vmx_cap->vpid & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; ++ ++ if (type > VMX_VPID_EXTENT_SINGLE_NON_GLOBAL || !(types & (1 << type))) { ++ nested_vmx_result(VMfailValid, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ if (get_vmx_mem_address(vcpu, vmx->exit_qualification, ++ vmx_instruction_info, &gva)) ++ /* TODO: handle the decode failure */ ++ return -EINVAL; ++ ++ if (read_gva(vcpu, gva, &operand, sizeof(operand), &e) < 0) ++ /*TODO: handle memory failure exception */ ++ return -EINVAL; ++ ++ if (operand.rsvd != 0) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ switch (type) { ++ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: ++ if (!operand.vpid || ++ !__is_canonical_address(operand.gla, ++ pkvm_virt_addr_bits())) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ vpid_sync_vcpu_addr(operand.vpid, operand.gla); ++ break; ++ case VMX_VPID_EXTENT_SINGLE_CONTEXT: ++ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: ++ if (!operand.vpid) { ++ nested_vmx_result(VMfailValid, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ return 0; ++ } ++ ++ vpid_sync_context(operand.vpid); ++ break; ++ case VMX_VPID_EXTENT_ALL_CONTEXT: ++ vpid_sync_context(operand.vpid); ++ break; ++ default: ++ break; ++ } ++ ++ nested_vmx_result(VMsucceed, 0); ++ return 0; ++} ++ ++static bool nested_handle_ept_violation(struct shadow_vcpu_state *shadow_vcpu, ++ u64 l2_gpa, u64 exit_quali) ++{ ++ enum sept_handle_ret ret = pkvm_handle_shadow_ept_violation(shadow_vcpu, ++ l2_gpa, exit_quali); ++ bool handled = false; ++ ++ switch (ret) { ++ case PKVM_INJECT_EPT_MISC: { ++ struct vcpu_vmx *vmx = to_vmx(shadow_vcpu->vcpu); ++ ++ vmx->exit_reason.full = EXIT_REASON_EPT_MISCONFIG; ++ /* ++ * Inject EPT_MISCONFIG vmexit reason if can directly modify ++ * the read-only fields. Otherwise still deliver EPT_VIOLATION ++ * for simplification. 
++ */ ++ if (vmx_has_vmwrite_any_field()) ++ vmcs_write32(VM_EXIT_REASON, EXIT_REASON_EPT_MISCONFIG); ++ break; ++ } ++ case PKVM_HANDLED: ++ handled = true; ++ break; ++ default: ++ break; ++ } ++ ++ if (handled && (vmcs_read32(IDT_VECTORING_INFO_FIELD) & ++ VECTORING_INFO_VALID_MASK)) ++ /* pending interrupt, back to kvm-high to inject */ ++ handled = false; ++ ++ return handled; ++} ++ ++static void pkvm_get_ve_info(struct kvm_vcpu *vcpu) ++{ ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ struct pkvm_ve_info *ve; ++ ++ ve = &shadow_vcpu->ve_info; ++ ++ kvm_rcx_write(vcpu, ve->exit_reason); ++ kvm_rdx_write(vcpu, ve->exit_qual); ++ kvm_r8_write(vcpu, ve->gla); ++ kvm_r9_write(vcpu, ve->gpa); ++ ++ /* ++ * When virtualization exception happens, the valid filed in #VE ++ * information will be set to 0xffffffff. We need to clear it to 0 when ++ * protected VM handles this #VE, so another #VE can continue to happen. ++ */ ++ ve->valid = 0; ++} ++ ++static bool nested_handle_vmcall(struct kvm_vcpu *vcpu) ++{ ++ u64 nr, a0, a1, a2, a3; ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ struct pkvm_pgtable *pgstate_pgt = &shadow_vcpu->vm->pgstate_pgt; ++ bool handled = false; ++ int ret = 0; ++ ++ /* All normal guest's vmcall should be handled by KVM. */ ++ if (!shadow_vcpu_is_protected(shadow_vcpu)) ++ return false; ++ ++ nr = vcpu->arch.regs[VCPU_REGS_RAX]; ++ a0 = vcpu->arch.regs[VCPU_REGS_RBX]; ++ a1 = vcpu->arch.regs[VCPU_REGS_RCX]; ++ a2 = vcpu->arch.regs[VCPU_REGS_RDX]; ++ a3 = vcpu->arch.regs[VCPU_REGS_RSI]; ++ ++ switch (nr) { ++ case PKVM_GHC_SHARE_MEM: ++ ret = __pkvm_guest_share_host(pgstate_pgt, a0, a1); ++ handled = true; ++ break; ++ case PKVM_GHC_UNSHARE_MEM: ++ ret = __pkvm_guest_unshare_host(pgstate_pgt, a0, a1); ++ handled = true; ++ break; ++ case PKVM_GHC_GET_VE_INFO: ++ pkvm_get_ve_info(vcpu); ++ handled = true; ++ break; ++ default: ++ break; ++ } ++ ++ if (handled) ++ vcpu->arch.regs[VCPU_REGS_RAX] = ret; ++ ++ return handled; ++} ++ ++static bool nested_handle_cpuid(struct kvm_vcpu *vcpu) ++{ ++ struct shadow_vcpu_state *shadow_vcpu = to_pkvm_hvcpu(vcpu)->current_shadow_vcpu; ++ u32 leaf; ++ ++ if (!shadow_vcpu_is_protected(shadow_vcpu)) ++ return false; ++ ++ leaf = vcpu->arch.regs[VCPU_REGS_RAX]; ++ ++ /* ++ * Reuse the KVM_CPUID_SIGNATURE, which has been used by KVM. By ++ * intercept the process of detecting hypervisor, the protected vm will ++ * detect PKVM hypervisor instead of KVM. 
++ */ ++ if (leaf == KVM_CPUID_SIGNATURE) { ++ const u32 *sigptr = (const u32 *)"PKVMPKVMPKVM"; ++ vcpu->arch.regs[VCPU_REGS_RBX] = sigptr[0]; ++ vcpu->arch.regs[VCPU_REGS_RCX] = sigptr[1]; ++ vcpu->arch.regs[VCPU_REGS_RDX] = sigptr[2]; ++ return true; ++ } ++ ++ return false; ++} ++ ++int nested_vmexit(struct kvm_vcpu *vcpu, bool *skip_instruction) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ struct vmcs *vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02; ++ struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12; ++ ++ switch (vmx->exit_reason.full) { ++ case EXIT_REASON_EPT_VIOLATION: ++ /* EPT violation can be handled by pkvm, no need back to kvm-high */ ++ if (nested_handle_ept_violation(cur_shadow_vcpu, ++ vmcs_read64(GUEST_PHYSICAL_ADDRESS), ++ vmx->exit_qualification)) ++ return 0; ++ break; ++ case EXIT_REASON_VMCALL: ++ if (nested_handle_vmcall(vcpu)) { ++ *skip_instruction = true; ++ return 0; ++ } ++ break; ++ case EXIT_REASON_INIT_SIGNAL: ++ /* ++ * INIT vmexit reason is unsupported by KVM in primary VM and ++ * it is reused by pkvm to kick vcpu out of non-root. ++ * When this vmexit reason happens, no need back to primary VM. ++ */ ++ return 0; ++ case EXIT_REASON_CPUID: ++ if (nested_handle_cpuid(vcpu)) { ++ *skip_instruction = true; ++ return 0; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* clear guest mode */ ++ vcpu->arch.hflags &= ~HF_GUEST_MASK; ++ ++ /* L1 host wishes to keep use MSRs from L2 guest after its VMExit? ++ * save vmcs02 guest state for later vmcs01 guest state preparation ++ */ ++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) != VM_EXIT_LOAD_IA32_EFER) ++ vmcs12->host_ia32_efer = vmcs_read64(GUEST_IA32_EFER); ++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) != VM_EXIT_LOAD_IA32_PAT) ++ vmcs12->host_ia32_pat = vmcs_read64(GUEST_IA32_PAT); ++ ++ if (!vmcs12->launch_state) ++ vmcs12->launch_state = 1; ++ ++ /* switch to vmcs01 */ ++ vmcs_clear_track(vmx, vmcs02); ++ set_shadow_indicator(vmcs02); ++ vmcs_load_track(vmx, vmx->loaded_vmcs->vmcs); ++ ++ prepare_vmcs01_guest_state(vmx, vmcs12); ++ ++ return 0; ++} ++ ++void nested_flush_shadow_ept(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *pkvm_hvcpu = to_pkvm_hvcpu(vcpu); ++ struct shadow_vcpu_state *cur_shadow_vcpu = pkvm_hvcpu->current_shadow_vcpu; ++ ++ /* ++ * If the shadow vcpu is released from this CPU, no need to ++ * worry about its TLB as it is already flushed during release. ++ */ ++ if (!cur_shadow_vcpu) ++ return; ++ ++ /* ++ * And probably the shadow EPT is not the one wanting to be flushed ++ * if another shadow vcpu is loaded after kick, and cannot tell ++ * this case without additional hints. So always do the shadow ++ * ept flushing. 
++ */ ++ pkvm_flush_shadow_ept(&cur_shadow_vcpu->vm->sept_desc); ++} ++ ++void nested_invalidate_shadow_ept(int shadow_vm_handle, u64 start_gpa, u64 size) ++{ ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ ++ if (!vm) ++ return; ++ ++ if (!start_gpa && !size) ++ /* ++ * With start_gpa = 0 & size = 0, do invalidation ++ * for the entire shadow EPT ++ */ ++ pkvm_invalidate_shadow_ept(&vm->sept_desc); ++ else ++ pkvm_invalidate_shadow_ept_with_range(&vm->sept_desc, ++ start_gpa, size); ++ ++ put_shadow_vm(shadow_vm_handle); ++} ++ ++void pkvm_init_nest(void) ++{ ++ init_vmcs_shadow_fields(); ++ init_emulated_vmcs_fields(); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.h b/arch/x86/kvm/vmx/pkvm/hyp/nested.h +new file mode 100644 +index 000000000000..c539026862c2 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.h +@@ -0,0 +1,32 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_NESTED_H ++#define __PKVM_NESTED_H ++ ++int handle_vmxon(struct kvm_vcpu *vcpu); ++int handle_vmxoff(struct kvm_vcpu *vcpu); ++int handle_vmptrld(struct kvm_vcpu *vcpu); ++int handle_vmclear(struct kvm_vcpu *vcpu); ++int handle_vmwrite(struct kvm_vcpu *vcpu); ++int handle_vmread(struct kvm_vcpu *vcpu); ++int handle_vmresume(struct kvm_vcpu *vcpu); ++int handle_vmlaunch(struct kvm_vcpu *vcpu); ++int handle_invept(struct kvm_vcpu *vcpu); ++int handle_invvpid(struct kvm_vcpu *vcpu); ++int nested_vmexit(struct kvm_vcpu *vcpu, bool *skip_instruction); ++void nested_flush_shadow_ept(struct kvm_vcpu *vcpu); ++void nested_invalidate_shadow_ept(int shadow_handle, u64 start_gpa, u64 size); ++void pkvm_init_nest(void); ++ ++#define LIST_OF_VMX_MSRS \ ++ MSR_IA32_VMX_MISC, \ ++ MSR_IA32_VMX_PROCBASED_CTLS2, \ ++ MSR_IA32_VMX_EPT_VPID_CAP, \ ++ MSR_IA32_VMX_VMFUNC ++ ++bool is_vmx_msr(unsigned long msr); ++int read_vmx_msr(struct kvm_vcpu *vcpu, unsigned long msr, u64 *val); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pci.c b/arch/x86/kvm/vmx/pkvm/hyp/pci.c +new file mode 100644 +index 000000000000..222f009e669c +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pci.c +@@ -0,0 +1,350 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2023 Intel Corporation. 
*/ ++#include ++#include ++ ++#include "pkvm_spinlock.h" ++#include "io.h" ++#include "io_emulate.h" ++#include "mmu.h" ++#include "ptdev.h" ++#include "pci.h" ++ ++static union pci_cfg_addr_reg host_vpci_cfg_addr; ++static pkvm_spinlock_t pci_cfg_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++static pkvm_spinlock_t host_vpci_cfg_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++static int pci_cfg_space_read(union pci_cfg_addr_reg *cfg_addr, ++ u32 offset, int size, unsigned long *value) ++{ ++ pkvm_spin_lock(&pci_cfg_lock); ++ ++ pkvm_pio_write(PCI_CFG_ADDR, 4, cfg_addr->value); ++ pkvm_pio_read(PCI_CFG_DATA + offset, size, value); ++ ++ pkvm_spin_unlock(&pci_cfg_lock); ++ ++ return 0; ++} ++ ++static int pci_cfg_space_write(union pci_cfg_addr_reg *cfg_addr, ++ u32 offset, int size, unsigned long value) ++{ ++ pkvm_spin_lock(&pci_cfg_lock); ++ ++ pkvm_pio_write(PCI_CFG_ADDR, 4, cfg_addr->value); ++ pkvm_pio_write(PCI_CFG_DATA + offset, size, value); ++ ++ pkvm_spin_unlock(&pci_cfg_lock); ++ ++ return 0; ++} ++ ++static int pci_mmcfg_read(u64 address, int size, unsigned long *value) ++{ ++ pkvm_mmio_read(address, size, value); ++ return 0; ++} ++ ++static int pci_mmcfg_write(u64 address, int size, unsigned long value) ++{ ++ pkvm_mmio_write(address, size, value); ++ return 0; ++} ++ ++unsigned long pkvm_pci_cfg_space_read(u32 bdf, u32 offset, int size) ++{ ++ union pci_cfg_addr_reg reg; ++ unsigned long value = 0; ++ ++ reg.enable = 1; ++ reg.bdf = bdf; ++ reg.reg = offset & (~0x3); ++ ++ pci_cfg_space_read(®, offset & 0x3, size, &value); ++ ++ return value; ++} ++ ++void pkvm_pci_cfg_space_write(u32 bdf, u32 offset, int size, unsigned long value) ++{ ++ union pci_cfg_addr_reg reg; ++ ++ reg.enable = 1; ++ reg.bdf = bdf; ++ reg.reg = offset & (~0x3); ++ ++ pci_cfg_space_write(®, offset & 0x3, size, value); ++} ++ ++static bool host_vpci_cfg_data_allow_write(struct pkvm_ptdev *ptdev, u64 offset, int size, u32 value) ++{ ++ int index; ++ ++ if (!ptdev_attached_to_vm(ptdev)) ++ return true; ++ ++ if (offset >= 0x10 && offset < 0x28) { ++ index = (offset-0x10) >> 2; ++ /* Allow only aligned BAR write with the cached value*/ ++ return (offset & 0x3) == 0 && size == 4 && value == ptdev->bars[index]; ++ } ++ ++ return true; ++} ++ ++static int host_vpci_cfg_addr_read(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ u32 value = host_vpci_cfg_addr.value; ++ int ret = 0; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ switch (req->size) { ++ case 1: ++ *(u8 *)req->value = (u8)value; ++ break; ++ case 2: ++ *(u16 *)req->value = (u16)value; ++ break; ++ case 4: ++ *(u32 *)req->value = value; ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_addr_write(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ u32 *value = &host_vpci_cfg_addr.value; ++ int ret = 0; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ switch (req->size) { ++ case 1: ++ *(u8 *)value = (u8)*req->value; ++ break; ++ case 2: ++ *(u16 *)value = (u16)*req->value; ++ break; ++ case 4: ++ *value = (u32)*req->value; ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_audit_write(struct pkvm_pio_req *req) ++{ ++ struct pkvm_ptdev *ptdev; ++ u64 offset = host_vpci_cfg_addr.reg; ++ u32 bdf = host_vpci_cfg_addr.bdf; ++ int ret; ++ ++ ptdev = pkvm_get_ptdev(bdf, 0); ++ ++ if (ptdev) { ++ pkvm_spin_lock(&ptdev->lock); ++ if 
(!host_vpci_cfg_data_allow_write(ptdev, offset + req->port - PCI_CFG_DATA, ++ req->size, *req->value)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ ret = pci_cfg_space_write(&host_vpci_cfg_addr, req->port - PCI_CFG_DATA, req->size, *req->value); ++ ++out: ++ if (ptdev) { ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ } ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_read(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ if (host_vpci_cfg_addr.enable) ++ ret = pci_cfg_space_read(&host_vpci_cfg_addr, req->port - PCI_CFG_DATA, req->size, req->value); ++ else ++ ret = -EINVAL; ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_cfg_data_write(struct kvm_vcpu *vcpu, struct pkvm_pio_req *req) ++{ ++ int ret; ++ ++ pkvm_spin_lock(&host_vpci_cfg_lock); ++ ++ if (host_vpci_cfg_addr.enable) ++ ret = host_vpci_cfg_data_audit_write(req); ++ else ++ ret = -EINVAL; ++ ++ pkvm_spin_unlock(&host_vpci_cfg_lock); ++ ++ return ret; ++} ++ ++static int host_vpci_mmcfg_get_bdf_offset(u64 address, u32 *bdf, u64 *offset) ++{ ++ int i; ++ struct pkvm_pci_info *pci_info; ++ struct pci_mmcfg_region *region; ++ ++ pci_info = &pkvm_hyp->host_vm.pci_info; ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ if (address >= region->res.start && address <= region->res.end) { ++ *bdf = (address - region->address) >> 12; ++ *offset = address & 0xfff; ++ return 0; ++ } ++ } ++ ++ return -EINVAL; ++} ++ ++int host_vpci_mmcfg_read(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ u64 address = (u64)host_mmio2hva(req->address); ++ ++ return pci_mmcfg_read(address, req->size, req->value); ++} ++ ++int host_vpci_mmcfg_write(struct kvm_vcpu *vcpu, struct pkvm_mmio_req *req) ++{ ++ struct pkvm_ptdev *ptdev; ++ u64 offset, address = (u64)host_mmio2hva(req->address); ++ u32 bdf; ++ int ret; ++ ++ if (host_vpci_mmcfg_get_bdf_offset(req->address, &bdf, &offset)) ++ return -EINVAL; ++ ++ ptdev = pkvm_get_ptdev(bdf, 0); ++ ++ if (ptdev) { ++ pkvm_spin_lock(&ptdev->lock); ++ if (!host_vpci_cfg_data_allow_write(ptdev, offset, req->size, *req->value)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ ret = pci_mmcfg_write(address, req->size, *req->value); ++ ++out: ++ if (ptdev) { ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ } ++ ++ return ret; ++} ++ ++int init_pci(struct pkvm_hyp *pkvm) ++{ ++ int ret; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_ADDR, IO_SIZE_4, host_vpci_cfg_addr_read, host_vpci_cfg_addr_write); ++ if (ret) ++ goto out; ++ ++ /* ++ * Kernel access the PCI config space data port in an unaligned way. So here we ++ * treat the data port as four consecutive ports and register four handlers for it. ++ * All registered ports and access width below are valid. 
++ */ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA, IO_SIZE_FULL, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 1, IO_SIZE_1, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 2, IO_SIZE_1 | IO_SIZE_2, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ ret = register_host_pio_handler(&pkvm->host_vm, ++ PCI_CFG_DATA + 3, IO_SIZE_1, host_vpci_cfg_data_read, host_vpci_cfg_data_write); ++ if (ret) ++ goto out; ++ ++ return 0; ++ ++out: ++ pkvm_err("pkvm: init pci failed"); ++ return ret; ++} ++ ++static int pkvm_mmu_map_mmcfg_region(struct pkvm_pci_info *pci_info) ++{ ++ struct pci_mmcfg_region *region; ++ int i, ret; ++ u64 start, end; ++ ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ start = region->res.start; ++ end = region->res.end; ++ ret = pkvm_mmu_map((u64)host_mmio2hva(start), start, ++ end - start + 1, 0, (u64)pgprot_val(PAGE_KERNEL_IO)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int init_finalize_pci(struct pkvm_pci_info *pci_info) ++{ ++ struct pci_mmcfg_region *region; ++ unsigned long start, end; ++ int ret, i; ++ ++ ret = pkvm_mmu_map_mmcfg_region(pci_info); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < pci_info->mmcfg_table_size; i++) { ++ region = &pci_info->mmcfg_table[i]; ++ start = region->res.start; ++ end = region->res.end; ++ ++ ret = register_host_mmio_handler(start, end, ++ host_vpci_mmcfg_read, host_vpci_mmcfg_write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pci.h b/arch/x86/kvm/vmx/pkvm/hyp/pci.h +new file mode 100644 +index 000000000000..22d57eff24df +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pci.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2023 Intel Corporation ++ */ ++#ifndef _PKVM_PCI_H_ ++#define _PKVM_PCI_H_ ++ ++#define PCI_CFG_ADDR 0xcf8 ++#define PCI_CFG_DATA 0xcfc ++ ++union pci_cfg_addr_reg { ++ u32 value; ++ struct { ++ u32 reg : 8; ++ u32 bdf : 16; ++ u32 resv : 7; ++ u32 enable : 1; ++ }; ++}; ++ ++unsigned long pkvm_pci_cfg_space_read(u32 bdf, u32 offset, int size); ++void pkvm_pci_cfg_space_write(u32 bdf, u32 offset, int size, unsigned long value); ++ ++int init_finalize_pci(struct pkvm_pci_info *pci); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c +new file mode 100644 +index 000000000000..463b053d7894 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.c +@@ -0,0 +1,801 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++ ++#include "pgtable.h" ++#include "memory.h" ++#include "mem_protect.h" ++#include "debug.h" ++#include "bug.h" ++ ++struct pgt_walk_data { ++ struct pkvm_pgtable *pgt; ++ struct pgt_flush_data flush_data; ++ unsigned long vaddr; ++ unsigned long vaddr_end; ++ struct pkvm_pgtable_walker *walker; ++}; ++ ++struct pkvm_pgtable_lookup_data { ++ unsigned long vaddr; ++ unsigned long phys; ++ u64 prot; ++ int level; ++}; ++ ++static bool pkvm_phys_is_valid(u64 phys) ++{ ++ return phys != INVALID_ADDR; ++} ++ ++static bool leaf_mapping_valid(struct pkvm_pgtable_ops *pgt_ops, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int pgsz_mask, ++ int level) ++{ ++ unsigned long page_size = pgt_ops->pgt_level_to_size(level); 
++ ++ if (!((1 << level) & pgsz_mask)) ++ return false; ++ ++ if (!IS_ALIGNED(vaddr, page_size)) ++ return false; ++ ++ if (page_size > (vaddr_end - vaddr)) ++ return false; ++ ++ return true; ++} ++ ++static bool leaf_mapping_allowed(struct pkvm_pgtable_ops *pgt_ops, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ unsigned long phys, ++ int pgsz_mask, ++ int level) ++{ ++ unsigned long page_size = pgt_ops->pgt_level_to_size(level); ++ ++ if (pkvm_phys_is_valid(phys) && !IS_ALIGNED(phys, page_size)) ++ return false; ++ ++ return leaf_mapping_valid(pgt_ops, vaddr, vaddr_end, pgsz_mask, level); ++} ++ ++static void *pgtable_alloc_page(struct pkvm_mm_ops *mm_ops) ++{ ++ void *page = NULL; ++ ++ if (mm_ops->zalloc_page) ++ page = mm_ops->zalloc_page(); ++ ++ if (page && mm_ops->flush_cache) ++ mm_ops->flush_cache(page, PAGE_SIZE); ++ ++ return page; ++} ++ ++static void pgtable_set_entry(struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_mm_ops *mm_ops, ++ void *ptep, u64 pte) ++{ ++ pgt_ops->pgt_set_entry(ptep, pte); ++ ++ if (mm_ops->flush_cache) ++ mm_ops->flush_cache(ptep, sizeof(u64)); ++} ++ ++static void pgtable_split(struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_mm_ops *mm_ops, ++ unsigned long vaddr, unsigned long phys, ++ unsigned long size, void *ptep, ++ int level, u64 prot) ++{ ++ unsigned long phys_end = phys + size; ++ int level_size = pgt_ops->pgt_level_to_size(level); ++ int entry_size = PAGE_SIZE / pgt_ops->pgt_level_to_entries(level); ++ int i = 0; ++ ++ if (level > PG_LEVEL_4K) ++ pgt_ops->pgt_entry_mkhuge(&prot); ++ ++ for (i = 0; phys < phys_end; phys += level_size, i++) { ++ pgtable_set_entry(pgt_ops, mm_ops,(ptep + i * entry_size), phys | prot); ++ mm_ops->get_page(ptep); ++ } ++} ++ ++int pgtable_map_leaf(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, ++ int level, void *ptep, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ u64 old = *(u64 *)ptep, new; ++ ++ if (pkvm_phys_is_valid(data->phys)) { ++ new = data->phys | data->prot; ++ if (level != PG_LEVEL_4K) ++ pgt_ops->pgt_entry_mkhuge(&new); ++ } else { ++ new = data->annotation; ++ } ++ ++ if (pgt_ops->pgt_entry_mapped(ptep)) { ++ /* if just modify the page state, do set_pte directly */ ++ if (!((old ^ new) & ~PKVM_PAGE_STATE_PROT_MASK)) ++ goto set_pte; ++ ++ if (pgt_ops->pgt_entry_present(ptep)) { ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, 0); ++ flush_data->flushtlb |= true; ++ } ++ mm_ops->put_page(ptep); ++ } ++ ++ if (pgt_ops->pgt_entry_mapped(&new)) ++ mm_ops->get_page(ptep); ++ ++set_pte: ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, new); ++ if (pkvm_phys_is_valid(data->phys)) ++ data->phys += page_level_size(level); ++ ++ return 0; ++} ++ ++static int pgtable_map_try_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ if (!leaf_mapping_allowed(pgt->pgt_ops, vaddr, vaddr_end, ++ data->phys, data->pgsz_mask, level)) { ++ /* The 4K page shall be able to map, otherwise return err */ ++ return (level == PG_LEVEL_4K ? 
-EINVAL: -E2BIG); ++ } ++ ++ if (data->map_leaf_override) ++ return data->map_leaf_override(pgt, vaddr, level, ptep, flush_data, data); ++ else ++ return pgtable_map_leaf(pgt, vaddr, level, ptep, flush_data, data); ++} ++ ++static int pgtable_map_walk_leaf(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long vaddr_end, ++ int level, void *ptep, unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ void *page; ++ int ret; ++ ++ /* First try to create leaf page mapping on current level */ ++ ret = pgtable_map_try_leaf(pgt, vaddr, vaddr_end, level, ptep, flush_data, data); ++ if (ret != -E2BIG) ++ return ret; ++ ++ /* ++ * Be here is because the mapping needs to be done on smaller(or level-1) ++ * page size. We need to allocate a table page for the smaller(level-1) ++ * page mapping. And for current level, if the huge page mapping is already ++ * present, we need further split it. ++ */ ++ page = pgtable_alloc_page(mm_ops); ++ if (!page) ++ return -ENOMEM; ++ ++ if (pgt_ops->pgt_entry_huge(ptep)) { ++ u64 prot = pgt_ops->pgt_entry_to_prot(ptep); ++ ++ prot = pkvm_mkstate(prot, pkvm_getstate(*(u64 *)ptep)); ++ ++ /* ++ * Split the large mapping and reuse the ++ * large mapping's prot. The translation ++ * doesn't have a change, so no need to ++ * flush tlb. ++ */ ++ mm_ops->put_page(ptep); ++ pgtable_split(pgt_ops, mm_ops, ALIGN_DOWN(vaddr, size), ++ pgt_ops->pgt_entry_to_phys(ptep), ++ size, page, level - 1, prot); ++ } ++ ++ mm_ops->get_page(ptep); ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt->table_prot | mm_ops->virt_to_phys(page)); ++ ++ return 0; ++} ++ ++/* ++ *TODO: support merging small entries to a large one. ++ */ ++static int pgtable_map_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_map_data *data = arg; ++ ++ switch(flags) { ++ case PKVM_PGTABLE_WALK_LEAF: ++ return pgtable_map_walk_leaf(pgt, vaddr, vaddr_end, level, ++ ptep, flags, flush_data, data); ++ case PKVM_PGTABLE_WALK_TABLE_PRE: ++ case PKVM_PGTABLE_WALK_TABLE_POST: ++ break; ++ } ++ ++ return -EINVAL; ++} ++ ++/* ++ * put_page_to_free_list(): the page added to the freelist should not be used ++ * by any one as this page will be used as a node linked to the freelist. ++ */ ++static inline void put_page_to_freelist(void *page, struct list_head *head) ++{ ++ struct list_head *node = page; ++ ++ list_add_tail(node, head); ++} ++ ++/* ++ * get_page_to_free_list(): the page got from the freelist is valid to be used ++ * again. 
++ */ ++static inline void *get_page_from_freelist(struct list_head *head) ++{ ++ struct list_head *node = head->next; ++ ++ list_del(node); ++ memset(node, 0, sizeof(struct list_head)); ++ ++ return (void *)node; ++} ++ ++static int pgtable_unmap_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_unmap_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ ++ if (data->phys != INVALID_ADDR) { ++ unsigned long phys = pgt_ops->pgt_entry_to_phys(ptep); ++ ++ PKVM_ASSERT(phys == data->phys); ++ } ++ ++ if (pgt_ops->pgt_entry_present(ptep)) ++ flush_data->flushtlb |= true; ++ ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt_ops->default_prot); ++ mm_ops->put_page(ptep); ++ ++ if (data->phys != INVALID_ADDR) { ++ data->phys = ALIGN_DOWN(data->phys, size); ++ data->phys += size; ++ } ++ ++ return 0; ++} ++ ++static void pgtable_free_child(struct pkvm_pgtable *pgt, void *ptep, ++ struct pgt_flush_data *flush_data) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ void *child_ptep; ++ ++ /* ++ * Check the child pte page refcount. Put the child pte page if ++ * no one else is using it. ++ */ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ if (mm_ops->page_count(child_ptep) == 1) { ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, pgt_ops->default_prot); ++ mm_ops->put_page(ptep); ++ put_page_to_freelist(child_ptep, &flush_data->free_list); ++ } ++} ++ ++static int pgtable_unmap_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_unmap_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ unsigned long size = page_level_size(level); ++ ++ if (!pgt_ops->pgt_entry_mapped(ptep)) ++ /* Nothing to do if the entry is not mapped */ ++ return 0; ++ ++ /* ++ * Unmap the page if the target address range belongs a ++ * - 4K PTE entry ++ * - huge page and don't need to split it ++ * - a full huge page ++ */ ++ if (level == PG_LEVEL_4K || (pgt_ops->pgt_entry_huge(ptep) && ++ (!data->split_huge_page || leaf_mapping_valid(pgt_ops, vaddr, ++ vaddr_end, 1 << level, level)))) { ++ ++ if (data->unmap_leaf_override) { ++ vaddr = ALIGN_DOWN(vaddr, pgt_ops->pgt_level_to_size(level)); ++ return data->unmap_leaf_override(pgt, vaddr, level, ptep, ++ flush_data, data); ++ } else ++ return pgtable_unmap_leaf(pgt, vaddr, level, ptep, ++ flush_data, data); ++ } ++ ++ if (pgt_ops->pgt_entry_huge(ptep)) { ++ /* ++ * if it is huge pte, split and goto next level. ++ */ ++ u64 prot = pgt_ops->pgt_entry_to_prot(ptep); ++ void *page = pgtable_alloc_page(mm_ops); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ prot = pkvm_mkstate(prot, pkvm_getstate(*(u64 *)ptep)); ++ /* ++ * Split the large mapping and reuse the ++ * large mapping's prot. The translation ++ * doesn't have a change, so no need to ++ * flush tlb. 
++ */ ++ pgtable_split(pgt_ops, mm_ops, ALIGN_DOWN(vaddr, size), ++ pgt_ops->pgt_entry_to_phys(ptep), ++ size, page, level - 1, prot); ++ pgtable_set_entry(pgt_ops, mm_ops, ptep, ++ pgt->table_prot | mm_ops->virt_to_phys(page)); ++ return 0; ++ } ++ ++ /* if not huge entry then means it is table entry */ ++ pgtable_free_child(pgt, ptep, flush_data); ++ return 0; ++} ++ ++static int pgtable_lookup_cb(struct pkvm_pgtable *pgt, ++ unsigned long aligned_vaddr, ++ unsigned long aligned_vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_lookup_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ u64 pte = atomic64_read((atomic64_t *)ptep); ++ ++ data->phys = INVALID_ADDR; ++ data->prot = 0; ++ data->level = level; ++ ++ /* ++ * This cb shall only be called for leaf. If now it is not a leaf ++ * that means the pte is changed by others, and we shall re-walk the pgtable ++ */ ++ if (unlikely(!pgt_ops->pgt_entry_is_leaf(&pte, level))) ++ return -EAGAIN; ++ ++ if (pgt_ops->pgt_entry_present(&pte)) { ++ unsigned long offset = ++ data->vaddr & ~pgt_ops->pgt_level_page_mask(level); ++ ++ data->phys = pgt_ops->pgt_entry_to_phys(&pte) + offset; ++ data->prot = pgt_ops->pgt_entry_to_prot(&pte); ++ } ++ ++ return PGTABLE_WALK_DONE; ++} ++ ++static int pgtable_free_leaf(struct pkvm_pgtable *pgt, ++ struct pgt_flush_data *flush_data, ++ void *ptep) ++{ ++ if (pgt->pgt_ops->pgt_entry_mapped(ptep)) { ++ if (pgt->pgt_ops->pgt_entry_present(ptep)) ++ flush_data->flushtlb |= true; ++ pgt->mm_ops->put_page(ptep); ++ } ++ ++ return 0; ++} ++ ++static int pgtable_free_cb(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, ++ unsigned long vaddr_end, ++ int level, ++ void *ptep, ++ unsigned long flags, ++ struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_free_data *data = arg; ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ ++ if (pgt_ops->pgt_entry_is_leaf(ptep, level)) { ++ if (data->free_leaf_override) ++ return data->free_leaf_override(pgt, vaddr, level, ptep, ++ flush_data, data); ++ else ++ return pgtable_free_leaf(pgt, flush_data, ptep); ++ } ++ ++ /* Free the child page */ ++ pgtable_free_child(pgt, ptep, flush_data); ++ return 0; ++} ++ ++static int _pgtable_walk(struct pgt_walk_data *data, void *ptep, int level); ++static int pgtable_visit(struct pgt_walk_data *data, void *ptep, int level) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = data->pgt->pgt_ops; ++ struct pkvm_mm_ops *mm_ops = data->pgt->mm_ops; ++ struct pkvm_pgtable_walker *walker = data->walker; ++ unsigned long flags = walker->flags; ++ bool leaf = pgt_ops->pgt_entry_is_leaf(ptep, level); ++ void *child_ptep; ++ int ret = 0; ++ ++ if (!leaf && (flags & PKVM_PGTABLE_WALK_TABLE_PRE)) ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_TABLE_PRE, ++ &data->flush_data, walker->arg); ++ ++ if (leaf && (flags & PKVM_PGTABLE_WALK_LEAF)) { ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_LEAF, ++ &data->flush_data, walker->arg); ++ leaf = pgt_ops->pgt_entry_is_leaf(ptep, level); ++ } ++ ++ if (ret) ++ return ret; ++ ++ if (leaf) { ++ unsigned long size = pgt_ops->pgt_level_to_size(level); ++ data->vaddr = ALIGN_DOWN(data->vaddr, size); ++ data->vaddr += size; ++ return ret; ++ } ++ ++ child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); ++ ret = _pgtable_walk(data, child_ptep, level - 1); ++ if (ret) 
++ return ret; ++ ++ if (flags & PKVM_PGTABLE_WALK_TABLE_POST) ++ ret = walker->cb(data->pgt, data->vaddr, data->vaddr_end, ++ level, ptep, PKVM_PGTABLE_WALK_TABLE_POST, ++ &data->flush_data, walker->arg); ++ ++ return ret; ++} ++ ++static int _pgtable_walk(struct pgt_walk_data *data, void *ptep, int level) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = data->pgt->pgt_ops; ++ int entries = pgt_ops->pgt_level_to_entries(level); ++ int entry_size = pgt_ops->pgt_level_entry_size(level); ++ int idx = pgt_ops->pgt_entry_to_index(data->vaddr, level); ++ int ret; ++ ++ for (; idx < entries; idx++) { ++ if (data->vaddr >= data->vaddr_end) ++ break; ++ ++ ret = pgtable_visit(data, (ptep + idx * entry_size), level); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long size, bool page_aligned, ++ struct pkvm_pgtable_walker *walker) ++{ ++ unsigned long aligned_vaddr = ++ page_aligned ? ALIGN_DOWN(vaddr, PAGE_SIZE) : vaddr; ++ unsigned long aligned_size = ++ page_aligned ? ALIGN(size, PAGE_SIZE) : size; ++ struct pgt_walk_data data = { ++ .pgt = pgt, ++ .flush_data = { ++ .flushtlb = false, ++ .free_list = LIST_HEAD_INIT(data.flush_data.free_list), ++ }, ++ .vaddr = aligned_vaddr, ++ .vaddr_end = aligned_vaddr + aligned_size, ++ .walker = walker, ++ }; ++ struct pkvm_mm_ops *mm_ops = pgt->mm_ops; ++ int ret; ++ ++ if (!size || data.vaddr == data.vaddr_end) ++ return 0; ++ ++ ret = _pgtable_walk(&data, mm_ops->phys_to_virt(pgt->root_pa), pgt->level); ++ ++ if (data.flush_data.flushtlb || !list_empty(&data.flush_data.free_list)) ++ pgt->mm_ops->flush_tlb(pgt, aligned_vaddr, aligned_size); ++ ++ while (!list_empty(&data.flush_data.free_list)) { ++ void *page = get_page_from_freelist(&data.flush_data.free_list); ++ ++ pgt->mm_ops->put_page(page); ++ } ++ ++ return ret; ++} ++ ++int pkvm_pgtable_init(struct pkvm_pgtable *pgt, ++ struct pkvm_mm_ops *mm_ops, ++ struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_pgtable_cap *cap, ++ bool alloc_root) ++{ ++ void *root; ++ ++ if (!mm_ops || !pgt_ops || !cap) ++ return -EINVAL; ++ ++ if (alloc_root) { ++ root = pgtable_alloc_page(mm_ops); ++ if (!root) ++ return -ENOMEM; ++ pgt->root_pa = __pkvm_pa(root); ++ } ++ ++ pgt->mm_ops = mm_ops; ++ pgt->pgt_ops = pgt_ops; ++ pgt->level = cap->level; ++ pgt->allowed_pgsz = cap->allowed_pgsz; ++ pgt->table_prot = cap->table_prot; ++ ++ return 0; ++} ++ ++static int __pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys, unsigned long size, ++ int pgsz_mask, u64 prot, pgtable_leaf_ov_fn_t map_leaf, ++ u64 annotation) ++{ ++ struct pkvm_pgtable_map_data data = { ++ .phys = phys, ++ .annotation = annotation, ++ .prot = prot, ++ .pgsz_mask = pgsz_mask ? 
pgt->allowed_pgsz & pgsz_mask : ++ pgt->allowed_pgsz, ++ .map_leaf_override = map_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_map_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ int pgsz_mask, u64 prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ return __pkvm_pgtable_map(pgt, vaddr_start, ALIGN_DOWN(phys_start, PAGE_SIZE), ++ size, pgsz_mask, prot, map_leaf, 0); ++} ++ ++int pkvm_pgtable_unmap(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = INVALID_ADDR, ++ .split_huge_page = true, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_unmap_safe(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = ALIGN_DOWN(phys_start, PAGE_SIZE), ++ .split_huge_page = true, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++int pkvm_pgtable_unmap_nosplit(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf) ++{ ++ struct pkvm_pgtable_unmap_data data = { ++ .phys = INVALID_ADDR, ++ .split_huge_page = false, ++ .unmap_leaf_override = unmap_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_unmap_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ return pgtable_walk(pgt, vaddr_start, size, true, &walker); ++} ++ ++void pkvm_pgtable_lookup(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long *pphys, u64 *pprot, int *plevel) ++{ ++ struct pkvm_pgtable_lookup_data data = { ++ .vaddr = vaddr, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_lookup_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ }; ++ int ret, retry_cnt = 0; ++ ++retry: ++ ret = pgtable_walk(pgt, vaddr, PAGE_SIZE, true, &walker); ++ if ((ret == -EAGAIN) && (retry_cnt++ < 5)) ++ goto retry; ++ ++ if (pphys) ++ *pphys = data.phys; ++ if (pprot) ++ *pprot = data.prot; ++ if (plevel) ++ *plevel = data.level; ++} ++ ++void pkvm_pgtable_destroy(struct pkvm_pgtable *pgt, pgtable_leaf_ov_fn_t free_leaf) ++{ ++ unsigned long size; ++ void *virt_root; ++ struct pkvm_pgtable_ops *pgt_ops; ++ struct pkvm_pgtable_free_data data = { ++ .free_leaf_override = free_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_free_cb, ++ .arg = &data, ++ .flags = PKVM_PGTABLE_WALK_LEAF | PKVM_PGTABLE_WALK_TABLE_POST, ++ }; ++ ++ pgt_ops = pgt->pgt_ops; ++ size = pgt_ops->pgt_level_to_size(pgt->level + 1); ++ ++ pgtable_walk(pgt, 0, size, true, &walker); ++ virt_root = pgt->mm_ops->phys_to_virt(pgt->root_pa); ++ pgt->mm_ops->put_page(virt_root); ++} ++ ++/* ++ * pkvm_pgtable_annotate() - Unmap and annotate pages to track ownership. 
++ * @annotation: The value stored in the invalid pte. ++ * @annotation[2:0] must be 0. ++ */ ++int pkvm_pgtable_annotate(struct pkvm_pgtable *pgt, unsigned long addr, ++ unsigned long size, u64 annotation) ++{ ++ if (pgt->pgt_ops->pgt_entry_present(&annotation)) ++ return -EINVAL; ++ ++ return __pkvm_pgtable_map(pgt, addr, INVALID_ADDR, ++ size, 1 << PG_LEVEL_4K, 0, ++ NULL, annotation); ++} ++ ++static int pgtable_sync_map_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg) ++{ ++ struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; ++ struct pkvm_pgtable_sync_data *data = arg; ++ unsigned long phys; ++ unsigned long size; ++ u64 prot; ++ ++ phys = pgt_ops->pgt_entry_to_phys(ptep); ++ size = pgt_ops->pgt_level_to_size(level); ++ ++ if (!pgt->pgt_ops->pgt_entry_present(ptep)) ++ return pkvm_pgtable_unmap(data->dest_pgt, vaddr, size, NULL); ++ ++ if (data->prot_override) ++ prot = *data->prot_override; ++ else ++ prot = pgt_ops->pgt_entry_to_prot(ptep); ++ ++ return pkvm_pgtable_map(data->dest_pgt, vaddr, phys, ++ size, 0, prot, data->map_leaf_override); ++} ++ ++/* ++ * pkvm_pgtable_sync_map_range() - map the given address range in the destination ++ * pgtable according to the source pgtable, with the same phys address and desired ++ * property bits. ++ * ++ * @src: source pgtable. ++ * @dest: destination pgtable. ++ * @vaddr: virtual start address of the range. ++ * @size: size of the range in bytes. ++ * @prot: desired property bits. Can be NULL if use the same property ++ * bits as the source pgtable ++ * @map_leaf: function to map the leaf entry for destination pgtable. ++ */ ++int pkvm_pgtable_sync_map_range(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ unsigned long vaddr, unsigned long size, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ struct pkvm_pgtable_sync_data data = { ++ .dest_pgt = dest, ++ .prot_override = prot, ++ .map_leaf_override = map_leaf, ++ }; ++ struct pkvm_pgtable_walker walker = { ++ .cb = pgtable_sync_map_cb, ++ .flags = PKVM_PGTABLE_WALK_LEAF, ++ .arg = &data, ++ }; ++ ++ return pgtable_walk(src, vaddr, size, true, &walker); ++} ++ ++/* ++ * pkvm_pgtable_sync_map() - map the destination pgtable according to the source ++ * pgtable, with the same phys address and desired property bits. ++ * ++ * @src: source pgtable. ++ * @dest: destination pgtable. ++ * @prot: desired property bits. Can be NULL if use the same property ++ * bits as the source pgtable ++ * @map_leaf: function to map the leaf entry for destination pgtable. 
++ */ ++int pkvm_pgtable_sync_map(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf) ++{ ++ unsigned long size = src->pgt_ops->pgt_level_to_size(src->level + 1); ++ ++ return pkvm_pgtable_sync_map_range(src, dest, 0, size, prot, map_leaf); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h +new file mode 100644 +index 000000000000..85a2f74c5fe4 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pgtable.h +@@ -0,0 +1,155 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_PGTABLE_H_ ++#define _PKVM_PGTABLE_H_ ++ ++#include ++#include ++ ++#define SUPPRESS_VE BIT(63) ++ ++struct pkvm_mm_ops { ++ void *(*phys_to_virt)(unsigned long phys); ++ unsigned long (*virt_to_phys)(void *vaddr); ++ void *(*zalloc_page)(void); ++ int (*page_count)(void *vaddr); ++ void (*get_page)(void *vaddr); ++ void (*put_page)(void *vaddr); ++ void (*flush_tlb)(struct pkvm_pgtable *pgt, ++ unsigned long vaddr, unsigned long size); ++ void (*flush_cache)(void *vaddr, unsigned int size); ++}; ++ ++struct pkvm_pgtable_ops { ++ bool (*pgt_entry_present)(void *pte); ++ bool (*pgt_entry_mapped)(void *pte); ++ bool (*pgt_entry_huge)(void *pte); ++ void (*pgt_entry_mkhuge)(void *ptep); ++ unsigned long (*pgt_entry_to_phys)(void *pte); ++ u64 (*pgt_entry_to_prot)(void *pte); ++ int (*pgt_entry_to_index)(unsigned long vaddr, int level); ++ u64 (*pgt_level_page_mask)(int level); ++ bool (*pgt_entry_is_leaf)(void *ptep, int level); ++ int (*pgt_level_entry_size)(int level); ++ int (*pgt_level_to_entries)(int level); ++ unsigned long (*pgt_level_to_size)(int level); ++ void (*pgt_set_entry)(void *ptep, u64 val); ++ u64 default_prot; ++}; ++ ++struct pkvm_pgtable { ++ unsigned long root_pa; ++ int level; ++ int allowed_pgsz; ++ u64 table_prot; ++ struct pkvm_mm_ops *mm_ops; ++ struct pkvm_pgtable_ops *pgt_ops; ++}; ++ ++struct pgt_flush_data { ++ bool flushtlb; ++ struct list_head free_list; ++}; ++ ++typedef int (*pgtable_visit_fn_t)(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long vaddr_end, int level, void *ptep, ++ unsigned long flags, struct pgt_flush_data *flush_data, ++ void *const arg); ++ ++typedef int (*pgtable_leaf_ov_fn_t)(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ void *data); ++ ++struct pkvm_pgtable_map_data { ++ unsigned long phys; ++ u64 annotation; ++ u64 prot; ++ int pgsz_mask; ++ ++ /* ++ * extra override helper ops: ++ * - map_leaf_override(): override the final page entry map function ++ * for pkvm_pgtable_map() ++ */ ++ pgtable_leaf_ov_fn_t map_leaf_override; ++}; ++ ++struct pkvm_pgtable_unmap_data { ++ unsigned long phys; ++ ++ /* ++ * extra override helper ops: ++ * - unmap_leaf_override(): override the final page entry map function ++ * for pkvm_pgtable_unmap() ++ */ ++ pgtable_leaf_ov_fn_t unmap_leaf_override; ++ ++ bool split_huge_page; ++}; ++ ++struct pkvm_pgtable_free_data { ++ /* ++ * extra override helper ops: ++ * - free_leaf_override(): override the final page entry free function ++ * for pkvm_pgtable_destroy() ++ */ ++ pgtable_leaf_ov_fn_t free_leaf_override; ++}; ++ ++struct pkvm_pgtable_sync_data { ++ struct pkvm_pgtable *dest_pgt; ++ u64 *prot_override; ++ ++ pgtable_leaf_ov_fn_t map_leaf_override; ++}; ++ ++#define PGTABLE_WALK_DONE 1 ++ ++struct pkvm_pgtable_walker { ++ const pgtable_visit_fn_t cb; ++ void *const arg; ++ unsigned long flags; ++#define 
PKVM_PGTABLE_WALK_TABLE_PRE BIT(0) ++#define PKVM_PGTABLE_WALK_LEAF BIT(1) ++#define PKVM_PGTABLE_WALK_TABLE_POST BIT(2) ++}; ++ ++int pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long size, bool page_aligned, ++ struct pkvm_pgtable_walker *walker); ++int pkvm_pgtable_init(struct pkvm_pgtable *pgt, ++ struct pkvm_mm_ops *mm_ops, ++ struct pkvm_pgtable_ops *pgt_ops, ++ struct pkvm_pgtable_cap *cap, ++ bool alloc_root); ++int pkvm_pgtable_map(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ int pgsz_mask, u64 entry_prot, pgtable_leaf_ov_fn_t map_leaf); ++int pgtable_map_leaf(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ int level, void *ptep, struct pgt_flush_data *flush_data, ++ struct pkvm_pgtable_map_data *data); ++int pkvm_pgtable_unmap(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf); ++int pkvm_pgtable_unmap_safe(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long phys_start, unsigned long size, ++ pgtable_leaf_ov_fn_t unmap_leaf); ++int pkvm_pgtable_unmap_nosplit(struct pkvm_pgtable *pgt, unsigned long vaddr_start, ++ unsigned long size, pgtable_leaf_ov_fn_t unmap_leaf); ++void pkvm_pgtable_lookup(struct pkvm_pgtable *pgt, unsigned long vaddr, ++ unsigned long *pphys, u64 *pprot, int *plevel); ++void pkvm_pgtable_destroy(struct pkvm_pgtable *pgt, pgtable_leaf_ov_fn_t free_leaf); ++int pkvm_pgtable_annotate(struct pkvm_pgtable *pgt, unsigned long addr, ++ unsigned long size, u64 annotation); ++int pkvm_pgtable_sync_map(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf); ++int pkvm_pgtable_sync_map_range(struct pkvm_pgtable *src, struct pkvm_pgtable *dest, ++ unsigned long vaddr, unsigned long size, ++ u64 *prot, pgtable_leaf_ov_fn_t map_leaf); ++ ++static inline void pkvm_pgtable_set_mm_ops(struct pkvm_pgtable *pgt, struct pkvm_mm_ops *mm_ops) ++{ ++ pgt->mm_ops = mm_ops; ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c +new file mode 100644 +index 000000000000..3bf26c75ae98 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.c +@@ -0,0 +1,470 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++ ++#include "pkvm_hyp.h" ++#include "ept.h" ++#include "mem_protect.h" ++#include "lapic.h" ++#include "ptdev.h" ++ ++struct pkvm_hyp *pkvm_hyp; ++ ++#define MAX_SHADOW_VMS (PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM) ++#define HANDLE_OFFSET 1 ++ ++#define to_shadow_vm_handle(vcpu_handle) ((s64)(vcpu_handle) >> SHADOW_VM_HANDLE_SHIFT) ++#define to_shadow_vcpu_idx(vcpu_handle) ((s64)(vcpu_handle) & SHADOW_VCPU_INDEX_MASK) ++ ++static DECLARE_BITMAP(shadow_vms_bitmap, MAX_SHADOW_VMS); ++static pkvm_spinlock_t shadow_vms_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++struct shadow_vm_ref { ++ atomic_t refcount; ++ struct pkvm_shadow_vm *vm; ++}; ++static struct shadow_vm_ref shadow_vms_ref[MAX_SHADOW_VMS]; ++ ++#define SHADOW_VCPU_ARRAY(vm) \ ++ ((struct shadow_vcpu_array *)((void *)(vm) + sizeof(struct pkvm_shadow_vm))) ++ ++#define SHADOW_VCPU_HASH_BITS 10 ++DEFINE_HASHTABLE(shadow_vcpu_table, SHADOW_VCPU_HASH_BITS); ++static pkvm_spinlock_t shadow_vcpu_table_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++static int allocate_shadow_vm_handle(struct pkvm_shadow_vm *vm) ++{ ++ struct shadow_vm_ref *vm_ref; ++ int handle; ++ ++ /* ++ * The shadow_vm_handle is an int so it 
cannot exceed INT_MAX. ++ * Meanwhile shadow_vm_handle will also be used as owner_id in ++ * the page state machine so it also cannot exceed the max ++ * owner_id. ++ */ ++ BUILD_BUG_ON(MAX_SHADOW_VMS > ++ min(INT_MAX, ((1 << hweight_long(PKVM_INVALID_PTE_OWNER_MASK)) - 1))); ++ ++ pkvm_spin_lock(&shadow_vms_lock); ++ ++ handle = find_next_zero_bit(shadow_vms_bitmap, MAX_SHADOW_VMS, ++ HANDLE_OFFSET); ++ if ((u32)handle < MAX_SHADOW_VMS) { ++ __set_bit(handle, shadow_vms_bitmap); ++ vm->shadow_vm_handle = handle; ++ vm_ref = &shadow_vms_ref[handle]; ++ vm_ref->vm = vm; ++ atomic_set(&vm_ref->refcount, 1); ++ } else ++ handle = -ENOMEM; ++ ++ pkvm_spin_unlock(&shadow_vms_lock); ++ ++ return handle; ++} ++ ++static struct pkvm_shadow_vm *free_shadow_vm_handle(int handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ struct pkvm_shadow_vm *vm = NULL; ++ ++ pkvm_spin_lock(&shadow_vms_lock); ++ ++ if ((u32)handle >= MAX_SHADOW_VMS) ++ goto out; ++ ++ vm_ref = &shadow_vms_ref[handle]; ++ if ((atomic_cmpxchg(&vm_ref->refcount, 1, 0) != 1)) { ++ pkvm_err("%s: VM%d is busy, refcount %d\n", ++ __func__, handle, atomic_read(&vm_ref->refcount)); ++ goto out; ++ } ++ ++ vm = vm_ref->vm; ++ ++ vm_ref->vm = NULL; ++ __clear_bit(handle, shadow_vms_bitmap); ++out: ++ pkvm_spin_unlock(&shadow_vms_lock); ++ return vm; ++} ++ ++int __pkvm_init_shadow_vm(struct kvm_vcpu *hvcpu, unsigned long kvm_va, ++ unsigned long shadow_pa, size_t shadow_size) ++{ ++ unsigned long offset = offsetof(struct kvm, arch.vm_type); ++ unsigned long vm_type, bytes = sizeof(unsigned long); ++ struct pkvm_shadow_vm *vm; ++ struct x86_exception e; ++ int shadow_vm_handle; ++ ++ if (!PAGE_ALIGNED(shadow_pa) || ++ !PAGE_ALIGNED(shadow_size) || ++ (shadow_size != PAGE_ALIGN(sizeof(struct pkvm_shadow_vm) ++ + pkvm_shadow_vcpu_array_size()))) ++ return -EINVAL; ++ ++ if (read_gva(hvcpu, kvm_va + offset, &vm_type, bytes, &e) < 0) ++ return -EINVAL; ++ ++ if(__pkvm_host_donate_hyp(shadow_pa, shadow_size)) ++ return -EINVAL; ++ ++ vm = pkvm_phys_to_virt(shadow_pa); ++ ++ memset(vm, 0, shadow_size); ++ pkvm_spinlock_init(&vm->lock); ++ INIT_LIST_HEAD(&vm->ptdev_head); ++ ++ vm->host_kvm_va = kvm_va; ++ vm->shadow_size = shadow_size; ++ vm->vm_type = vm_type; ++ ++ if (pkvm_pgstate_pgt_init(vm)) ++ goto undonate; ++ ++ if (pkvm_shadow_ept_init(&vm->sept_desc)) ++ goto deinit_pgstate_pgt; ++ ++ shadow_vm_handle = allocate_shadow_vm_handle(vm); ++ if (shadow_vm_handle < 0) ++ goto deinit_shadow_ept; ++ ++ return shadow_vm_handle; ++ ++deinit_shadow_ept: ++ pkvm_shadow_ept_deinit(&vm->sept_desc); ++deinit_pgstate_pgt: ++ pkvm_pgstate_pgt_deinit(vm); ++undonate: ++ memset(vm, 0, shadow_size); ++ __pkvm_hyp_donate_host(shadow_pa, shadow_size); ++ return -EINVAL; ++} ++ ++unsigned long __pkvm_teardown_shadow_vm(int shadow_vm_handle) ++{ ++ struct pkvm_shadow_vm *vm = free_shadow_vm_handle(shadow_vm_handle); ++ struct pkvm_ptdev *ptdev, *tmp; ++ unsigned long shadow_size; ++ ++ if (!vm) ++ return 0; ++ ++ pkvm_shadow_ept_deinit(&vm->sept_desc); ++ ++ pkvm_pgstate_pgt_deinit(vm); ++ ++ list_for_each_entry_safe(ptdev, tmp, &vm->ptdev_head, vm_node) ++ pkvm_detach_ptdev(ptdev, vm); ++ ++ shadow_size = vm->shadow_size; ++ memset(vm, 0, shadow_size); ++ ++ WARN_ON(__pkvm_hyp_donate_host(pkvm_virt_to_phys(vm), shadow_size)); ++ ++ return pkvm_virt_to_phys(vm); ++} ++ ++struct pkvm_shadow_vm *get_shadow_vm(int shadow_vm_handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ ++ if ((u32)shadow_vm_handle >= MAX_SHADOW_VMS) ++ return NULL; ++ ++ vm_ref = 
&shadow_vms_ref[shadow_vm_handle]; ++ return atomic_inc_not_zero(&vm_ref->refcount) ? vm_ref->vm : NULL; ++} ++ ++void put_shadow_vm(int shadow_vm_handle) ++{ ++ struct shadow_vm_ref *vm_ref; ++ ++ if ((u32)shadow_vm_handle >= MAX_SHADOW_VMS) ++ return; ++ ++ vm_ref = &shadow_vms_ref[shadow_vm_handle]; ++ WARN_ON(atomic_dec_if_positive(&vm_ref->refcount) <= 0); ++} ++ ++void pkvm_shadow_vm_link_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency) ++{ ++ pkvm_spin_lock(&vm->lock); ++ list_add_tail(node, &vm->ptdev_head); ++ vm->noncoherent_ptdev += !coherency; ++ vm->need_prepopulation = true; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&vm->pgstate_pgt, ++ !vm->noncoherent_ptdev); ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++void pkvm_shadow_vm_unlink_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency) ++{ ++ pkvm_spin_lock(&vm->lock); ++ list_del(node); ++ vm->noncoherent_ptdev -= !coherency; ++ pkvm_shadow_sl_iommu_pgt_update_coherency(&vm->pgstate_pgt, ++ !vm->noncoherent_ptdev); ++ pkvm_spin_unlock(&vm->lock); ++} ++ ++static void add_shadow_vcpu_vmcs12_map(struct shadow_vcpu_state *vcpu) ++{ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_add(shadow_vcpu_table, &vcpu->hnode, vcpu->vmcs12_pa); ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++} ++ ++static void remove_shadow_vcpu_vmcs12_map(struct shadow_vcpu_state *vcpu) ++{ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_del(&vcpu->hnode); ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++} ++ ++s64 find_shadow_vcpu_handle_by_vmcs(unsigned long vmcs12_pa) ++{ ++ struct shadow_vcpu_state *shadow_vcpu; ++ s64 handle = -1; ++ ++ pkvm_spin_lock(&shadow_vcpu_table_lock); ++ hash_for_each_possible(shadow_vcpu_table, shadow_vcpu, hnode, vmcs12_pa) { ++ if (shadow_vcpu->vmcs12_pa == vmcs12_pa) { ++ handle = shadow_vcpu->shadow_vcpu_handle; ++ break; ++ } ++ } ++ pkvm_spin_unlock(&shadow_vcpu_table_lock); ++ ++ return handle; ++} ++ ++struct shadow_vcpu_state *get_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_ref *vcpu_ref; ++ struct shadow_vcpu_state *vcpu; ++ struct pkvm_shadow_vm *vm; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return NULL; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ return NULL; ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ vcpu = atomic_inc_not_zero(&vcpu_ref->refcount) ? vcpu_ref->vcpu : NULL; ++ ++ put_shadow_vm(shadow_vm_handle); ++ return vcpu; ++} ++ ++void put_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_ref *vcpu_ref; ++ struct pkvm_shadow_vm *vm; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ return; ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ WARN_ON(atomic_dec_if_positive(&vcpu_ref->refcount) <= 0); ++ ++ put_shadow_vm(shadow_vm_handle); ++} ++ ++static s64 attach_shadow_vcpu_to_vm(struct pkvm_shadow_vm *vm, ++ struct shadow_vcpu_state *shadow_vcpu) ++{ ++ struct shadow_vcpu_ref *vcpu_ref; ++ u32 vcpu_idx; ++ ++ /* ++ * Shadow_vcpu_handle is a s64 value combined with shadow_vm_handle ++ * and shadow_vcpu index from the array. So the array size cannot be ++ * larger than the shadow_vcpu index mask. 
++ */ ++ BUILD_BUG_ON(KVM_MAX_VCPUS > SHADOW_VCPU_INDEX_MASK); ++ ++ /* ++ * Saving the shadow_vm pointer in shadow_vcpu takes an additional ++ * reference, so that the pointer can be used at runtime without ++ * getting it again. The reference is put when this shadow_vcpu is ++ * detached. ++ */ ++ shadow_vcpu->vm = get_shadow_vm(vm->shadow_vm_handle); ++ if (!shadow_vcpu->vm) ++ return -EINVAL; ++ ++ add_shadow_vcpu_vmcs12_map(shadow_vcpu); ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ if (vm->created_vcpus == KVM_MAX_VCPUS) { ++ pkvm_spin_unlock(&vm->lock); ++ return -EINVAL; ++ } ++ ++ vcpu_idx = vm->created_vcpus; ++ shadow_vcpu->shadow_vcpu_handle = ++ to_shadow_vcpu_handle(vm->shadow_vm_handle, vcpu_idx); ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ vcpu_ref->vcpu = shadow_vcpu; ++ vm->created_vcpus++; ++ atomic_set(&vcpu_ref->refcount, 1); ++ ++ pkvm_spin_unlock(&vm->lock); ++ ++ return shadow_vcpu->shadow_vcpu_handle; ++} ++ ++static struct shadow_vcpu_state * ++detach_shadow_vcpu_from_vm(struct pkvm_shadow_vm *vm, s64 shadow_vcpu_handle) ++{ ++ u32 vcpu_idx = to_shadow_vcpu_idx(shadow_vcpu_handle); ++ struct shadow_vcpu_state *shadow_vcpu = NULL; ++ struct shadow_vcpu_ref *vcpu_ref; ++ ++ if (vcpu_idx >= KVM_MAX_VCPUS) ++ return NULL; ++ ++ pkvm_spin_lock(&vm->lock); ++ ++ vcpu_ref = &SHADOW_VCPU_ARRAY(vm)->ref[vcpu_idx]; ++ if ((atomic_cmpxchg(&vcpu_ref->refcount, 1, 0) != 1)) { ++ pkvm_err("%s: VM%d shadow_vcpu%d is busy, refcount %d\n", ++ __func__, vm->shadow_vm_handle, vcpu_idx, ++ atomic_read(&vcpu_ref->refcount)); ++ } else { ++ shadow_vcpu = vcpu_ref->vcpu; ++ vcpu_ref->vcpu = NULL; ++ } ++ ++ pkvm_spin_unlock(&vm->lock); ++ ++ if (shadow_vcpu) { ++ remove_shadow_vcpu_vmcs12_map(shadow_vcpu); ++ /* ++ * Paired with the get_shadow_vm when saving the shadow_vm pointer ++ * during attaching shadow_vcpu.
++ */ ++ put_shadow_vm(shadow_vcpu->vm->shadow_vm_handle); ++ } ++ ++ return shadow_vcpu; ++} ++ ++s64 __pkvm_init_shadow_vcpu(struct kvm_vcpu *hvcpu, int shadow_vm_handle, ++ unsigned long vcpu_va, unsigned long shadow_pa, ++ size_t shadow_size) ++{ ++ struct pkvm_shadow_vm *vm; ++ struct shadow_vcpu_state *shadow_vcpu; ++ struct x86_exception e; ++ unsigned long vmcs12_va; ++ s64 shadow_vcpu_handle; ++ int ret; ++ ++ if (!PAGE_ALIGNED(shadow_pa) || !PAGE_ALIGNED(shadow_size) || ++ (shadow_size != PAGE_ALIGN(sizeof(struct shadow_vcpu_state))) || ++ (pkvm_hyp->vmcs_config.size > PAGE_SIZE)) ++ return -EINVAL; ++ ++ if (__pkvm_host_donate_hyp(shadow_pa, shadow_size)) ++ return -EINVAL; ++ ++ shadow_vcpu = pkvm_phys_to_virt(shadow_pa); ++ memset(shadow_vcpu, 0, shadow_size); ++ shadow_vcpu->shadow_size = shadow_size; ++ ++ ret = read_gva(hvcpu, vcpu_va, &shadow_vcpu->vmx, sizeof(struct vcpu_vmx), &e); ++ if (ret < 0) ++ goto undonate; ++ ++ vmcs12_va = (unsigned long)shadow_vcpu->vmx.vmcs01.vmcs; ++ if (gva2gpa(hvcpu, vmcs12_va, (gpa_t *)&shadow_vcpu->vmcs12_pa, 0, &e)) ++ goto undonate; ++ ++ vm = get_shadow_vm(shadow_vm_handle); ++ if (!vm) ++ goto undonate; ++ ++ shadow_vcpu_handle = attach_shadow_vcpu_to_vm(vm, shadow_vcpu); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ if (shadow_vcpu_handle < 0) ++ goto undonate; ++ ++ return shadow_vcpu_handle; ++undonate: ++ memset(shadow_vcpu, 0, shadow_size); ++ __pkvm_hyp_donate_host(shadow_pa, shadow_size); ++ return -EINVAL; ++} ++ ++unsigned long __pkvm_teardown_shadow_vcpu(s64 shadow_vcpu_handle) ++{ ++ int shadow_vm_handle = to_shadow_vm_handle(shadow_vcpu_handle); ++ struct shadow_vcpu_state *shadow_vcpu; ++ unsigned long shadow_size; ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ ++ if (!vm) ++ return 0; ++ ++ shadow_vcpu = detach_shadow_vcpu_from_vm(vm, shadow_vcpu_handle); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ if (!shadow_vcpu) ++ return 0; ++ ++ shadow_size = shadow_vcpu->shadow_size; ++ memset(shadow_vcpu, 0, shadow_size); ++ WARN_ON(__pkvm_hyp_donate_host(pkvm_virt_to_phys(shadow_vcpu), ++ shadow_size)); ++ ++ return pkvm_virt_to_phys(shadow_vcpu); ++} ++ ++void pkvm_kick_vcpu(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *hvcpu = to_pkvm_hvcpu(vcpu); ++ struct pkvm_pcpu *pcpu = hvcpu->pcpu; ++ ++ if (kvm_vcpu_exiting_guest_mode(vcpu) != IN_GUEST_MODE) ++ return; ++ ++ pkvm_lapic_send_init(pcpu); ++} ++ ++int pkvm_add_ptdev(int shadow_vm_handle, u16 bdf, u32 pasid) ++{ ++ struct pkvm_shadow_vm *vm = get_shadow_vm(shadow_vm_handle); ++ int ret = 0; ++ ++ if (!vm) ++ return -EINVAL; ++ ++ if (vm->vm_type != KVM_X86_DEFAULT_VM) ++ ret = pkvm_attach_ptdev(bdf, pasid, vm); ++ ++ put_shadow_vm(shadow_vm_handle); ++ ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S +new file mode 100644 +index 000000000000..af81ce58c72f +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm.lds.S +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include ++ ++SECTIONS { ++ PKVM_SECTION(.text) ++ PKVM_SECTION(.rodata) ++ PKVM_SECTION(.data) ++ PKVM_SECTION(.bss) ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h +new file mode 100644 +index 000000000000..5948f1b39953 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_hyp.h +@@ -0,0 +1,187 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef __PKVM_HYP_H ++#define __PKVM_HYP_H ++ ++#include "pkvm_spinlock.h" 
++#include "pgtable.h" ++ ++/* ++ * Descriptor for shadow EPT ++ */ ++struct shadow_ept_desc { ++ /* shadow EPTP value configured by pkvm */ ++ u64 shadow_eptp; ++ ++ /* Save the last guest EPTP value configured by kvm high */ ++ u64 last_guest_eptp; ++ ++ struct pkvm_pgtable sept; ++}; ++ ++/* ++ * Store the Virtualization Exception(#VE) information when a #VE occurs. This ++ * struture definition is based on ++ * sdm Volume 3, 25.5.7.2 Virtualizaiton-Exception Information. ++ */ ++struct pkvm_ve_info { ++ u32 exit_reason; ++ u32 valid; ++ u64 exit_qual; ++ u64 gla; ++ u64 gpa; ++ u16 eptp_index; ++}; ++ ++/* ++ * * A container for the vcpu state that hyp needs to maintain for protected VMs. ++ * */ ++struct shadow_vcpu_state { ++ /* ++ * A unique id to the shadow vcpu, which is combined by ++ * shadow_vm_handle and shadow_vcpu index in the array. ++ * As shadow_vm_handle is in the high end and it is an ++ * int, so define the shadow_vcpu_handle as a s64. ++ */ ++ s64 shadow_vcpu_handle; ++ ++ struct pkvm_shadow_vm *vm; ++ ++ /* The donated size of shadow_vcpu. */ ++ unsigned long shadow_size; ++ ++ struct hlist_node hnode; ++ unsigned long vmcs12_pa; ++ bool vmcs02_inited; ++ ++ struct vcpu_vmx vmx; ++ ++ /* represents for the virtual EPT configured by kvm-high */ ++ struct pkvm_pgtable vept; ++ ++ /* assume vmcs02 is one page */ ++ u8 vmcs02[PAGE_SIZE] __aligned(PAGE_SIZE); ++ u8 cached_vmcs12[VMCS12_SIZE] __aligned(PAGE_SIZE); ++ ++ struct pkvm_ve_info ve_info; ++ ++ /* The last cpu this vmcs02 runs with */ ++ int last_cpu; ++ ++ /* point to the kvm_vcpu associated with this shadow_vcpu */ ++ struct kvm_vcpu *vcpu; ++} __aligned(PAGE_SIZE); ++ ++#define SHADOW_VM_HANDLE_SHIFT 32 ++#define SHADOW_VCPU_INDEX_MASK ((1UL << SHADOW_VM_HANDLE_SHIFT) - 1) ++#define to_shadow_vcpu_handle(vm_handle, vcpu_idx) \ ++ (((s64)(vm_handle) << SHADOW_VM_HANDLE_SHIFT) | \ ++ ((vcpu_idx) & SHADOW_VCPU_INDEX_MASK)) ++ ++/* ++ * Shadow_vcpu_array will be appended to the end of the pkvm_shadow_vm area ++ * implicitly, so that the shadow_vcpu_state pointer cannot be got directly ++ * from the pkvm_shadow_vm, but needs to be done through the interface ++ * get/put_shadow_vcpu. This can prevent the shadow_vcpu_state pointer from being ++ * abused without getting/putting the refcount. ++ */ ++struct shadow_vcpu_array { ++ struct shadow_vcpu_ref { ++ atomic_t refcount; ++ struct shadow_vcpu_state *vcpu; ++ } ref[KVM_MAX_VCPUS]; ++} __aligned(PAGE_SIZE); ++ ++static inline size_t pkvm_shadow_vcpu_array_size(void) ++{ ++ return sizeof(struct shadow_vcpu_array); ++} ++ ++/* ++ * * Holds the relevant data for running a protected vm. ++ * */ ++struct pkvm_shadow_vm { ++ /* A unique id to the shadow structs in the hyp shadow area. */ ++ int shadow_vm_handle; ++ ++ /* Number of vcpus for the vm. */ ++ int created_vcpus; ++ ++ /* The host's kvm va. */ ++ unsigned long host_kvm_va; ++ ++ /* The donated size of shadow_vm. */ ++ unsigned long shadow_size; ++ ++ /* ++ * VM's shadow EPT. All vCPU shares one mapping. ++ * FIXME: a potential security issue if some vCPUs are ++ * in SMM but the others are not. ++ */ ++ struct shadow_ept_desc sept_desc; ++ ++ /* ++ * Page state page table manages the page states, and ++ * works as IOMMU second-level page table for protected ++ * VM with passthrough devices. For the protected VM ++ * without passthrough devices or normal VM, it manages ++ * the page states only. 
++ */ ++ struct pkvm_pgtable pgstate_pgt; ++ /* Indicate if pgstate_pgt needs to be prepopulated */ ++ bool need_prepopulation; ++ /* ++ * Indicate the count of the shadow VM passthrough devices ++ * which are attached to non-coherent IOMMU. ++ */ ++ unsigned long noncoherent_ptdev; ++ ++ /* link the passthrough devices of a protected VM */ ++ struct list_head ptdev_head; ++ ++ /* The vm_type to indicate if this is a protected VM */ ++ unsigned long vm_type; ++ ++ pkvm_spinlock_t lock; ++} __aligned(PAGE_SIZE); ++ ++#define sept_to_shadow_ept_desc(_sept) container_of(_sept, struct shadow_ept_desc, sept) ++ ++#define sept_desc_to_shadow_vm(desc) container_of(desc, struct pkvm_shadow_vm, sept_desc) ++ ++#define sept_to_shadow_vm(_sept) sept_desc_to_shadow_vm(sept_to_shadow_ept_desc(_sept)) ++ ++#define pgstate_pgt_to_shadow_vm(_pgt) container_of(_pgt, struct pkvm_shadow_vm, pgstate_pgt) ++ ++int __pkvm_init_shadow_vm(struct kvm_vcpu *hvcpu, unsigned long kvm_va, ++ unsigned long shadow_pa, size_t shadow_size); ++unsigned long __pkvm_teardown_shadow_vm(int shadow_vm_handle); ++struct pkvm_shadow_vm *get_shadow_vm(int shadow_vm_handle); ++void put_shadow_vm(int shadow_vm_handle); ++void pkvm_shadow_vm_link_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency); ++void pkvm_shadow_vm_unlink_ptdev(struct pkvm_shadow_vm *vm, ++ struct list_head *node, bool coherency); ++s64 __pkvm_init_shadow_vcpu(struct kvm_vcpu *hvcpu, int shadow_vm_handle, ++ unsigned long vcpu_va, unsigned long shadow_pa, ++ size_t shadow_size); ++unsigned long __pkvm_teardown_shadow_vcpu(s64 shadow_vcpu_handle); ++struct shadow_vcpu_state *get_shadow_vcpu(s64 shadow_vcpu_handle); ++void put_shadow_vcpu(s64 shadow_vcpu_handle); ++s64 find_shadow_vcpu_handle_by_vmcs(unsigned long vmcs12_pa); ++void pkvm_kick_vcpu(struct kvm_vcpu *vcpu); ++int pkvm_add_ptdev(int shadow_vm_handle, u16 bdf, u32 pasid); ++ ++#define PKVM_REQ_TLB_FLUSH_HOST_EPT KVM_ARCH_REQ(0) ++#define PKVM_REQ_TLB_FLUSH_SHADOW_EPT KVM_ARCH_REQ(1) ++ ++extern struct pkvm_hyp *pkvm_hyp; ++ ++static inline bool shadow_vcpu_is_protected(struct shadow_vcpu_state *shadow_vcpu) ++{ ++ return shadow_vcpu->vm->vm_type == KVM_X86_PROTECTED_VM; ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h +new file mode 100644 +index 000000000000..85512f010bdb +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/pkvm_nested_vmcs_fields.h +@@ -0,0 +1,191 @@ ++#if !defined(EMULATED_FIELD_RW) && !defined(SHADOW_FIELD_RW) && !defined(SHADOW_FIELD_RO) ++BUILD_BUG_ON(1) ++#endif ++ ++#ifndef EMULATED_FIELD_RW ++#define EMULATED_FIELD_RW(x, y) ++#endif ++#ifndef SHADOW_FIELD_RW ++#define SHADOW_FIELD_RW(x, y) ++#endif ++#ifndef SHADOW_FIELD_RO ++#define SHADOW_FIELD_RO(x, y) ++#endif ++ ++/* ++ * Emulated fields for vmcs02: ++ * ++ * These fields are recorded in cached_vmcs12, and should be emulated to ++ * real value in vmcs02 before vmcs01 active. ++ */ ++/* 16-bits */ ++EMULATED_FIELD_RW(VIRTUAL_PROCESSOR_ID, virtual_processor_id) ++ ++/* 32-bits */ ++EMULATED_FIELD_RW(VM_EXIT_CONTROLS, vm_exit_controls) ++EMULATED_FIELD_RW(VM_ENTRY_CONTROLS, vm_entry_controls) ++EMULATED_FIELD_RW(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control) ++ ++/* 64-bits, what about their HIGH 32 fields? 
*/ ++EMULATED_FIELD_RW(IO_BITMAP_A, io_bitmap_a) ++EMULATED_FIELD_RW(IO_BITMAP_B, io_bitmap_b) ++EMULATED_FIELD_RW(MSR_BITMAP, msr_bitmap) ++EMULATED_FIELD_RW(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr) ++EMULATED_FIELD_RW(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr) ++EMULATED_FIELD_RW(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr) ++EMULATED_FIELD_RW(XSS_EXIT_BITMAP, xss_exit_bitmap) ++EMULATED_FIELD_RW(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr) ++EMULATED_FIELD_RW(PML_ADDRESS, pml_address) ++EMULATED_FIELD_RW(VM_FUNCTION_CONTROL, vm_function_control) ++EMULATED_FIELD_RW(EPT_POINTER, ept_pointer) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP0, eoi_exit_bitmap0) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP1, eoi_exit_bitmap1) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP2, eoi_exit_bitmap2) ++EMULATED_FIELD_RW(EOI_EXIT_BITMAP3, eoi_exit_bitmap3) ++EMULATED_FIELD_RW(EPTP_LIST_ADDRESS, eptp_list_address) ++EMULATED_FIELD_RW(VMREAD_BITMAP, vmread_bitmap) ++EMULATED_FIELD_RW(VMWRITE_BITMAP, vmwrite_bitmap) ++EMULATED_FIELD_RW(ENCLS_EXITING_BITMAP, encls_exiting_bitmap) ++EMULATED_FIELD_RW(VMCS_LINK_POINTER, vmcs_link_pointer) ++ ++/* ++ * Shadow fields for vmcs02: ++ * ++ * These fields are HW shadowing in vmcs02, we try to shadow all non-host ++ * fields except emulated ones. ++ * Host state fields need to be recorded in cached_vmcs12 and restored to vmcs01's ++ * guest state when returning to L1 host, so please ensure __NO__ host fields below. ++ */ ++ ++/* 16-bits */ ++SHADOW_FIELD_RW(POSTED_INTR_NV, posted_intr_nv) ++SHADOW_FIELD_RW(GUEST_ES_SELECTOR, guest_es_selector) ++SHADOW_FIELD_RW(GUEST_CS_SELECTOR, guest_cs_selector) ++SHADOW_FIELD_RW(GUEST_SS_SELECTOR, guest_ss_selector) ++SHADOW_FIELD_RW(GUEST_DS_SELECTOR, guest_ds_selector) ++SHADOW_FIELD_RW(GUEST_FS_SELECTOR, guest_fs_selector) ++SHADOW_FIELD_RW(GUEST_GS_SELECTOR, guest_gs_selector) ++SHADOW_FIELD_RW(GUEST_LDTR_SELECTOR, guest_ldtr_selector) ++SHADOW_FIELD_RW(GUEST_TR_SELECTOR, guest_tr_selector) ++SHADOW_FIELD_RW(GUEST_TR_SELECTOR, guest_tr_selector) ++SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status) ++SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index) ++ ++/* 32-bits */ ++SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control) ++SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) ++SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) ++SHADOW_FIELD_RW(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask) ++SHADOW_FIELD_RW(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match) ++SHADOW_FIELD_RW(CR3_TARGET_COUNT, cr3_target_count) ++SHADOW_FIELD_RW(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count) ++SHADOW_FIELD_RW(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count) ++SHADOW_FIELD_RW(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count) ++SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) ++SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) ++SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len) ++SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold) ++SHADOW_FIELD_RW(GUEST_ES_LIMIT, guest_es_limit) ++SHADOW_FIELD_RW(GUEST_CS_LIMIT, guest_cs_limit) ++SHADOW_FIELD_RW(GUEST_SS_LIMIT, guest_ss_limit) ++SHADOW_FIELD_RW(GUEST_DS_LIMIT, guest_ds_limit) ++SHADOW_FIELD_RW(GUEST_FS_LIMIT, guest_fs_limit) ++SHADOW_FIELD_RW(GUEST_GS_LIMIT, guest_gs_limit) ++SHADOW_FIELD_RW(GUEST_LDTR_LIMIT, guest_ldtr_limit) ++SHADOW_FIELD_RW(GUEST_TR_LIMIT, guest_tr_limit) ++SHADOW_FIELD_RW(GUEST_GDTR_LIMIT, guest_gdtr_limit) ++SHADOW_FIELD_RW(GUEST_IDTR_LIMIT, guest_idtr_limit) 
++SHADOW_FIELD_RW(GUEST_ES_AR_BYTES, guest_es_ar_bytes) ++SHADOW_FIELD_RW(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) ++SHADOW_FIELD_RW(GUEST_DS_AR_BYTES, guest_ds_ar_bytes) ++SHADOW_FIELD_RW(GUEST_FS_AR_BYTES, guest_fs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_GS_AR_BYTES, guest_gs_ar_bytes) ++SHADOW_FIELD_RW(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes) ++SHADOW_FIELD_RW(GUEST_TR_AR_BYTES, guest_tr_ar_bytes) ++SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info) ++SHADOW_FIELD_RW(GUEST_ACTIVITY_STATE, guest_activity_state) ++SHADOW_FIELD_RW(GUEST_SYSENTER_CS, guest_sysenter_cs) ++SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value) ++SHADOW_FIELD_RW(PLE_GAP, ple_gap) ++SHADOW_FIELD_RW(PLE_WINDOW, ple_window) ++ ++/* Natural width */ ++SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask) ++SHADOW_FIELD_RW(CR4_GUEST_HOST_MASK, cr4_guest_host_mask) ++SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow) ++SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow) ++SHADOW_FIELD_RW(GUEST_CR0, guest_cr0) ++SHADOW_FIELD_RW(GUEST_CR3, guest_cr3) ++SHADOW_FIELD_RW(GUEST_CR4, guest_cr4) ++SHADOW_FIELD_RW(GUEST_ES_BASE, guest_es_base) ++SHADOW_FIELD_RW(GUEST_CS_BASE, guest_cs_base) ++SHADOW_FIELD_RW(GUEST_SS_BASE, guest_ss_base) ++SHADOW_FIELD_RW(GUEST_DS_BASE, guest_ds_base) ++SHADOW_FIELD_RW(GUEST_FS_BASE, guest_fs_base) ++SHADOW_FIELD_RW(GUEST_GS_BASE, guest_gs_base) ++SHADOW_FIELD_RW(GUEST_LDTR_BASE, guest_ldtr_base) ++SHADOW_FIELD_RW(GUEST_TR_BASE, guest_tr_base) ++SHADOW_FIELD_RW(GUEST_GDTR_BASE, guest_gdtr_base) ++SHADOW_FIELD_RW(GUEST_IDTR_BASE, guest_idtr_base) ++SHADOW_FIELD_RW(GUEST_DR7, guest_dr7) ++SHADOW_FIELD_RW(GUEST_RSP, guest_rsp) ++SHADOW_FIELD_RW(GUEST_RIP, guest_rip) ++SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags) ++SHADOW_FIELD_RW(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions) ++SHADOW_FIELD_RW(GUEST_SYSENTER_ESP, guest_sysenter_esp) ++SHADOW_FIELD_RW(GUEST_SYSENTER_EIP, guest_sysenter_eip) ++ ++/* 64-bit */ ++SHADOW_FIELD_RW(TSC_OFFSET, tsc_offset) ++SHADOW_FIELD_RW(TSC_OFFSET_HIGH, tsc_offset) ++SHADOW_FIELD_RW(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr) ++SHADOW_FIELD_RW(VIRTUAL_APIC_PAGE_ADDR_HIGH, virtual_apic_page_addr) ++SHADOW_FIELD_RW(APIC_ACCESS_ADDR, apic_access_addr) ++SHADOW_FIELD_RW(APIC_ACCESS_ADDR_HIGH, apic_access_addr) ++SHADOW_FIELD_RW(TSC_MULTIPLIER, tsc_multiplier) ++SHADOW_FIELD_RW(TSC_MULTIPLIER_HIGH, tsc_multiplier) ++SHADOW_FIELD_RW(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl) ++SHADOW_FIELD_RW(GUEST_IA32_DEBUGCTL_HIGH, guest_ia32_debugctl) ++SHADOW_FIELD_RW(GUEST_IA32_PAT, guest_ia32_pat) ++SHADOW_FIELD_RW(GUEST_IA32_PAT_HIGH, guest_ia32_pat) ++SHADOW_FIELD_RW(GUEST_IA32_EFER, guest_ia32_efer) ++SHADOW_FIELD_RW(GUEST_IA32_EFER_HIGH, guest_ia32_efer) ++SHADOW_FIELD_RW(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl) ++SHADOW_FIELD_RW(GUEST_IA32_PERF_GLOBAL_CTRL_HIGH, guest_ia32_perf_global_ctrl) ++SHADOW_FIELD_RW(GUEST_PDPTR0, guest_pdptr0) ++SHADOW_FIELD_RW(GUEST_PDPTR0_HIGH, guest_pdptr0) ++SHADOW_FIELD_RW(GUEST_PDPTR1, guest_pdptr1) ++SHADOW_FIELD_RW(GUEST_PDPTR1_HIGH, guest_pdptr1) ++SHADOW_FIELD_RW(GUEST_PDPTR2, guest_pdptr2) ++SHADOW_FIELD_RW(GUEST_PDPTR2_HIGH, guest_pdptr2) ++SHADOW_FIELD_RW(GUEST_PDPTR3, guest_pdptr3) ++SHADOW_FIELD_RW(GUEST_PDPTR3_HIGH, guest_pdptr3) ++SHADOW_FIELD_RW(GUEST_BNDCFGS, guest_bndcfgs) ++SHADOW_FIELD_RW(GUEST_BNDCFGS_HIGH, guest_bndcfgs) ++ ++/* 32-bits */ ++SHADOW_FIELD_RO(VM_INSTRUCTION_ERROR, vm_instruction_error) 
++SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason) ++SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info) ++SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) ++SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field) ++SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code) ++SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len) ++SHADOW_FIELD_RO(VMX_INSTRUCTION_INFO, vmx_instruction_info) ++ ++/* Natural width */ ++SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification) ++SHADOW_FIELD_RO(EXIT_IO_RCX, exit_io_rcx) ++SHADOW_FIELD_RO(EXIT_IO_RSI, exit_io_rsi) ++SHADOW_FIELD_RO(EXIT_IO_RDI, exit_io_rdi) ++SHADOW_FIELD_RO(EXIT_IO_RIP, exit_io_rip) ++SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address) ++ ++/* 64-bit */ ++SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address) ++SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address) ++ ++#undef EMULATED_FIELD_RW ++#undef SHADOW_FIELD_RW ++#undef SHADOW_FIELD_RO +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c +new file mode 100644 +index 000000000000..409fd0af75e9 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.c +@@ -0,0 +1,213 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright(c) 2022 Intel Corporation. */ ++ ++#include ++#include ++#include ++#include "pkvm_hyp.h" ++#include "iommu.h" ++#include "ptdev.h" ++#include "iommu_spgt.h" ++#include "bug.h" ++#include "pci.h" ++ ++#define MAX_PTDEV_NUM (PKVM_MAX_PDEV_NUM + PKVM_MAX_PASID_PDEV_NUM) ++static DEFINE_HASHTABLE(ptdev_hasht, 8); ++static DECLARE_BITMAP(ptdevs_bitmap, MAX_PTDEV_NUM); ++static struct pkvm_ptdev pkvm_ptdev[MAX_PTDEV_NUM]; ++static pkvm_spinlock_t ptdev_lock = { __ARCH_PKVM_SPINLOCK_UNLOCKED }; ++ ++struct pkvm_ptdev *pkvm_alloc_ptdev(u16 bdf, u32 pasid, bool coherency) ++{ ++ struct pkvm_ptdev *ptdev = NULL; ++ unsigned long index; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ index = find_next_zero_bit(ptdevs_bitmap, MAX_PTDEV_NUM, 0); ++ if (index < MAX_PTDEV_NUM) { ++ __set_bit(index, ptdevs_bitmap); ++ ptdev = &pkvm_ptdev[index]; ++ ptdev->bdf = bdf; ++ ptdev->pasid = pasid; ++ ptdev->iommu_coherency = coherency; ++ ptdev->index = index; ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ INIT_LIST_HEAD(&ptdev->iommu_node); ++ INIT_LIST_HEAD(&ptdev->vm_node); ++ atomic_set(&ptdev->refcount, 1); ++ pkvm_spinlock_init(&ptdev->lock); ++ hash_add(ptdev_hasht, &ptdev->hnode, bdf); ++ } ++ ++ pkvm_spin_unlock(&ptdev_lock); ++ ++ return ptdev; ++} ++ ++struct pkvm_ptdev *pkvm_get_ptdev(u16 bdf, u32 pasid) ++{ ++ struct pkvm_ptdev *ptdev = NULL, *tmp; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ hash_for_each_possible(ptdev_hasht, tmp, hnode, bdf) { ++ if (match_ptdev(tmp, bdf, pasid)) { ++ ptdev = atomic_inc_not_zero(&tmp->refcount) ? 
tmp : NULL; ++ if (ptdev) ++ break; ++ } ++ } ++ ++ pkvm_spin_unlock(&ptdev_lock); ++ return ptdev; ++} ++ ++void pkvm_put_ptdev(struct pkvm_ptdev *ptdev) ++{ ++ if (!atomic_dec_and_test(&ptdev->refcount)) ++ return; ++ ++ pkvm_spin_lock(&ptdev_lock); ++ ++ hlist_del(&ptdev->hnode); ++ ++ __clear_bit(ptdev->index, ptdevs_bitmap); ++ ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept) ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ++ memset(ptdev, 0, sizeof(struct pkvm_ptdev)); ++ ++ pkvm_spin_unlock(&ptdev_lock); ++} ++ ++void pkvm_setup_ptdev_vpgt(struct pkvm_ptdev *ptdev, unsigned long root_gpa, ++ struct pkvm_mm_ops *mm_ops, struct pkvm_pgtable_ops *paging_ops, ++ struct pkvm_pgtable_cap *cap, bool shadowed) ++{ ++ pkvm_spin_lock(&ptdev->lock); ++ ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept && ++ (!shadowed || root_gpa != ptdev->vpgt.root_pa) && ++ !ptdev_attached_to_vm(ptdev)) { ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ } ++ ++ if (!root_gpa || root_gpa == INVALID_ADDR || !mm_ops || !paging_ops || !cap) { ++ memset(&ptdev->vpgt, 0, sizeof(struct pkvm_pgtable)); ++ goto out; ++ } ++ ++ ptdev->vpgt.root_pa = root_gpa; ++ PKVM_ASSERT(pkvm_pgtable_init(&ptdev->vpgt, mm_ops, paging_ops, cap, false) == 0); ++ ++ if (shadowed && ptdev->pgt == pkvm_hyp->host_vm.ept) { ++ ptdev->pgt = pkvm_get_host_iommu_spgt(root_gpa, ptdev->iommu_coherency); ++ PKVM_ASSERT(ptdev->pgt); ++ } ++out: ++ pkvm_spin_unlock(&ptdev->lock); ++} ++ ++void pkvm_setup_ptdev_did(struct pkvm_ptdev *ptdev, u16 did) ++{ ++ ptdev->did = did; ++} ++ ++static void pkvm_ptdev_cache_bar(struct pkvm_ptdev *ptdev) ++{ ++ u32 offset; ++ int i; ++ ++ for (i = 0; i < 6; i++) { ++ offset = 0x10 + 4 * i; ++ ptdev->bars[i] = pkvm_pci_cfg_space_read(ptdev->bdf, offset, 4); ++ } ++} ++ ++/* ++ * pkvm_detach_ptdev() - detach a ptdev from the shadow VM it is attached. ++ * Basically it reverts what pkvm_attach_ptdev() does. ++ * ++ * @ptdev: The target ptdev. ++ * @vm: The shadow VM which will be attached to. ++ */ ++void pkvm_detach_ptdev(struct pkvm_ptdev *ptdev, struct pkvm_shadow_vm *vm) ++{ ++ /* Reset what the attach API has set */ ++ pkvm_spin_lock(&ptdev->lock); ++ ptdev->shadow_vm_handle = 0; ++ ptdev->pgt = pkvm_hyp->host_vm.ept; ++ pkvm_spin_unlock(&ptdev->lock); ++ ++ pkvm_shadow_vm_unlink_ptdev(vm, &ptdev->vm_node, ++ ptdev->iommu_coherency); ++ pkvm_iommu_sync(ptdev->bdf, ptdev->pasid); ++ ++ pkvm_put_ptdev(ptdev); ++} ++ ++/* ++ * pkvm_attach_ptdev() - attach a ptdev to a shadow VM so it will be isolated ++ * from the primary VM. ++ * ++ * @bdf: The bdf of this ptdev. ++ * @pasid: The pasid of this ptdev. ++ * @vm: The shadow VM which will be attached to. ++ * ++ * FIXME: ++ * The passthrough devices attached to the protected VM is relying on KVM ++ * high to send vmcall so that pKVM can know which device should be isolated. ++ * But if KVM high has created a passthrough device for a protected VM without ++ * using this vmcall to notify pKVM, pKVM should still be able to isolate this ++ * passthrough device. To guarantee this, either needs pKVM to know the ++ * passthrough devices information to isolate them independently or needs ++ * protected VM to check with pKVM about its passthrough device info through ++ * some vmcall. Currently neither way is available. 
++ */ ++int pkvm_attach_ptdev(u16 bdf, u32 pasid, struct pkvm_shadow_vm *vm) ++{ ++ struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid); ++ ++ if (!ptdev) { ++ ptdev = pkvm_alloc_ptdev(bdf, pasid, ++ pkvm_iommu_coherency(bdf, pasid)); ++ if (!ptdev) ++ return -ENODEV; ++ } ++ ++ pkvm_spin_lock(&ptdev->lock); ++ ++ if (cmpxchg(&ptdev->shadow_vm_handle, 0, vm->shadow_vm_handle) != 0) { ++ pkvm_err("%s: ptdev with bdf 0x%x pasid 0x%x is already attached\n", ++ __func__, bdf, pasid); ++ pkvm_spin_unlock(&ptdev->lock); ++ pkvm_put_ptdev(ptdev); ++ return -ENODEV; ++ } ++ ++ pkvm_ptdev_cache_bar(ptdev); ++ ++ PKVM_ASSERT(ptdev->pgt != &vm->pgstate_pgt); ++ if (ptdev->pgt != pkvm_hyp->host_vm.ept) ++ pkvm_put_host_iommu_spgt(ptdev->pgt, ptdev->iommu_coherency); ++ ++ /* ++ * Reset pgt of this ptdev to VM's pgstate_pgt so need to update ++ * IOMMU page table accordingly. ++ */ ++ ptdev->pgt = &vm->pgstate_pgt; ++ ++ pkvm_spin_unlock(&ptdev->lock); ++ ++ pkvm_shadow_vm_link_ptdev(vm, &ptdev->vm_node, ++ ptdev->iommu_coherency); ++ if (pkvm_iommu_sync(ptdev->bdf, ptdev->pasid)) { ++ pkvm_detach_ptdev(ptdev, vm); ++ return -ENODEV; ++ } ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h +new file mode 100644 +index 000000000000..bfefcf7346c1 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/ptdev.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright(c) 2022 Intel Corporation. */ ++ ++#ifndef _PKVM_PTDEV_H_ ++#define _PKVM_PTDEV_H_ ++ ++#include "pkvm_hyp.h" ++#include "pgtable.h" ++ ++struct pkvm_ptdev { ++ atomic_t refcount; ++ struct hlist_node hnode; ++ u16 did; ++ u16 bdf; ++ u32 pasid; ++ unsigned long index; ++ struct list_head iommu_node; ++ bool iommu_coherency; ++ /* cached value of BARs when attach to shadow vm */ ++ u32 bars[6]; ++ ++ /* Represents the page table maintained by primary VM */ ++ struct pkvm_pgtable vpgt; ++ /* Represents the page table maintained by pKVM */ ++ struct pkvm_pgtable *pgt; ++ ++ pkvm_spinlock_t lock; ++ ++ int shadow_vm_handle; ++ struct list_head vm_node; ++}; ++ ++struct pkvm_ptdev *pkvm_alloc_ptdev(u16 bdf, u32 pasid, bool coherency); ++struct pkvm_ptdev *pkvm_get_ptdev(u16 bdf, u32 pasid); ++void pkvm_put_ptdev(struct pkvm_ptdev *ptdev); ++void pkvm_setup_ptdev_vpgt(struct pkvm_ptdev *ptdev, unsigned long root_gpa, ++ struct pkvm_mm_ops *mm_ops, struct pkvm_pgtable_ops *paging_ops, ++ struct pkvm_pgtable_cap *cap, bool shadowed); ++void pkvm_setup_ptdev_did(struct pkvm_ptdev *ptdev, u16 did); ++void pkvm_detach_ptdev(struct pkvm_ptdev *ptdev, struct pkvm_shadow_vm *vm); ++int pkvm_attach_ptdev(u16 bdf, u32 pasid, struct pkvm_shadow_vm *vm); ++ ++static inline bool match_ptdev(struct pkvm_ptdev *ptdev, u16 bdf, u32 pasid) ++{ ++ return ptdev && (ptdev->bdf == bdf) && (ptdev->pasid == pasid); ++} ++ ++static inline bool ptdev_attached_to_vm(struct pkvm_ptdev *ptdev) ++{ ++ /* Attached ptdev has non-zero shadow_vm_handle */ ++ return cmpxchg(&ptdev->shadow_vm_handle, 0, 0) != 0; ++} ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/trace.c b/arch/x86/kvm/vmx/pkvm/hyp/trace.c +new file mode 100644 +index 000000000000..c4ef27e4d1c1 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/trace.c +@@ -0,0 +1,117 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++ ++struct vmexit_perf { ++ struct perf_data l1data; ++ struct perf_data l2data; ++ struct perf_data *cur; ++ bool on; ++ bool start; ++ int cpu; ++ 
pkvm_spinlock_t lock; ++}; ++static struct vmexit_perf hvcpu_perf[CONFIG_NR_CPUS]; ++ ++static inline unsigned long long pkvm_rdtsc_ordered(void) ++{ ++ DECLARE_ARGS(val, low, high); ++ ++ asm volatile("lfence;rdtsc" : EAX_EDX_RET(val, low, high)); ++ ++ return EAX_EDX_VAL(val, low, high); ++} ++ ++void trace_vmexit_start(struct kvm_vcpu *vcpu, bool nested_vmexit) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ ++ if (!perf->on) ++ return; ++ ++ perf->start = true; ++ perf->cpu = cpu; ++ if (nested_vmexit) ++ perf->cur = &perf->l2data; ++ else ++ perf->cur = &perf->l1data; ++ ++ pkvm_spin_lock(&perf->lock); ++ perf->cur->tsc = pkvm_rdtsc_ordered(); ++ pkvm_spin_unlock(&perf->lock); ++} ++ ++void trace_vmexit_end(struct kvm_vcpu *vcpu, u32 index) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ struct perf_data *perf_data = perf->cur; ++ unsigned long long cycles; ++ ++ if (!perf->on || !perf->start || !perf_data) ++ return; ++ ++ pkvm_spin_lock(&perf->lock); ++ cycles = pkvm_rdtsc_ordered() - perf_data->tsc; ++ perf_data->data.cycles[index] += cycles; ++ perf_data->data.total_cycles += cycles; ++ perf_data->data.total_count++; ++ perf_data->data.reasons[index]++; ++ pkvm_spin_unlock(&perf->lock); ++} ++ ++void pkvm_handle_set_vmexit_trace(struct kvm_vcpu *vcpu, bool en) ++{ ++ int cpu = vcpu->cpu; ++ struct vmexit_perf *perf = &hvcpu_perf[cpu]; ++ ++ if (en && !perf->on) { ++ perf->on = true; ++ pkvm_dbg("%s: CPU%d enable vmexit_trace\n", __func__, cpu); ++ memset(&perf->l1data, 0, sizeof(struct perf_data)); ++ memset(&perf->l2data, 0, sizeof(struct perf_data)); ++ return; ++ } ++ ++ if (!en && perf->on) { ++ perf->on = false; ++ perf->start = false; ++ pkvm_dbg("%s: CPU%d disable vmexit_trace\n", __func__, cpu); ++ return; ++ } ++} ++ ++void pkvm_handle_dump_vmexit_trace(unsigned long pa, unsigned long size) ++{ ++ void *out = pkvm_phys_to_virt(pa); ++ struct pkvm_host_vcpu *p; ++ struct vmexit_perf *perf; ++ int cpu, index; ++ ++ for (index = 0; index < CONFIG_NR_CPUS; index++) { ++ p = pkvm_hyp->host_vm.host_vcpus[index]; ++ if (!p) ++ continue; ++ ++ cpu = p->vmx.vcpu.cpu; ++ perf = &hvcpu_perf[cpu]; ++ ++ pkvm_spin_lock(&perf->lock); ++ if (size >= sizeof(struct vmexit_perf_dump)) { ++ struct vmexit_perf_dump *dump = out; ++ ++ memcpy(&dump->l1data, &perf->l1data, sizeof(struct perf_data)); ++ memcpy(&dump->l2data, &perf->l2data, sizeof(struct perf_data)); ++ dump->cpu = perf->cpu; ++ out += sizeof(struct vmexit_perf_dump); ++ size -= sizeof(struct vmexit_perf_dump); ++ } ++ pkvm_spin_unlock(&perf->lock); ++ } ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/trace.h b/arch/x86/kvm/vmx/pkvm/hyp/trace.h +new file mode 100644 +index 000000000000..970d2e770844 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/trace.h +@@ -0,0 +1,15 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _HYP_TRACE_H_ ++#define _HYP_TRACE_H_ ++ ++#include ++ ++void trace_vmexit_start(struct kvm_vcpu *vcpu, bool nested_vmexit); ++void trace_vmexit_end(struct kvm_vcpu *vcpu, u32 index); ++void pkvm_handle_set_vmexit_trace(struct kvm_vcpu *vcpu, bool en); ++void pkvm_handle_dump_vmexit_trace(unsigned long pa, unsigned long size); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +new file mode 100644 +index 000000000000..77324f75424b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +@@ -0,0 +1,360 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * 
Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "trace.h" ++#include "vmexit.h" ++ ++#include "pkvm_hyp.h" ++#include "vmsr.h" ++#include "nested.h" ++#include "ept.h" ++#include "iommu.h" ++#include "lapic.h" ++#include "io_emulate.h" ++#include "debug.h" ++ ++#define CR0 0 ++#define CR3 3 ++#define CR4 4 ++ ++#define MOV_TO_CR 0 ++ ++extern int __pkvm_init_finalise(struct kvm_vcpu *vcpu, ++ phys_addr_t phys, unsigned long size); ++ ++static void skip_emulated_instruction(void) ++{ ++ unsigned long rip; ++ ++ rip = vmcs_readl(GUEST_RIP); ++ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); ++ vmcs_writel(GUEST_RIP, rip); ++} ++ ++static void handle_cpuid(struct kvm_vcpu *vcpu) ++{ ++ u32 eax, ebx, ecx, edx; ++ ++ eax = vcpu->arch.regs[VCPU_REGS_RAX]; ++ ecx = vcpu->arch.regs[VCPU_REGS_RCX]; ++ native_cpuid(&eax, &ebx, &ecx, &edx); ++ vcpu->arch.regs[VCPU_REGS_RAX] = eax; ++ vcpu->arch.regs[VCPU_REGS_RBX] = ebx; ++ vcpu->arch.regs[VCPU_REGS_RCX] = ecx; ++ vcpu->arch.regs[VCPU_REGS_RDX] = edx; ++} ++ ++static void handle_cr(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long exit_qual, val; ++ int cr; ++ int type; ++ int reg; ++ unsigned long old_value; ++ ++ exit_qual = vmx->exit_qualification; ++ cr = exit_qual & 15; ++ type = (exit_qual >> 4) & 3; ++ reg = (exit_qual >> 8) & 15; ++ ++ switch (type) { ++ case MOV_TO_CR: ++ switch (cr) { ++ case CR0: ++ old_value = vmcs_readl(GUEST_CR0); ++ val = vcpu->arch.regs[reg]; ++ break; ++ case CR4: ++ old_value = vmcs_readl(GUEST_CR4); ++ val = vcpu->arch.regs[reg]; ++ /* ++ * VMXE bit is owned by host, others are owned by guest ++ * So only when guest is trying to modify VMXE bit it ++ * can cause vmexit and get here. ++ */ ++ vmcs_writel(CR4_READ_SHADOW, val); ++ break; ++ default: ++ break; ++ } ++ break; ++ default: ++ break; ++ } ++} ++ ++static unsigned long handle_vmcall(struct kvm_vcpu *vcpu) ++{ ++ u64 nr, a0, a1, a2, a3; ++ unsigned long ret = 0; ++ ++ nr = vcpu->arch.regs[VCPU_REGS_RAX]; ++ a0 = vcpu->arch.regs[VCPU_REGS_RBX]; ++ a1 = vcpu->arch.regs[VCPU_REGS_RCX]; ++ a2 = vcpu->arch.regs[VCPU_REGS_RDX]; ++ a3 = vcpu->arch.regs[VCPU_REGS_RSI]; ++ ++ switch (nr) { ++ case PKVM_HC_SET_VMEXIT_TRACE: ++ pkvm_handle_set_vmexit_trace(vcpu, a0); ++ break; ++ case PKVM_HC_DUMP_VMEXIT_TRACE: ++ pkvm_handle_dump_vmexit_trace(a0, a1); ++ break; ++ case PKVM_HC_INIT_FINALISE: ++ __pkvm_init_finalise(vcpu, a0, a1); ++ break; ++ case PKVM_HC_INIT_SHADOW_VM: ++ ret = __pkvm_init_shadow_vm(vcpu, a0, a1, a2); ++ break; ++ case PKVM_HC_INIT_SHADOW_VCPU: ++ ret = __pkvm_init_shadow_vcpu(vcpu, a0, a1, a2, a3); ++ break; ++ case PKVM_HC_TEARDOWN_SHADOW_VM: ++ ret = __pkvm_teardown_shadow_vm(a0); ++ break; ++ case PKVM_HC_TEARDOWN_SHADOW_VCPU: ++ ret = __pkvm_teardown_shadow_vcpu(a0); ++ break; ++ case PKVM_HC_MMIO_ACCESS: ++ ret = pkvm_access_iommu(a0, a1, a2, a3); ++ break; ++ case PKVM_HC_ACTIVATE_IOMMU: ++ ret = pkvm_activate_iommu(); ++ break; ++ case PKVM_HC_TLB_REMOTE_FLUSH_RANGE: ++ nested_invalidate_shadow_ept(a0, a1, a2); ++ break; ++ case PKVM_HC_SET_MMIO_VE: ++ pkvm_shadow_clear_suppress_ve(vcpu, a0); ++ break; ++ case PKVM_HC_ADD_PTDEV: ++ ret = pkvm_add_ptdev(a0, a1, a2); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++static void handle_xsetbv(struct kvm_vcpu *vcpu) ++{ ++ u32 eax = (u32)(vcpu->arch.regs[VCPU_REGS_RAX] & -1u); ++ u32 edx = (u32)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u); ++ u32 ecx = (u32)(vcpu->arch.regs[VCPU_REGS_RCX] & -1u); ++ ++ asm 
volatile(".byte 0x0f,0x01,0xd1" ++ : : "a" (eax), "d" (edx), "c" (ecx)); ++} ++ ++static void handle_irq_window(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 cpu_based_exec_ctrl = exec_controls_get(vmx); ++ ++ exec_controls_set(vmx, cpu_based_exec_ctrl & ~CPU_BASED_INTR_WINDOW_EXITING); ++ pkvm_dbg("%s: CPU%d clear irq_window_exiting\n", __func__, vcpu->cpu); ++} ++ ++static void handle_nmi_window(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ u32 cpu_based_exec_ctrl = exec_controls_get(vmx); ++ ++ exec_controls_set(vmx, cpu_based_exec_ctrl & ~CPU_BASED_NMI_WINDOW_EXITING); ++ pkvm_dbg("%s: CPU%d clear nmi_window_exiting\n", __func__, vcpu->cpu); ++} ++ ++static void handle_pending_events(struct kvm_vcpu *vcpu) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = to_pkvm_hvcpu(vcpu); ++ ++ if (!is_guest_mode(vcpu) && pkvm_host_vcpu->pending_nmi) { ++ /* Inject if NMI is not blocked */ ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, ++ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); ++ pkvm_host_vcpu->pending_nmi = false; ++ } ++ ++ if (kvm_check_request(PKVM_REQ_TLB_FLUSH_HOST_EPT, vcpu)) ++ pkvm_flush_host_ept(); ++ if (kvm_check_request(PKVM_REQ_TLB_FLUSH_SHADOW_EPT, vcpu)) ++ nested_flush_shadow_ept(vcpu); ++} ++ ++static inline void set_vcpu_mode(struct kvm_vcpu *vcpu, int mode) ++{ ++ vcpu->mode = mode; ++ /* ++ * Make sure vcpu->mode is set before checking/handling the pending ++ * requests. Pairs with kvm_vcpu_exiting_guest_mode(). ++ */ ++ smp_wmb(); ++} ++ ++/* we take use of kvm_vcpu structure, but not used all the fields */ ++int pkvm_main(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ int launch = 1; ++ ++ vcpu->mode = IN_GUEST_MODE; ++ ++ do { ++ bool skip_instruction = false, guest_exit = false; ++ ++ if (__pkvm_vmx_vcpu_run(vcpu->arch.regs, launch)) { ++ pkvm_err("%s: CPU%d run_vcpu failed with error 0x%x\n", ++ __func__, vcpu->cpu, vmcs_read32(VM_INSTRUCTION_ERROR)); ++ return -EINVAL; ++ } ++ ++ vcpu->arch.cr2 = native_read_cr2(); ++ ++ trace_vmexit_start(vcpu, is_guest_mode(vcpu) ? true : false); ++ ++ set_vcpu_mode(vcpu, OUTSIDE_GUEST_MODE); ++ ++ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); ++ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); ++ ++ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); ++ vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); ++ ++ if (is_guest_mode(vcpu)) { ++ guest_exit = true; ++ nested_vmexit(vcpu, &skip_instruction); ++ } else { ++ switch (vmx->exit_reason.full) { ++ case EXIT_REASON_INIT_SIGNAL: ++ /* ++ * INIT is used as kick when making a request. ++ * So just break the vmexits and go to pending ++ * events handling. 
++ */ ++ break; ++ case EXIT_REASON_CPUID: ++ handle_cpuid(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_CR_ACCESS: ++ pkvm_dbg("CPU%d vmexit_reason: CR_ACCESS.\n", vcpu->cpu); ++ handle_cr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_MSR_READ: ++ pkvm_dbg("CPU%d vmexit_reason: MSR_READ 0x%lx\n", ++ vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]); ++ handle_read_msr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_MSR_WRITE: ++ pkvm_dbg("CPU%d vmexit_reason: MSR_WRITE 0x%lx\n", ++ vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]); ++ handle_write_msr(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMLAUNCH: ++ handle_vmlaunch(vcpu); ++ break; ++ case EXIT_REASON_VMRESUME: ++ handle_vmresume(vcpu); ++ break; ++ case EXIT_REASON_VMON: ++ pkvm_dbg("CPU%d vmexit reason: VMXON.\n", vcpu->cpu); ++ handle_vmxon(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMOFF: ++ pkvm_dbg("CPU%d vmexit reason: VMXOFF.\n", vcpu->cpu); ++ handle_vmxoff(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMPTRLD: ++ pkvm_dbg("CPU%d vmexit reason: VMPTRLD.\n", vcpu->cpu); ++ handle_vmptrld(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMCLEAR: ++ pkvm_dbg("CPU%d vmexit reason: VMCLEAR.\n", vcpu->cpu); ++ handle_vmclear(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMREAD: ++ pkvm_dbg("CPU%d vmexit reason: VMREAD.\n", vcpu->cpu); ++ handle_vmread(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMWRITE: ++ pkvm_dbg("CPU%d vmexit reason: VMWRITE.\n", vcpu->cpu); ++ handle_vmwrite(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_XSETBV: ++ handle_xsetbv(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_VMCALL: ++ vcpu->arch.regs[VCPU_REGS_RAX] = handle_vmcall(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_INTERRUPT_WINDOW: ++ handle_irq_window(vcpu); ++ break; ++ case EXIT_REASON_NMI_WINDOW: ++ handle_nmi_window(vcpu); ++ break; ++ case EXIT_REASON_INVEPT: ++ handle_invept(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_INVVPID: ++ handle_invvpid(vcpu); ++ skip_instruction = true; ++ break; ++ case EXIT_REASON_EPT_VIOLATION: ++ if (handle_host_ept_violation(vcpu, &skip_instruction)) ++ pkvm_err("pkvm: handle host ept violation failed"); ++ break; ++ case EXIT_REASON_IO_INSTRUCTION: ++ if (handle_host_pio(vcpu)) ++ pkvm_err("pkvm: handle host port I/O access failed."); ++ skip_instruction = true; ++ break; ++ default: ++ pkvm_dbg("CPU%d: Unsupported vmexit reason 0x%x.\n", vcpu->cpu, vmx->exit_reason.full); ++ skip_instruction = true; ++ break; ++ } ++ } ++ ++ if (skip_instruction) ++ skip_emulated_instruction(); ++handle_events: ++ handle_pending_events(vcpu); ++ ++ set_vcpu_mode(vcpu, IN_GUEST_MODE); ++ ++ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)) ++ goto handle_events; ++ ++ /* ++ * L2 VMExit -> L1 VMEntry and L1 VMExit -> L1 VMEntry: vmresume. ++ * L2 VMExit -> L2 VMEntry: vmresume ++ * L1 VMExit -> L2 VMEntry: vmlaunch, as vmcs02 is cleared every time ++ */ ++ launch = !is_guest_mode(vcpu) ? 0 : (guest_exit ?
0 : 1); ++ ++ native_write_cr2(vcpu->arch.cr2); ++ trace_vmexit_end(vcpu, vmx->exit_reason.basic); ++ } while (1); ++ ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h +new file mode 100644 +index 000000000000..95a27c2ac112 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h +@@ -0,0 +1,11 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_VMEXIT_H_ ++#define _PKVM_VMEXIT_H_ ++ ++int __pkvm_vmx_vcpu_run(unsigned long *regs, int launch); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c +new file mode 100644 +index 000000000000..10a035aee7ec +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.c +@@ -0,0 +1,120 @@ ++/* ++ * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 ++ * Copyright (C) 2018-2022 Intel Corporation ++ */ ++ ++#include ++#include "cpu.h" ++#include "nested.h" ++#include "lapic.h" ++#include "debug.h" ++ ++#define INTERCEPT_DISABLE (0U) ++#define INTERCEPT_READ (1U << 0U) ++#define INTERCEPT_WRITE (1U << 1U) ++#define INTERCEPT_READ_WRITE (INTERCEPT_READ | INTERCEPT_WRITE) ++ ++static unsigned int emulated_ro_guest_msrs[] = { ++ LIST_OF_VMX_MSRS, ++}; ++ ++static unsigned int emulated_wo_guest_msrs[] = { ++ MSR_IA32_APICBASE, ++ (APIC_BASE_MSR + (APIC_ID >> 4)), ++}; ++ ++static void enable_msr_interception(u8 *bitmap, unsigned int msr_arg, unsigned int mode) ++{ ++ unsigned int read_offset = 0U; ++ unsigned int write_offset = 2048U; ++ unsigned int msr = msr_arg; ++ u8 msr_bit; ++ unsigned int msr_index; ++ ++ if ((msr <= 0x1FFFU) || ((msr >= 0xc0000000U) && (msr <= 0xc0001fffU))) { ++ if ((msr & 0xc0000000U) != 0U) { ++ read_offset = read_offset + 1024U; ++ write_offset = write_offset + 1024U; ++ } ++ ++ msr &= 0x1FFFU; ++ msr_bit = (u8)(1U << (msr & 0x7U)); ++ msr_index = msr >> 3U; ++ ++ if ((mode & INTERCEPT_READ) == INTERCEPT_READ) { ++ bitmap[read_offset + msr_index] |= msr_bit; ++ } else { ++ bitmap[read_offset + msr_index] &= ~msr_bit; ++ } ++ ++ if ((mode & INTERCEPT_WRITE) == INTERCEPT_WRITE) { ++ bitmap[write_offset + msr_index] |= msr_bit; ++ } else { ++ bitmap[write_offset + msr_index] &= ~msr_bit; ++ } ++ } else { ++ pkvm_err("%s, Invalid MSR: 0x%x", __func__, msr); ++ } ++} ++ ++int handle_read_msr(struct kvm_vcpu *vcpu) ++{ ++ unsigned long msr = vcpu->arch.regs[VCPU_REGS_RCX]; ++ int ret = 0; ++ u32 low = 0, high = 0; ++ u64 val; ++ ++ /* For non-supported MSRs, return low=high=0 by default */ ++ if (is_vmx_msr(msr)) { ++ ret = read_vmx_msr(vcpu, msr, &val); ++ if (!ret) { ++ low = (u32)val; ++ high = (u32)(val >> 32); ++ } ++ } ++ ++ pkvm_dbg("%s: CPU%d Value of msr 0x%lx: low=0x%x, high=0x%x\n", __func__, vcpu->cpu, msr, low, high); ++ ++ vcpu->arch.regs[VCPU_REGS_RAX] = low; ++ vcpu->arch.regs[VCPU_REGS_RDX] = high; ++ ++ return ret; ++} ++ ++int handle_write_msr(struct kvm_vcpu *vcpu) ++{ ++ unsigned long msr = vcpu->arch.regs[VCPU_REGS_RCX]; ++ u32 low, high; ++ u64 val; ++ int ret = 0; ++ ++ low = vcpu->arch.regs[VCPU_REGS_RAX]; ++ high = vcpu->arch.regs[VCPU_REGS_RDX]; ++ val = low | ((u64)high << 32); ++ ++ switch (msr) { ++ case MSR_IA32_APICBASE: ++ pkvm_apic_base_msr_write(vcpu, val); ++ break; ++ case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: ++ ret = pkvm_x2apic_msr_write(vcpu, msr, val); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++void init_msr_emulation(struct vcpu_vmx *vmx) ++{ ++ int i; ++ u8 *bitmap = (u8 *)vmx->loaded_vmcs->msr_bitmap; ++ ++ for (i = 0; i < ARRAY_SIZE(emulated_ro_guest_msrs); i++) { ++ enable_msr_interception(bitmap, emulated_ro_guest_msrs[i], INTERCEPT_READ); ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(emulated_wo_guest_msrs); i++) ++ enable_msr_interception(bitmap, emulated_wo_guest_msrs[i], INTERCEPT_WRITE); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h +new file mode 100644 +index 000000000000..2a8f947fb17a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmsr.h +@@ -0,0 +1,11 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_VMSR_H_ ++#define _PKVM_VMSR_H_ ++ ++int handle_read_msr(struct kvm_vcpu *vcpu); ++int handle_write_msr(struct kvm_vcpu *vcpu); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx.c b/arch/x86/kvm/vmx/pkvm/hyp/vmx.c +new file mode 100644 +index 000000000000..4ad38578d0e7 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx.c +@@ -0,0 +1,79 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ */ ++ ++#include ++#include "cpu.h" ++ ++void init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu) ++{ ++ unsigned long a; ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ u32 high, low; ++ struct desc_ptr dt; ++ u16 selector; ++#endif ++ ++ vmcs_writel(HOST_CR0, native_read_cr0() & ~X86_CR0_TS); ++ vmcs_writel(HOST_CR3, pcpu->cr3); ++ vmcs_writel(HOST_CR4, native_read_cr4()); ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ savesegment(cs, selector); ++ vmcs_write16(HOST_CS_SELECTOR, selector); ++ savesegment(ss, selector); ++ vmcs_write16(HOST_SS_SELECTOR, selector); ++ savesegment(ds, selector); ++ vmcs_write16(HOST_DS_SELECTOR, selector); ++ savesegment(es, selector); ++ vmcs_write16(HOST_ES_SELECTOR, selector); ++ savesegment(fs, selector); ++ vmcs_write16(HOST_FS_SELECTOR, selector); ++ pkvm_rdmsrl(MSR_FS_BASE, a); ++ vmcs_writel(HOST_FS_BASE, a); ++ savesegment(gs, selector); ++ vmcs_write16(HOST_GS_SELECTOR, selector); ++ pkvm_rdmsrl(MSR_GS_BASE, a); ++ vmcs_writel(HOST_GS_BASE, a); ++ ++ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_writel(HOST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); ++ ++ native_store_gdt(&dt); ++ vmcs_writel(HOST_GDTR_BASE, dt.address); ++ vmcs_writel(HOST_IDTR_BASE, (unsigned long)(&pcpu->idt_page)); ++ ++ pkvm_rdmsr(MSR_IA32_SYSENTER_CS, low, high); ++ vmcs_write32(HOST_IA32_SYSENTER_CS, low); ++ ++ pkvm_rdmsrl(MSR_IA32_SYSENTER_ESP, a); ++ vmcs_writel(HOST_IA32_SYSENTER_ESP, a); ++ ++ pkvm_rdmsrl(MSR_IA32_SYSENTER_EIP, a); ++ vmcs_writel(HOST_IA32_SYSENTER_EIP, a); ++#else ++ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); ++ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); ++ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); ++ vmcs_write16(HOST_ES_SELECTOR, 0); ++ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_write16(HOST_FS_SELECTOR, 0); ++ vmcs_write16(HOST_GS_SELECTOR, 0); ++ vmcs_writel(HOST_FS_BASE, 0); ++ vmcs_writel(HOST_GS_BASE, 0); ++ ++ vmcs_writel(HOST_TR_BASE, (unsigned long)&pcpu->tss); ++ vmcs_writel(HOST_GDTR_BASE, (unsigned long)(&pcpu->gdt_page)); ++ vmcs_writel(HOST_IDTR_BASE, (unsigned long)(&pcpu->idt_page)); ++ ++ vmcs_write16(HOST_GS_SELECTOR, __KERNEL_DS); ++ vmcs_writel(HOST_GS_BASE, cpu); ++#endif ++ ++ /* MSR area */ ++ pkvm_rdmsrl(MSR_EFER, a); ++ vmcs_write64(HOST_IA32_EFER, a); 
++ ++ pkvm_rdmsrl(MSR_IA32_CR_PAT, a); ++ vmcs_write64(HOST_IA32_PAT, a); ++} +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx.h b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h +new file mode 100644 +index 000000000000..40da630f3c95 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h +@@ -0,0 +1,63 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef __PKVM_VMX_H ++#define __PKVM_VMX_H ++ ++#include "pkvm_hyp.h" ++ ++static inline u64 pkvm_construct_eptp(unsigned long root_hpa, int level) ++{ ++ u64 eptp = 0; ++ ++ if ((level == 4) && vmx_ept_has_4levels()) ++ eptp = VMX_EPTP_PWL_4; ++ else if ((level == 5) && vmx_ept_has_5levels()) ++ eptp = VMX_EPTP_PWL_5; ++ ++ if (vmx_ept_has_mt_wb()) ++ eptp |= VMX_EPTP_MT_WB; ++ ++ eptp |= (root_hpa & PAGE_MASK); ++ ++ return eptp; ++} ++ ++static inline void vmcs_load_track(struct vcpu_vmx *vmx, struct vmcs *vmcs) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = vmx_to_pkvm_hvcpu(vmx); ++ ++ pkvm_host_vcpu->current_vmcs = vmcs; ++ barrier(); ++ vmcs_load(vmcs); ++} ++ ++static inline void vmcs_clear_track(struct vcpu_vmx *vmx, struct vmcs *vmcs) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu = vmx_to_pkvm_hvcpu(vmx); ++ ++ /* vmcs_clear might clear none current vmcs */ ++ if (pkvm_host_vcpu->current_vmcs == vmcs) ++ pkvm_host_vcpu->current_vmcs = NULL; ++ ++ barrier(); ++ vmcs_clear(vmcs); ++} ++ ++static inline void flush_ept(u64 eptp) ++{ ++ if (vmx_has_invept_context()) ++ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); ++ else ++ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); ++} ++ ++static inline u8 pkvm_virt_addr_bits(void) ++{ ++ return (vmcs_readl(GUEST_CR4) & X86_CR4_LA57) ? 57 : 48; ++} ++ ++void init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu); ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S +new file mode 100644 +index 000000000000..ad6ae1257a7a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S +@@ -0,0 +1,186 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define WORD_SIZE (BITS_PER_LONG / 8) ++ ++#define VCPU_RAX (__VCPU_REGS_RAX * WORD_SIZE) ++#define VCPU_RCX (__VCPU_REGS_RCX * WORD_SIZE) ++#define VCPU_RDX (__VCPU_REGS_RDX * WORD_SIZE) ++#define VCPU_RBX (__VCPU_REGS_RBX * WORD_SIZE) ++#define VCPU_RBP (__VCPU_REGS_RBP * WORD_SIZE) ++#define VCPU_RSI (__VCPU_REGS_RSI * WORD_SIZE) ++#define VCPU_RDI (__VCPU_REGS_RDI * WORD_SIZE) ++ ++#define VCPU_R8 (__VCPU_REGS_R8 * WORD_SIZE) ++#define VCPU_R9 (__VCPU_REGS_R9 * WORD_SIZE) ++#define VCPU_R10 (__VCPU_REGS_R10 * WORD_SIZE) ++#define VCPU_R11 (__VCPU_REGS_R11 * WORD_SIZE) ++#define VCPU_R12 (__VCPU_REGS_R12 * WORD_SIZE) ++#define VCPU_R13 (__VCPU_REGS_R13 * WORD_SIZE) ++#define VCPU_R14 (__VCPU_REGS_R14 * WORD_SIZE) ++#define VCPU_R15 (__VCPU_REGS_R15 * WORD_SIZE) ++ ++#define HOST_RSP 0x6C14 ++ ++/** ++ * __vmenter - VM-Enter the current loaded VMCS ++ * ++ * Returns: ++ * %RFLAGS.CF is set on VM-Fail Invalid ++ * %RFLAGS.ZF is set on VM-Fail Valid ++ * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit ++ * ++ * Note that VMRESUME/VMLAUNCH fall-through and return directly if ++ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump ++ * to vmx_vmexit. 
++ */ ++SYM_FUNC_START_LOCAL(__vmenter) ++ /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ ++ je 2f ++ ++1: vmresume ++ ANNOTATE_UNRET_SAFE ++ ret ++ ++2: vmlaunch ++ ANNOTATE_UNRET_SAFE ++ ret ++SYM_FUNC_END(__vmenter) ++ ++/** ++ * __pkvm_vmx_vmexit - Handle a VMX VM-Exit ++ * ++ * Returns: ++ * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit ++ * ++ * This is __vmenter's partner in crime. On a VM-Exit, control will jump ++ * here after hardware loads the host's state, i.e. this is the destination ++ * referred to by VMCS.HOST_RIP. ++ */ ++SYM_FUNC_START(__pkvm_vmx_vmexit) ++ ANNOTATE_UNRET_SAFE ++ ret ++SYM_FUNC_END(__pkvm_vmx_vmexit) ++ ++/** ++ * __pkvm_vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode ++ * @regs: unsigned long * (to guest registers) ++ * @launched: %true if the VMCS has been launched ++ * ++ * Returns: ++ * 0 on VM-Exit, 1 on VM-Fail ++ */ ++SYM_FUNC_START(__pkvm_vmx_vcpu_run) ++ push %_ASM_BP ++ mov %_ASM_SP, %_ASM_BP ++ push %r15 ++ push %r14 ++ push %r13 ++ push %r12 ++ ++ push %_ASM_BX ++ ++ push %_ASM_ARG1 ++ ++ /* record host RSP (0x6C14) */ ++ mov $HOST_RSP, %_ASM_BX ++ lea -WORD_SIZE(%_ASM_SP), %_ASM_CX ++ vmwrite %_ASM_CX, %_ASM_BX ++ ++ mov %_ASM_ARG1, %_ASM_CX ++ cmp $1, %_ASM_ARG2 ++ ++ mov VCPU_RAX(%_ASM_CX), %_ASM_AX ++ mov VCPU_RBX(%_ASM_CX), %_ASM_BX ++ mov VCPU_RDX(%_ASM_CX), %_ASM_DX ++ mov VCPU_RSI(%_ASM_CX), %_ASM_SI ++ mov VCPU_RDI(%_ASM_CX), %_ASM_DI ++ mov VCPU_RBP(%_ASM_CX), %_ASM_BP ++ mov VCPU_R8(%_ASM_CX), %r8 ++ mov VCPU_R9(%_ASM_CX), %r9 ++ mov VCPU_R10(%_ASM_CX), %r10 ++ mov VCPU_R11(%_ASM_CX), %r11 ++ mov VCPU_R12(%_ASM_CX), %r12 ++ mov VCPU_R13(%_ASM_CX), %r13 ++ mov VCPU_R14(%_ASM_CX), %r14 ++ mov VCPU_R15(%_ASM_CX), %r15 ++ ++ mov VCPU_RCX(%_ASM_CX), %_ASM_CX ++ ++ call __vmenter ++ ++ /* Jump on VM-Fail. */ ++ jbe 2f ++ ++ push %_ASM_CX ++ mov WORD_SIZE(%_ASM_SP), %_ASM_CX ++ ++ mov %_ASM_AX, VCPU_RAX(%_ASM_CX) ++ mov %_ASM_BX, VCPU_RBX(%_ASM_CX) ++ mov %_ASM_DX, VCPU_RDX(%_ASM_CX) ++ mov %_ASM_SI, VCPU_RSI(%_ASM_CX) ++ mov %_ASM_DI, VCPU_RDI(%_ASM_CX) ++ mov %_ASM_BP, VCPU_RBP(%_ASM_CX) ++ mov %r8 , VCPU_R8(%_ASM_CX) ++ mov %r9 , VCPU_R9(%_ASM_CX) ++ mov %r10, VCPU_R10(%_ASM_CX) ++ mov %r11, VCPU_R11(%_ASM_CX) ++ mov %r12, VCPU_R12(%_ASM_CX) ++ mov %r13, VCPU_R13(%_ASM_CX) ++ mov %r14, VCPU_R14(%_ASM_CX) ++ mov %r15, VCPU_R15(%_ASM_CX) ++ ++ pop VCPU_RCX(%_ASM_CX) ++ ++ /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ ++ xor %eax, %eax ++ ++ /* ++ * Clear all general purpose registers except RSP and RAX to prevent ++ * speculative use of the guest's values, even those that are reloaded ++ * via the stack. In theory, an L1 cache miss when restoring registers ++ * could lead to speculative execution with the guest's values. ++ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially ++ * free. RSP and RAX are exempt as RSP is restored by hardware during ++ * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. ++ */ ++1: xor %ebx, %ebx ++ xor %ecx, %ecx ++ xor %edx, %edx ++ xor %esi, %esi ++ xor %edi, %edi ++ xor %ebp, %ebp ++ xor %r8d, %r8d ++ xor %r9d, %r9d ++ xor %r10d, %r10d ++ xor %r11d, %r11d ++ xor %r12d, %r12d ++ xor %r13d, %r13d ++ xor %r14d, %r14d ++ xor %r15d, %r15d ++ ++ /* "POP" @regs. */ ++ add $WORD_SIZE, %_ASM_SP ++ pop %_ASM_BX ++ ++ pop %r12 ++ pop %r13 ++ pop %r14 ++ pop %r15 ++ ++ pop %_ASM_BP ++ ANNOTATE_UNRET_SAFE ++ ret ++ /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. 
*/ ++2: mov $1, %eax ++ jmp 1b ++SYM_FUNC_END(__pkvm_vmx_vcpu_run) +diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h b/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h +new file mode 100644 +index 000000000000..b99067af3a6b +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx_ops.h +@@ -0,0 +1,173 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_VMX_OPS_H_ ++#define _PKVM_VMX_OPS_H_ ++ ++#include "memory.h" ++#include "debug.h" ++ ++static __always_inline unsigned long __vmcs_readl(unsigned long field) ++{ ++ unsigned long value; ++ ++#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT ++ asm_volatile_goto("1: vmread %[field], %[output]\n\t" ++ "jna %l[do_fail]\n\t" ++ : [output] "=r" (value) ++ : [field] "r" (field) ++ : "cc" ++ : do_fail); ++ ++ return value; ++ ++do_fail: ++ pkvm_err("pkvm: vmread failed: field=%lx\n", field); ++ return 0; ++#else ++ asm volatile ("vmread %%rdx, %%rax " ++ : "=a" (value) ++ : "d"(field) ++ : "cc"); ++ return value; ++#endif ++} ++ ++static __always_inline u16 vmcs_read16(unsigned long field) ++{ ++ vmcs_check16(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline u32 vmcs_read32(unsigned long field) ++{ ++ vmcs_check32(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline u64 vmcs_read64(unsigned long field) ++{ ++ vmcs_check64(field); ++ return __vmcs_readl(field); ++} ++ ++static __always_inline unsigned long vmcs_readl(unsigned long field) ++{ ++ vmcs_checkl(field); ++ return __vmcs_readl(field); ++} ++ ++static inline void pkvm_vmwrite_error(unsigned long field, unsigned long value) ++{ ++ pkvm_err("pkvm: vmwrite failed: field=%lx val=%lx err=%d\n", ++ field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); ++} ++ ++static inline void pkvm_vmclear_error(struct vmcs *vmcs, u64 phys_addr) ++{ ++ pkvm_err("pkvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); ++} ++ ++static inline void pkvm_vmptrld_error(struct vmcs *vmcs, u64 phys_addr) ++{ ++ pkvm_err("pkvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); ++} ++ ++static inline void pkvm_invvpid_error(unsigned long ext, u16 vpid, gva_t gva) ++{ ++ pkvm_err("pkvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ++ ext, vpid, gva); ++} ++ ++static inline void pkvm_invept_error(unsigned long ext, u64 eptp, gpa_t gpa) ++{ ++ pkvm_err("pkvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ++ ext, eptp, gpa); ++} ++ ++#define vmx_asm1(insn, op1, error_args...) \ ++do { \ ++ asm_volatile_goto(__stringify(insn) " %0\n\t" \ ++ ".byte 0x2e\n\t" /* branch not taken hint */ \ ++ "jna %l[error]\n\t" \ ++ : : op1 : "cc" : error); \ ++ return; \ ++error: \ ++ pkvm_##insn##_error(error_args); \ ++ return; \ ++} while (0) ++ ++#define vmx_asm2(insn, op1, op2, error_args...) 
\ ++do { \ ++ asm_volatile_goto(__stringify(insn) " %1, %0\n\t" \ ++ ".byte 0x2e\n\t" /* branch not taken hint */ \ ++ "jna %l[error]\n\t" \ ++ : : op1, op2 : "cc" : error); \ ++ return; \ ++error: \ ++ pkvm_##insn##_error(error_args); \ ++ return; \ ++} while (0) ++ ++static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) ++{ ++ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); ++} ++ ++static __always_inline void vmcs_write16(unsigned long field, u16 value) ++{ ++ vmcs_check16(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_write32(unsigned long field, u32 value) ++{ ++ vmcs_check32(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_write64(unsigned long field, u64 value) ++{ ++ vmcs_check64(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_writel(unsigned long field, unsigned long value) ++{ ++ vmcs_checkl(field); ++ __vmcs_writel(field, value); ++} ++ ++static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) ++{ ++ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, ++ "vmcs_clear_bits does not support 64-bit fields"); ++ __vmcs_writel(field, __vmcs_readl(field) & ~mask); ++} ++ ++static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) ++{ ++ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, ++ "vmcs_set_bits does not support 64-bit fields"); ++ __vmcs_writel(field, __vmcs_readl(field) | mask); ++} ++ ++static inline void vmcs_clear(struct vmcs *vmcs) ++{ ++ u64 phys_addr = __pkvm_pa(vmcs); ++ ++ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); ++} ++ ++static inline void vmcs_load(struct vmcs *vmcs) ++{ ++ u64 phys_addr = __pkvm_pa(vmcs); ++ ++ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); ++} ++ ++void vpid_sync_context(int vpid); ++void vpid_sync_vcpu_addr(int vpid, gva_t addr); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/capabilities.h b/arch/x86/kvm/vmx/pkvm/include/capabilities.h +new file mode 100644 +index 000000000000..4f5c6695f509 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/capabilities.h +@@ -0,0 +1,95 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_CAPS_H_ ++#define _PKVM_CAPS_H_ ++ ++#ifdef __PKVM_HYP__ ++#define PKVM_HYP pkvm_hyp ++#else ++#define PKVM_HYP pkvm_sym(pkvm_hyp) ++#endif ++ ++static inline bool vmx_has_vmwrite_any_field(void) ++{ ++ return !!(PKVM_HYP->vmcs_config.nested.misc_low & ++ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS); ++} ++ ++static inline bool vmx_ept_capability_check(u32 bit) ++{ ++ struct vmx_capability *vmx_cap = &PKVM_HYP->vmx_cap; ++ ++ return vmx_cap->ept & bit; ++} ++ ++static inline bool vmx_has_invept(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_INVEPT_BIT); ++} ++ ++static inline bool vmx_has_ept_execute_only(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_EXECUTE_ONLY_BIT); ++} ++ ++static inline bool vmx_ept_has_4levels(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_PAGE_WALK_4_BIT); ++} ++ ++static inline bool vmx_ept_has_5levels(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_PAGE_WALK_5_BIT); ++} ++ ++static inline bool vmx_ept_has_mt_wb(void) ++{ ++ return vmx_ept_capability_check(VMX_EPTP_WB_BIT); ++} ++ ++static inline bool vmx_ept_has_2m_page(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_2MB_PAGE_BIT); ++} ++ ++static inline bool vmx_ept_has_1g_page(void) ++{ ++ return 
vmx_ept_capability_check(VMX_EPT_1GB_PAGE_BIT); ++} ++ ++static inline bool vmx_has_invept_context(void) ++{ ++ return vmx_ept_capability_check(VMX_EPT_EXTENT_CONTEXT_BIT); ++} ++ ++static inline bool vmx_vpid_capability_check(u32 bit) ++{ ++ struct vmx_capability *vmx_cap = &PKVM_HYP->vmx_cap; ++ ++ return vmx_cap->vpid & bit; ++} ++ ++static inline bool vmx_has_invvpid(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_INVVPID_BIT); ++} ++ ++static inline bool vmx_has_invvpid_individual_addr(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT); ++} ++ ++static inline bool vmx_has_invvpid_single(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT); ++} ++ ++static inline bool vmx_has_invvpid_global(void) ++{ ++ return vmx_vpid_capability_check(VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT); ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/pkvm.h b/arch/x86/kvm/vmx/pkvm/include/pkvm.h +new file mode 100644 +index 000000000000..9ba0678fc492 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/pkvm.h +@@ -0,0 +1,155 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#ifndef _PKVM_H_ ++#define _PKVM_H_ ++ ++#include ++#include ++ ++#define STACK_SIZE SZ_16K ++#define PKVM_MAX_IOMMU_NUM 32 ++#define PKVM_MAX_PASID_PDEV_NUM 32 ++#define PKVM_MAX_PDEV_NUM 512 ++#define PKVM_MAX_NORMAL_VM_NUM 8 ++#define PKVM_MAX_SECURE_VM_NUM 2 ++ ++struct pkvm_pgtable_cap { ++ int level; ++ int allowed_pgsz; ++ u64 table_prot; ++}; ++ ++struct idt_page { ++ gate_desc idt[IDT_ENTRIES]; ++} __aligned(PAGE_SIZE); ++ ++struct pkvm_pcpu { ++ u8 stack[STACK_SIZE] __aligned(16); ++ unsigned long cr3; ++ struct gdt_page gdt_page; ++ struct idt_page idt_page; ++ struct tss_struct tss; ++ void *lapic; ++}; ++ ++struct pkvm_host_vcpu { ++ struct vcpu_vmx vmx; ++ struct pkvm_pcpu *pcpu; ++ struct vmcs *vmxarea; ++ struct vmcs *current_vmcs; ++ ++ void *current_shadow_vcpu; ++ ++ bool pending_nmi; ++ u8 *io_bitmap; ++}; ++ ++struct pkvm_pci_info { ++ struct pci_mmcfg_region *mmcfg_table; ++ int mmcfg_table_size; ++}; ++ ++struct pkvm_host_vm { ++ struct pkvm_host_vcpu *host_vcpus[CONFIG_NR_CPUS]; ++ struct pkvm_pgtable *ept; ++ struct pkvm_pgtable *ept_notlbflush; ++ struct pkvm_pci_info pci_info; ++ u8 *io_bitmap; ++}; ++ ++struct pkvm_iommu_info { ++ u64 reg_phys; ++ u64 reg_size; ++}; ++ ++struct pkvm_hyp { ++ int num_cpus; ++ ++ struct vmx_capability vmx_cap; ++ struct vmcs_config vmcs_config; ++ ++ struct pkvm_pgtable_cap mmu_cap; ++ struct pkvm_pgtable_cap ept_cap; ++ ++ struct pkvm_pgtable *mmu; ++ ++ struct pkvm_pcpu *pcpus[CONFIG_NR_CPUS]; ++ ++ struct pkvm_host_vm host_vm; ++ ++ struct pkvm_iommu_info iommu_infos[PKVM_MAX_IOMMU_NUM]; ++ ++ /* ++ * IOMMU works in nested translation mode with sharing ++ * the EPT as second-level page table. So the page table ++ * level and large page size should be supported by both ++ * EPT and IOMMU. 
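++	 *
++	 * The negotiated results are stored in ept_iommu_pgt_level and
++	 * ept_iommu_pgsz_mask below by check_and_init_iommu().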
++ */ ++ int ept_iommu_pgt_level; ++ int ept_iommu_pgsz_mask; ++ ++ bool iommu_coherent; ++}; ++ ++static inline struct pkvm_host_vcpu *vmx_to_pkvm_hvcpu(struct vcpu_vmx *vmx) ++{ ++ return container_of(vmx, struct pkvm_host_vcpu, vmx); ++} ++ ++static inline struct pkvm_host_vcpu *to_pkvm_hvcpu(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ ++ return vmx_to_pkvm_hvcpu(vmx); ++} ++ ++struct pkvm_section { ++ unsigned long type; ++#define PKVM_RESERVED_MEMORY 0UL ++#define PKVM_CODE_DATA_SECTIONS 1UL ++#define KERNEL_DATA_SECTIONS 2UL ++ unsigned long addr; ++ unsigned long size; ++ u64 prot; ++}; ++ ++#define PKVM_PAGES (ALIGN(sizeof(struct pkvm_hyp), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_PCPU_PAGES (ALIGN(sizeof(struct pkvm_pcpu), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_HOST_VCPU_PAGES (ALIGN(sizeof(struct pkvm_host_vcpu), PAGE_SIZE) >> PAGE_SHIFT) ++#define PKVM_HOST_VCPU_VMCS_PAGES 3 /*vmxarea+vmcs+msr_bitmap*/ ++#define PKVM_EXTRA_PAGES 3 /*io_bitmap + mmcfg_table for host vm*/ ++ ++/* ++ * pkvm relocate its own text/data sections to some page aligned ++ * memory area. When creating the page table for pkvm, only create ++ * mapping for its own sections so that the other kernel functions ++ * won't be used and make the pkvm to be self contained. ++ */ ++extern char __pkvm_text_start[], __pkvm_text_end[]; ++extern char __pkvm_rodata_start[], __pkvm_rodata_end[]; ++extern char __pkvm_data_start[], __pkvm_data_end[]; ++extern char __pkvm_bss_start[], __pkvm_bss_end[]; ++ ++extern unsigned long pkvm_sym(__page_base_offset); ++extern unsigned long pkvm_sym(__symbol_base_offset); ++extern struct pkvm_hyp *pkvm_sym(pkvm_hyp); ++extern unsigned long pkvm_sym(__x86_clflush_size); ++ ++PKVM_DECLARE(void, __pkvm_vmx_vmexit(void)); ++PKVM_DECLARE(int, pkvm_main(struct kvm_vcpu *vcpu)); ++PKVM_DECLARE(void, init_contant_host_state_area(struct pkvm_pcpu *pcpu, int cpu)); ++PKVM_DECLARE(int, init_pci(struct pkvm_hyp *pkvm)); ++ ++PKVM_DECLARE(void *, pkvm_early_alloc_contig(unsigned int nr_pages)); ++PKVM_DECLARE(void *, pkvm_early_alloc_page(void)); ++PKVM_DECLARE(void, pkvm_early_alloc_init(void *virt, unsigned long size)); ++ ++PKVM_DECLARE(void, init_msr_emulation(struct vcpu_vmx *vmx)); ++ ++PKVM_DECLARE(void, noop_handler(void)); ++PKVM_DECLARE(void, nmi_handler(void)); ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h b/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h +new file mode 100644 +index 000000000000..a924e36eb869 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/include/pkvm_trace.h +@@ -0,0 +1,29 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#ifndef _PKVM_TRACE_H_ ++#define _PKVM_TRACE_H_ ++ ++struct vmexit_data { ++ u64 total_count; ++ u64 total_cycles; ++ u64 reasons[74]; ++ u64 cycles[74]; ++}; ++ ++struct perf_data { ++ struct vmexit_data data; ++ unsigned long long tsc; ++}; ++ ++struct vmexit_perf_dump { ++ struct perf_data l1data; ++ struct perf_data l2data; ++ int cpu; ++}; ++ ++#define PKVM_HC_SET_VMEXIT_TRACE 0xabcd0001 ++#define PKVM_HC_DUMP_VMEXIT_TRACE 0xabcd0002 ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_constants.c b/arch/x86/kvm/vmx/pkvm/pkvm_constants.c +new file mode 100644 +index 000000000000..746129da4438 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_constants.c +@@ -0,0 +1,26 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include "hyp/pkvm_hyp.h" ++#include "hyp/iommu_internal.h" ++ 
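++/*
++ * Constants generator in the style of the kernel's asm-offsets.c: each
++ * DEFINE() below emits an assembler-visible constant which Kbuild turns
++ * into the generated pkvm_constants.h header, keeping the page counts
++ * and structure sizes used for the early pkvm memory reservation in
++ * sync with the hyp structures.
++ */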
++int main(void) ++{ ++ DEFINE(PKVM_PERCPU_PAGES, PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES + PKVM_HOST_VCPU_VMCS_PAGES); ++ DEFINE(PKVM_GLOBAL_PAGES, PKVM_PAGES + PKVM_EXTRA_PAGES); ++ DEFINE(PKVM_VMEMMAP_ENTRY_SIZE, sizeof(struct pkvm_page)); ++ DEFINE(PKVM_SHADOW_VM_SIZE, sizeof(struct pkvm_shadow_vm) + pkvm_shadow_vcpu_array_size()); ++ DEFINE(PKVM_SHADOW_VCPU_STATE_SIZE, sizeof(struct shadow_vcpu_state)); ++ DEFINE(PKVM_IOMMU_NUM, PKVM_MAX_IOMMU_NUM); ++ DEFINE(PKVM_PASIDDEV_NUM, PKVM_MAX_PASID_PDEV_NUM); ++ DEFINE(PKVM_PDEV_NUM, PKVM_MAX_PDEV_NUM); ++ DEFINE(PKVM_IOMMU_QI_DESC_SIZE, PKVM_QI_DESC_ALIGNED_SIZE); ++ DEFINE(PKVM_IOMMU_QI_DESC_STATUS_SIZE, PKVM_QI_DESC_STATUS_ALIGNED_SIZE); ++ DEFINE(PKVM_MAX_VM_NUM, PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM); ++ return 0; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_constants.h b/arch/x86/kvm/vmx/pkvm/pkvm_constants.h +new file mode 100644 +index 000000000000..e6f2753b3d6a +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_constants.h +@@ -0,0 +1,21 @@ ++#ifndef __PKVM_CONSTANTS_H__ ++#define __PKVM_CONSTANTS_H__ ++/* ++ * DO NOT MODIFY. ++ * ++ * This file was generated by Kbuild ++ */ ++ ++#define PKVM_PERCPU_PAGES 19 /* PKVM_PCPU_PAGES + PKVM_HOST_VCPU_PAGES + PKVM_HOST_VCPU_VMCS_PAGES */ ++#define PKVM_GLOBAL_PAGES 5 /* PKVM_PAGES + PKVM_EXTRA_PAGES */ ++#define PKVM_VMEMMAP_ENTRY_SIZE 4 /* sizeof(struct pkvm_page) */ ++#define PKVM_SHADOW_VM_SIZE 20480 /* sizeof(struct pkvm_shadow_vm) + pkvm_shadow_vcpu_array_size() */ ++#define PKVM_SHADOW_VCPU_STATE_SIZE 24576 /* sizeof(struct shadow_vcpu_state) */ ++#define PKVM_IOMMU_NUM 32 /* PKVM_MAX_IOMMU_NUM */ ++#define PKVM_PASIDDEV_NUM 32 /* PKVM_MAX_PASID_PDEV_NUM */ ++#define PKVM_PDEV_NUM 512 /* PKVM_MAX_PDEV_NUM */ ++#define PKVM_IOMMU_QI_DESC_SIZE 8192 /* PKVM_QI_DESC_ALIGNED_SIZE */ ++#define PKVM_IOMMU_QI_DESC_STATUS_SIZE 4096 /* PKVM_QI_DESC_STATUS_ALIGNED_SIZE */ ++#define PKVM_MAX_VM_NUM 10 /* PKVM_MAX_NORMAL_VM_NUM + PKVM_MAX_SECURE_VM_NUM */ ++ ++#endif +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c b/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c +new file mode 100644 +index 000000000000..c6cd7f3656b0 +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_debugfs.c +@@ -0,0 +1,204 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++#include ++#include ++#include ++#include ++ ++static void set_vmexit_trace_func(void *data) ++{ ++ u64 val; ++ ++ if (!data) ++ return; ++ ++ val = *(u64 *)data; ++ kvm_hypercall1(PKVM_HC_SET_VMEXIT_TRACE, val); ++} ++ ++static int set_vmexit_trace(void *data, u64 val) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ smp_call_function_single(cpu, set_vmexit_trace_func, &val, true); ++ ++ return 0; ++} ++DEFINE_SIMPLE_ATTRIBUTE(set_vmexit_trace_fops, NULL, set_vmexit_trace, "%llu\n"); ++ ++static struct trace_print_flags vmexit_reasons[] = { VMX_EXIT_REASONS, { -1, NULL }}; ++ ++static const char *get_vmexit_reason(int index) ++{ ++ struct trace_print_flags *p = vmexit_reasons; ++ ++ while (p->name) { ++ if (p->mask == index) ++ return p->name; ++ p++; ++ } ++ ++ return NULL; ++} ++ ++static void __pkvm_vmexit_perf_dump_percpu(struct vmexit_perf_dump *perf, ++ struct vmexit_perf_dump *count, ++ bool dump_l2) ++{ ++ struct perf_data *perf_data, *count_perf_data; ++ int cpu = perf->cpu; ++ int i; ++ ++ if (dump_l2) { ++ perf_data = &perf->l2data; ++ count_perf_data = count ? &count->l2data : NULL; ++ } else { ++ perf_data = &perf->l1data; ++ count_perf_data = count ? 
&count->l1data : NULL; ++ } ++ ++ for (i = 0 ; i < 74; i++) { ++ if (!perf_data->data.reasons[i]) ++ continue; ++ ++ pr_info("CPU%d vmexit_from_%s reason %s %lld cycles %lld each-handler-cycle %lld\n", ++ cpu, dump_l2 ? "l2" : "l1", get_vmexit_reason(i), ++ perf_data->data.reasons[i], perf_data->data.cycles[i], ++ perf_data->data.cycles[i] / perf_data->data.reasons[i]); ++ ++ if (count_perf_data) { ++ count_perf_data->data.reasons[i] += perf_data->data.reasons[i]; ++ count_perf_data->data.cycles[i] += perf_data->data.cycles[i]; ++ } ++ ++ if (need_resched()) ++ cond_resched(); ++ } ++ ++ if (perf_data->data.total_count) { ++ pr_info("CPU%d total_vmexit_from_%s %lld total_cycles %lld\n", ++ cpu, dump_l2 ? "l2" : "l1", ++ perf_data->data.total_count, ++ perf_data->data.total_cycles); ++ memset(perf_data, 0, sizeof(struct perf_data)); ++ } ++} ++ ++static void __pkvm_vmexit_perf_dump_summary(struct vmexit_perf_dump *perf, bool dump_l2) ++{ ++ struct perf_data *perf_data; ++ int i; ++ ++ if (dump_l2) ++ perf_data = &perf->l2data; ++ else ++ perf_data = &perf->l1data; ++ ++ for (i = 0 ; i < 74; i++) { ++ if (!perf_data->data.reasons[i]) ++ continue; ++ ++ pr_info("AllCPU: vmexit_from_%s reason %s %lld cycles %lld each-handler-cycle %lld\n", ++ dump_l2 ? "l2" : "l1", get_vmexit_reason(i), ++ perf_data->data.reasons[i], perf_data->data.cycles[i], ++ perf_data->data.cycles[i] / perf_data->data.reasons[i]); ++ ++ perf_data->data.total_count += perf_data->data.reasons[i]; ++ perf_data->data.total_cycles += perf_data->data.cycles[i]; ++ ++ if (need_resched()) ++ cond_resched(); ++ } ++ ++ pr_info("AllCPU: total_vmexit_from_%s %lld total_cycles %lld\n", ++ dump_l2 ? "l2" : "l1", ++ perf_data->data.total_count, ++ perf_data->data.total_cycles); ++} ++ ++static struct vmexit_perf_dump pkvm_perf; ++static void pkvm_dump_vmexit_trace(struct vmexit_perf_dump *hvcpu_perf) ++{ ++ struct vmexit_perf_dump *perf; ++ int cpu; ++ ++ memset(&pkvm_perf.l1data, 0, sizeof(struct perf_data)); ++ memset(&pkvm_perf.l2data, 0, sizeof(struct perf_data)); ++ ++ for (cpu = 0; cpu < num_possible_cpus(); cpu++) { ++ perf = &hvcpu_perf[cpu]; ++ ++ __pkvm_vmexit_perf_dump_percpu(perf, &pkvm_perf, false); ++ __pkvm_vmexit_perf_dump_percpu(perf, &pkvm_perf, true); ++ } ++ ++ __pkvm_vmexit_perf_dump_summary(&pkvm_perf, false); ++ __pkvm_vmexit_perf_dump_summary(&pkvm_perf, true); ++} ++ ++static int dump_vmexit_trace(void *data, u64 *val) ++{ ++ struct vmexit_perf_dump *hvcpu_perf; ++ unsigned long size = sizeof(struct vmexit_perf_dump) * num_possible_cpus(); ++ ++ hvcpu_perf = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); ++ ++ kvm_hypercall2(PKVM_HC_DUMP_VMEXIT_TRACE, __pa(hvcpu_perf), size); ++ barrier(); ++ ++ pkvm_dump_vmexit_trace(hvcpu_perf); ++ ++ free_pages_exact(hvcpu_perf, size); ++ ++ *val = 0; ++ return 0; ++} ++DEFINE_SIMPLE_ATTRIBUTE(dump_vmexit_trace_fops, dump_vmexit_trace, NULL, "%llu\n"); ++ ++struct debugfs_item { ++ const char *name; ++ umode_t mode; ++ const struct file_operations *fops; ++ struct dentry *dentry; ++}; ++ ++struct debugfs_item debugfs_files[] = { ++ { "set_vmexit_trace", 0222, &set_vmexit_trace_fops}, ++ { "dump_vmexit_trace", 0444, &dump_vmexit_trace_fops}, ++ { NULL } ++}; ++ ++static struct dentry *debugfs_dir; ++ ++void pkvm_init_debugfs(void) ++{ ++ struct debugfs_item *p; ++ ++ debugfs_dir = debugfs_create_dir("pkvm", NULL); ++ if (IS_ERR_OR_NULL(debugfs_dir)) { ++ pr_err("MCP_TEST: Can't create debugfs root entry\n"); ++ goto failed_dir; ++ } ++ ++ for (p = debugfs_files; p->name; ++p) { 
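++		/*
++		 * One debugfs file per debugfs_files[] entry under "pkvm"; any
++		 * failure falls through to the cleanup path below, which drops
++		 * the dentries created so far and the directory itself.
++		 */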
++ p->dentry = debugfs_create_file(p->name, p->mode, ++ debugfs_dir, ++ NULL, p->fops); ++ if (IS_ERR_OR_NULL(p->dentry)) ++ goto out_dir; ++ } ++ ++ return; ++ ++out_dir: ++ for (p = debugfs_files; p->dentry; ++p) { ++ debugfs_remove(p->dentry); ++ p->dentry = NULL; ++ } ++ debugfs_remove(debugfs_dir); ++failed_dir: ++ debugfs_dir = NULL; ++} +diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c +new file mode 100644 +index 000000000000..b46c9e07fb1e +--- /dev/null ++++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c +@@ -0,0 +1,1300 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include "pkvm_constants.h" ++#include ++ ++extern void pkvm_init_debugfs(void); ++ ++MODULE_LICENSE("GPL"); ++ ++struct pkvm_hyp *pkvm; ++ ++struct pkvm_deprivilege_param { ++ struct pkvm_hyp *pkvm; ++ int ret; ++}; ++DEFINE_PER_CPU_READ_MOSTLY(bool, pkvm_enabled); ++ ++#define is_aligned(POINTER, BYTE_COUNT) \ ++ (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0) ++ ++static u16 pkvm_host_vpid = VMX_NR_VPIDS - 1; ++ ++struct gdt_page pkvm_gdt_page = { ++ .gdt = { ++ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), ++ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), ++ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), ++ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), ++ }, ++}; ++ ++static int check_pci_device_count(void) ++{ ++ struct pci_dev *pdev = NULL; ++ int devs = 0, devs_with_pasid = 0; ++ ++ /* ++ * pkvm has reserved the memory for IOMMU during early boot, and that ++ * memory is estimated with PKVM_MAX_PDEV_NUM and PKVM_MAX_PASID_PDEV_NUM. ++ * The actual number larger than this will make IOMMU fail to create ++ * translation tables. ++ */ ++ for_each_pci_dev(pdev) { ++ if (pdev->pasid_cap) ++ devs_with_pasid++; ++ else ++ devs++; ++ } ++ ++ if (devs > PKVM_MAX_PDEV_NUM || ++ devs_with_pasid > PKVM_MAX_PASID_PDEV_NUM) { ++ pr_err("pkvm: Too many pdevs detected, actual %d %d max %d %d\n", ++ devs, devs_with_pasid, PKVM_MAX_PDEV_NUM, ++ PKVM_MAX_PASID_PDEV_NUM); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Check for the coherency of paging structures accessed through pasid table ++ * entries (in scalable mode) or context table entries (in legacy mode). ++ */ ++static inline bool is_iommu_coherent(u64 ecap) ++{ ++ return ecap_smts(ecap) ? !!ecap_smpwc(ecap) : !!ecap_coherent(ecap); ++} ++ ++__attribute__((optimize(0))) ++static int check_and_init_iommu(struct pkvm_hyp *pkvm) ++{ ++ struct pkvm_iommu_info *info; ++ struct dmar_drhd_unit *drhd; ++ int pgsz_mask = 1 << PG_LEVEL_4K; ++ int pgt_level = 0; ++ void __iomem *addr; ++ u64 reg_size; ++ u64 cap, ecap; ++ int index = 0, ret; ++ ++/* matches with IOMMU cap SAGAW bits */ ++#define PGT_4LEVEL BIT(2) ++#define PGT_5LEVEL BIT(3) ++ ++ ret = check_pci_device_count(); ++ if (ret) ++ return ret; ++ /* ++ * Some cases may require IOMMU and EPT to use both supported page ++ * table level and page size: ++ * ++ * 1) If IOMMU is working in nested translation of scalable-mode, ++ * pKVM may reuse EPT as the 2nd-level page table. 
++ * ++ * 2) If IOMMU is working in legacy mode and a device is working ++ * in IOMMU pass-through mode, pKVM may reuse EPT as the 2nd-level ++ * page table. ++ * ++ * For other cases, though not necessary to use both IOMMU and EPT ++ * supported page table level and page size, using the same size ++ * can simplify the implementation, as pKVM doesn't need to check ++ * IOMMU types of all devices before deciding whether it's necessary ++ * to use both IOMMU and EPT supported page table level and page size. ++ */ ++ if (pkvm->vmx_cap.ept & VMX_EPT_PAGE_WALK_4_BIT) ++ pgt_level |= PGT_4LEVEL; ++ ++ if (pkvm->vmx_cap.ept & VMX_EPT_PAGE_WALK_5_BIT) ++ pgt_level |= PGT_5LEVEL; ++ ++ if (pkvm->vmx_cap.ept & VMX_EPT_2MB_PAGE_BIT) ++ pgsz_mask |= 1 << PG_LEVEL_2M; ++ ++ if ((pkvm->vmx_cap.ept & VMX_EPT_1GB_PAGE_BIT)) ++ pgsz_mask |= 1 << PG_LEVEL_1G; ++ ++ pkvm->iommu_coherent = true; ++ for_each_drhd_unit(drhd) { ++ int level = 0, mask = 1 << PG_LEVEL_4K; ++ ++ if (index >= PKVM_MAX_IOMMU_NUM) { ++ pr_err("pkvm: too many IOMMU devices to be supported\n"); ++ return -ENOMEM; ++ } ++ ++ if (!drhd->reg_base_addr) { ++ pr_err("pkvm: dmar unit not valid\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * pkvm requires host IOMMU driver to work in scalable mode with ++ * first-level translation or legacy mode. ++ */ ++ if ((readl(drhd->iommu->reg + DMAR_GSTS_REG) & DMA_GSTS_TES) && ++ (readq(drhd->iommu->reg + DMAR_RTADDR_REG) & BIT(11))) { ++ pr_err("pkvm: drhd reg_base 0x%llx: scalable/legacy mode not enabled\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ addr = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); ++ if (!addr) { ++ pr_err("pkvm: failed to map drhd reg physical addr 0x%llx\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ info = &pkvm->iommu_infos[index]; ++ cap = readq(addr + DMAR_CAP_REG); ++ ecap = readq(addr + DMAR_ECAP_REG); ++ iounmap(addr); ++ ++ /* ++ * If pkvm IOMMU works in scalable mode, it requires to use nested translation. ++ */ ++ if (ecap_smts(ecap) && !ecap_nest(ecap)) { ++ pr_err("pkvm: drhd reg_base 0x%llx: nested translation not supported\n", ++ drhd->reg_base_addr); ++ return -EINVAL; ++ } ++ ++ /* ++ * Check for the coherency of the paging structure access. ++ */ ++ if (!is_iommu_coherent(ecap)) ++ pkvm->iommu_coherent = false; ++ ++ info->reg_phys = drhd->reg_base_addr; ++ reg_size = max_t(u64, ecap_max_iotlb_offset(ecap), ++ cap_max_fault_reg_offset(cap)); ++ info->reg_size = max_t(u64, reg_size, VTD_PAGE_SIZE); ++ ++ if (cap_sagaw(cap) & PGT_4LEVEL) ++ level |= PGT_4LEVEL; ++ if (cap_sagaw(cap) & PGT_5LEVEL) ++ level |= PGT_5LEVEL; ++ ++ if (cap_super_page_val(cap) & BIT(0)) ++ mask |= 1 << PG_LEVEL_2M; ++ if (cap_super_page_val(cap) & BIT(1)) ++ mask |= 1 << PG_LEVEL_1G; ++ ++ /* Get the both supported page table level */ ++ pgt_level &= level; ++ pgsz_mask &= mask; ++ ++ index++; ++ } ++ ++ /* ++ * There may be no supported page table level for both IOMMU and EPT. ++ * But there will always be both supported page size, which is 4K. ++ */ ++ if (pgt_level == 0) { ++ pr_err("pkvm: no common page table level for IOMMU and EPT\n"); ++ return -EINVAL; ++ } ++ ++ /* By default to use 4level */ ++ pkvm->ept_iommu_pgt_level = pgt_level & PGT_4LEVEL ? 
4 : 5; ++ ++ pkvm->ept_iommu_pgsz_mask = pgsz_mask; ++ ++ return 0; ++} ++ ++u64 pkvm_total_reserve_pages(void) ++{ ++ u64 total; ++ ++ total = pkvm_data_struct_pages(PKVM_GLOBAL_PAGES, PKVM_PERCPU_PAGES, num_possible_cpus()); ++ total += pkvm_vmemmap_pages(PKVM_VMEMMAP_ENTRY_SIZE); ++ total += pkvm_mmu_pgtable_pages(); ++ total += host_ept_pgtable_pages(); ++ total += pkvm_iommu_pages(PKVM_MAX_PASID, PKVM_PASIDDEV_NUM, ++ PKVM_PDEV_NUM, PKVM_IOMMU_NUM, ++ PKVM_IOMMU_QI_DESC_SIZE, ++ PKVM_IOMMU_QI_DESC_STATUS_SIZE, ++ num_possible_cpus()); ++ total += pkvm_shadow_ept_pgtable_pages(PKVM_MAX_VM_NUM); ++ total += pkvm_host_shadow_iommu_pgtable_pages(PKVM_PDEV_NUM); ++ ++ return total; ++} ++ ++static struct vmcs *pkvm_alloc_vmcs(struct vmcs_config *vmcs_config_ptr) ++{ ++ struct vmcs *vmcs; ++ int pages = ALIGN(vmcs_config_ptr->size, PAGE_SIZE) >> PAGE_SHIFT; ++ ++ vmcs = pkvm_sym(pkvm_early_alloc_contig)(pages); ++ if (!vmcs) ++ return NULL; ++ ++ memset(vmcs, 0, vmcs_config_ptr->size); ++ vmcs->hdr.revision_id = vmcs_config_ptr->revision_id; /* vmcs revision id */ ++ ++ return vmcs; ++} ++ ++static void vmxon_setup_revid(void *vmxon_region) ++{ ++ u32 rev_id = 0; ++ u32 msr_high_value = 0; ++ ++ rdmsr(MSR_IA32_VMX_BASIC, rev_id, msr_high_value); ++ ++ memcpy(vmxon_region, &rev_id, 4); ++} ++ ++static inline void cr4_set_vmxe(void) ++{ ++ unsigned long cr4_value; ++ ++ cr4_value = __read_cr4(); ++ __write_cr4(cr4_value | X86_CR4_VMXE); ++} ++ ++static inline void cr4_clear_vmxe(void) ++{ ++ unsigned long cr4_value; ++ ++ cr4_value = __read_cr4(); ++ __write_cr4(cr4_value & ~(X86_CR4_VMXE)); ++} ++ ++static int pkvm_cpu_vmxon(u64 vmxon_pointer) ++{ ++ u64 msr; ++ ++ cr4_set_vmxe(); ++ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" ++ _ASM_EXTABLE(1b, %l[fault]) ++ : : [vmxon_pointer] "m"(vmxon_pointer) ++ : : fault); ++ return 0; ++ ++fault: ++ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", ++ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); ++ cr4_clear_vmxe(); ++ return -EFAULT; ++} ++ ++static int pkvm_cpu_vmxoff(void) ++{ ++ asm_volatile_goto("1: vmxoff\n\t" ++ _ASM_EXTABLE(1b, %l[fault]) ++ ::: "cc", "memory" : fault); ++ cr4_clear_vmxe(); ++ return 0; ++ ++fault: ++ cr4_clear_vmxe(); ++ return -EFAULT; ++} ++ ++static int pkvm_enable_vmx(struct pkvm_host_vcpu *vcpu) ++{ ++ u64 phys_addr; ++ ++ vcpu->vmxarea = pkvm_sym(pkvm_early_alloc_page)(); ++ if (!vcpu->vmxarea) ++ return -ENOMEM; ++ ++ phys_addr = __pa(vcpu->vmxarea); ++ if (!is_aligned(phys_addr, PAGE_SIZE)) ++ return -ENOMEM; ++ ++ /*setup revision id in vmxon region*/ ++ vmxon_setup_revid(vcpu->vmxarea); ++ ++ return pkvm_cpu_vmxon(phys_addr); ++} ++ ++static inline u32 get_ar(u16 sel) ++{ ++ u32 access_rights; ++ ++ if (sel == 0) { ++ access_rights = 0x10000; ++ } else { ++ asm ("lar %%ax, %%rax\n" ++ : "=a"(access_rights) : "a"(sel)); ++ access_rights = access_rights >> 8; ++ access_rights = access_rights & 0xf0ff; ++ } ++ ++ return access_rights; ++} ++ ++#define init_guestsegment(seg, SEG, base, limit) \ ++ do { \ ++ u16 sel; \ ++ u32 ar; \ ++ \ ++ savesegment(seg, sel); \ ++ ar = get_ar(sel); \ ++ vmcs_write16(GUEST_##SEG##_SELECTOR, sel); \ ++ vmcs_write32(GUEST_##SEG##_AR_BYTES, ar); \ ++ vmcs_writel(GUEST_##SEG##_BASE, base); \ ++ vmcs_write32(GUEST_##SEG##_LIMIT, limit); \ ++ } while (0) ++ ++static noinline void init_guest_state_area_from_native(int cpu) ++{ ++ u16 ldtr; ++ struct desc_ptr dt; ++ unsigned long msrl; ++ u32 high, low; ++ ++ /* load CR regiesters */ ++ vmcs_writel(GUEST_CR0, read_cr0() & ~X86_CR0_TS); ++ vmcs_writel(GUEST_CR3, __read_cr3()); ++ vmcs_writel(GUEST_CR4, native_read_cr4()); ++ ++ /* load cs/ss/ds/es */ ++ init_guestsegment(cs, CS, 0x0, 0xffffffff); ++ init_guestsegment(ss, SS, 0x0, 0xffffffff); ++ init_guestsegment(ds, DS, 0x0, 0xffffffff); ++ init_guestsegment(es, ES, 0x0, 0xffffffff); ++ ++ /* load fs/gs */ ++ rdmsrl(MSR_FS_BASE, msrl); ++ init_guestsegment(fs, FS, msrl, 0xffffffff); ++ rdmsrl(MSR_GS_BASE, msrl); ++ init_guestsegment(gs, GS, msrl, 0xffffffff); ++ ++ /* load GDTR */ ++ native_store_gdt(&dt); ++ vmcs_writel(GUEST_GDTR_BASE, dt.address); ++ vmcs_write32(GUEST_GDTR_LIMIT, dt.size); ++ ++ /* load TR */ ++ vmcs_write16(GUEST_TR_SELECTOR, GDT_ENTRY_TSS*8); ++ vmcs_write32(GUEST_TR_AR_BYTES, get_ar(GDT_ENTRY_TSS*8)); ++ vmcs_writel(GUEST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); ++ vmcs_write32(GUEST_TR_LIMIT, __KERNEL_TSS_LIMIT); ++ ++ /* load LDTR */ ++ store_ldt(ldtr); ++ vmcs_write16(GUEST_LDTR_SELECTOR, ldtr); ++ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x10000); ++ vmcs_writel(GUEST_LDTR_BASE, 0x0); ++ vmcs_write32(GUEST_LDTR_LIMIT, 0xffffffff); ++ ++ store_idt(&dt); ++ vmcs_writel(GUEST_IDTR_BASE, dt.address); ++ vmcs_write32(GUEST_IDTR_LIMIT, dt.size); ++ ++ /* set MSRs */ ++ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); ++ ++ rdmsr(MSR_IA32_SYSENTER_CS, low, high); ++ vmcs_write32(GUEST_SYSENTER_CS, low); ++ ++ rdmsrl(MSR_IA32_SYSENTER_ESP, msrl); ++ vmcs_writel(GUEST_SYSENTER_ESP, msrl); ++ ++ rdmsrl(MSR_IA32_SYSENTER_EIP, msrl); ++ vmcs_writel(GUEST_SYSENTER_EIP, msrl); ++ ++ rdmsrl(MSR_EFER, msrl); ++ vmcs_write64(GUEST_IA32_EFER, msrl); ++ ++ rdmsrl(MSR_IA32_CR_PAT, msrl); ++ vmcs_write64(GUEST_IA32_PAT, msrl); ++} ++ ++static noinline void init_guest_state_area(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ init_guest_state_area_from_native(cpu); ++ ++ /*Guest non register state*/ ++ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); ++ 
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); ++ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); ++ vmcs_write64(VMCS_LINK_POINTER, -1ull); ++} ++ ++static void init_host_state_area(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ struct pkvm_pcpu *pcpu = vcpu->pcpu; ++ ++ pkvm_sym(init_contant_host_state_area)(pcpu, cpu); ++ ++ /*host RIP*/ ++ vmcs_writel(HOST_RIP, (unsigned long)pkvm_sym(__pkvm_vmx_vmexit)); ++} ++ ++static void init_execution_control(struct pkvm_host_vcpu *vcpu, ++ struct vmcs_config *vmcs_config_ptr, ++ struct vmx_capability *vmx_cap) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ /* ++ * Fixed VPIDs for the host vCPUs, which implies that it could conflict ++ * with VPIDs from nested guests. ++ * ++ * It's safe because cached mappings used in non-root mode are associated ++ * with EP4TA, which is managed by pKVM and unique for every guest. ++ */ ++ if ((vmcs_config_ptr->cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID) && ++ vmx_has_invvpid() && ++ (vmx_has_invvpid_single() || vmx_has_invvpid_global())) ++ vmcs_write16(VIRTUAL_PROCESSOR_ID, pkvm_host_vpid--); ++ ++ pin_controls_set(vmx, vmcs_config_ptr->pin_based_exec_ctrl); ++ exec_controls_set(vmx, vmcs_config_ptr->cpu_based_exec_ctrl); ++ secondary_exec_controls_set(vmx, vmcs_config_ptr->cpu_based_2nd_exec_ctrl); ++ /* disable EPT first, will enable after EPT pgtable created */ ++ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_EPT); ++ ++ vmcs_write32(CR3_TARGET_COUNT, 0); ++ ++ vmcs_write32(EXCEPTION_BITMAP, 0); ++ ++ vmcs_write64(IO_BITMAP_A, __pa(vcpu->io_bitmap)); ++ vmcs_write64(IO_BITMAP_B, __pa(vcpu->io_bitmap) + PAGE_SIZE); ++ ++ pkvm_sym(init_msr_emulation(vmx)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); ++ ++ /*guest owns the entire bits*/ ++ vmcs_writel(CR0_GUEST_HOST_MASK, 0); ++ ++ vmcs_writel(CR4_GUEST_HOST_MASK, X86_CR4_VMXE); ++} ++ ++static void init_vmexit_control(struct vcpu_vmx *vmx, struct vmcs_config *vmcs_config_ptr) ++{ ++ vm_exit_controls_set(vmx, vmcs_config_ptr->vmexit_ctrl); ++ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); ++} ++ ++static void init_vmentry_control(struct vcpu_vmx *vmx, struct vmcs_config *vmcs_config_ptr) ++{ ++ vm_entry_controls_set(vmx, vmcs_config_ptr->vmentry_ctrl); ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); ++ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); ++} ++ ++static int pkvm_host_init_vmx(struct pkvm_host_vcpu *vcpu, int cpu) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ int ret; ++ ++ ret = pkvm_enable_vmx(vcpu); ++ if (ret) ++ return ret; ++ ++ /* vmcs01: host vmcs in pKVM */ ++ vmx->vmcs01.vmcs = pkvm_alloc_vmcs(&pkvm->vmcs_config); ++ if (!vmx->vmcs01.vmcs) ++ return -ENOMEM; ++ ++ vmx->vmcs01.msr_bitmap = pkvm_sym(pkvm_early_alloc_page)(); ++ if (!vmx->vmcs01.msr_bitmap) { ++ pr_err("%s: No page for msr_bitmap\n", __func__); ++ return -ENOMEM; ++ } ++ ++ vcpu->io_bitmap = pkvm->host_vm.io_bitmap; ++ ++ vmx->loaded_vmcs = &vmx->vmcs01; ++ vmcs_load(vmx->loaded_vmcs->vmcs); ++ vcpu->current_vmcs = vmx->loaded_vmcs->vmcs; ++ ++ init_guest_state_area(vcpu, cpu); ++ init_host_state_area(vcpu, cpu); ++ init_execution_control(vcpu, &pkvm->vmcs_config, &pkvm->vmx_cap); ++ init_vmexit_control(vmx, &pkvm->vmcs_config); ++ init_vmentry_control(vmx, &pkvm->vmcs_config); ++ ++ return ret; ++} ++ ++static void pkvm_host_deinit_vmx(struct pkvm_host_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = &vcpu->vmx; ++ ++ pkvm_cpu_vmxoff(); ++ ++ if (vmx->vmcs01.vmcs) ++ vmx->vmcs01.vmcs = NULL; ++ ++ if 
(vmx->vmcs01.msr_bitmap) ++ vmx->vmcs01.msr_bitmap = NULL; ++} ++ ++static void pkvm_host_setup_nested_vmx_cap(struct pkvm_hyp *pkvm) ++{ ++ struct nested_vmx_msrs *msrs = &pkvm->vmcs_config.nested; ++ ++ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ++ msrs->procbased_ctls_low, ++ msrs->procbased_ctls_high); ++ ++ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, ++ &msrs->secondary_ctls_low, ++ &msrs->secondary_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ++ msrs->pinbased_ctls_low, ++ msrs->pinbased_ctls_high); ++ ++ rdmsrl_safe(MSR_IA32_VMX_VMFUNC, &msrs->vmfunc_controls); ++ ++ rdmsr(MSR_IA32_VMX_EXIT_CTLS, ++ msrs->exit_ctls_low, ++ msrs->exit_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, ++ msrs->entry_ctls_low, ++ msrs->entry_ctls_high); ++ ++ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, msrs->misc_high); ++} ++ ++__attribute__((optimize(0))) ++static int pkvm_host_check_and_setup_vmx_cap(struct pkvm_hyp *pkvm) ++{ ++ struct vmcs_config *vmcs_config = &pkvm->vmcs_config; ++ struct vmx_capability *vmx_cap = &pkvm->vmx_cap; ++ int ret = 0; ++ struct vmcs_config_setting setting = { ++ .cpu_based_exec_ctrl_min = ++ CPU_BASED_USE_IO_BITMAPS | ++ CPU_BASED_USE_MSR_BITMAPS | ++ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, ++ .cpu_based_exec_ctrl_opt = 0, ++ .cpu_based_2nd_exec_ctrl_min = ++ SECONDARY_EXEC_ENABLE_EPT | ++ SECONDARY_EXEC_SHADOW_VMCS, ++ .cpu_based_2nd_exec_ctrl_opt = ++ SECONDARY_EXEC_ENABLE_VPID | ++ SECONDARY_EXEC_ENABLE_INVPCID | ++ SECONDARY_EXEC_XSAVES | ++ SECONDARY_EXEC_ENABLE_RDTSCP | ++ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE, ++ .pin_based_exec_ctrl_min = 0, ++ .pin_based_exec_ctrl_opt = 0, ++ .vmexit_ctrl_min = ++ VM_EXIT_HOST_ADDR_SPACE_SIZE | ++ VM_EXIT_LOAD_IA32_EFER | ++ VM_EXIT_SAVE_IA32_PAT | ++ VM_EXIT_SAVE_IA32_EFER | ++ VM_EXIT_SAVE_DEBUG_CONTROLS, ++ .vmexit_ctrl_opt = 0, ++ .vmentry_ctrl_min = ++ VM_ENTRY_LOAD_DEBUG_CONTROLS | ++ VM_ENTRY_IA32E_MODE | ++ VM_ENTRY_LOAD_IA32_EFER, ++// VM_ENTRY_LOAD_IA32_PAT, ++ .vmentry_ctrl_opt = 0, ++ .has_broken_vmx_preemption_timer = false, ++ .perf_global_ctrl_workaround = false, ++ }; ++ ++ if (!boot_cpu_has(X86_FEATURE_VMX)) ++ return -EINVAL; ++ ++ if (__setup_vmcs_config(vmcs_config, vmx_cap, &setting) < 0) ++ return -EINVAL; ++ ++ pr_info("pin_based_exec_ctrl 0x%x\n", vmcs_config->pin_based_exec_ctrl); ++ pr_info("cpu_based_exec_ctrl 0x%x\n", vmcs_config->cpu_based_exec_ctrl); ++ pr_info("cpu_based_2nd_exec_ctrl 0x%x\n", vmcs_config->cpu_based_2nd_exec_ctrl); ++ pr_info("vmexit_ctrl 0x%x\n", vmcs_config->vmexit_ctrl); ++ pr_info("vmentry_ctrl 0x%x\n", vmcs_config->vmentry_ctrl); ++ ++ pkvm_host_setup_nested_vmx_cap(pkvm); ++ ++ return ret; ++} ++ ++static int pkvm_init_mmu(struct pkvm_hyp *pkvm) ++{ ++ int pgsz_mask = (1 << PG_LEVEL_2M) | (1 << PG_LEVEL_4K); ++ ++ if (boot_cpu_has(X86_FEATURE_GBPAGES)) ++ pgsz_mask |= 1 << PG_LEVEL_1G; ++ ++ /* record mmu pgtable cap for later mmu pgtable build */ ++ pkvm->mmu_cap.level = pgtable_l5_enabled() ? 5 : 4; ++ pkvm->mmu_cap.allowed_pgsz = pgsz_mask; ++ pkvm->mmu_cap.table_prot = (u64)_KERNPG_TABLE_NOENC; ++ ++ /* ++ * Use IOMMU acknowledged level and page size mask for ++ * EPT as IOMMU will use EPT as its second-level page ++ * table in nested translation. ++ */ ++ pkvm->ept_cap.level = pkvm->ept_iommu_pgt_level; ++ pkvm->ept_cap.allowed_pgsz = pkvm->ept_iommu_pgsz_mask; ++ pkvm->ept_cap.table_prot = VMX_EPT_RWX_MASK; ++ ++ /* ++ * __page_base_offset stores the offset for pkvm ++ * to translate VA to a PA. 
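++	 * It is the start of the direct map (__va(0)), so a direct-map VA
++	 * converts to a PA by subtracting this offset.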
++ * ++ * __symbol_base_offset stores the offset for pkvm ++ * to translate its symbole's VA to a PA. ++ */ ++ pkvm_sym(__page_base_offset) = (unsigned long)__va(0); ++ pkvm_sym(__symbol_base_offset) = (unsigned long)__pkvm_text_start - __pa_symbol(__pkvm_text_start); ++ ++ /* ++ * __x86_clflush_size stores the clflush size for ++ * pkvm to do the clfush at runtime. ++ */ ++ pkvm_sym(__x86_clflush_size) = boot_cpu_data.x86_clflush_size; ++ ++ return 0; ++} ++ ++static void init_gdt(struct pkvm_pcpu *pcpu) ++{ ++ pcpu->gdt_page = pkvm_gdt_page; ++} ++ ++static void init_idt(struct pkvm_pcpu *pcpu) ++{ ++ gate_desc *idt = pcpu->idt_page.idt; ++ struct idt_data d = { ++ .segment = __KERNEL_CS, ++ .bits.ist = 0, ++ .bits.zero = 0, ++ .bits.type = GATE_INTERRUPT, ++ .bits.dpl = 0, ++ .bits.p = 1, ++ }; ++ gate_desc desc; ++ int i; ++ ++#ifdef CONFIG_PKVM_INTEL_DEBUG ++ gate_desc *host_idt; ++ struct desc_ptr dt; ++ ++ store_idt(&dt); ++ host_idt = (gate_desc *)dt.address; ++ ++ /* reuse other exception handler but control nmi handler */ ++ for (i = 0; i <= X86_TRAP_IRET; i++) { ++ if (i == X86_TRAP_NMI) { ++ d.vector = i; ++ d.bits.ist = 0; ++ d.addr = (const void *)pkvm_sym(nmi_handler); ++ idt_init_desc(&desc, &d); ++ write_idt_entry(idt, i, &desc); ++ } else { ++ memcpy(&idt[i], &host_idt[i], sizeof(gate_desc)); ++ } ++ } ++#else ++ for (i = 0; i <= X86_TRAP_IRET; i++) { ++ d.vector = i; ++ d.bits.ist = 0; ++ if (i == X86_TRAP_NMI) ++ d.addr = (const void *)pkvm_sym(nmi_handler); ++ else ++ d.addr = (const void *)pkvm_sym(noop_handler); ++ idt_init_desc(&desc, &d); ++ write_idt_entry(idt, i, &desc); ++ } ++#endif ++} ++ ++static void init_tss(struct pkvm_pcpu *pcpu) ++{ ++ struct desc_struct *d = pcpu->gdt_page.gdt; ++ tss_desc tss; ++ ++ set_tssldt_descriptor(&tss, (unsigned long)&pcpu->tss, DESC_TSS, ++ __KERNEL_TSS_LIMIT); ++ ++ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); ++} ++ ++static int pkvm_setup_pcpu(struct pkvm_hyp *pkvm, int cpu) ++{ ++ struct pkvm_pcpu *pcpu; ++ ++ if (cpu >= CONFIG_NR_CPUS) ++ return -ENOMEM; ++ ++ pcpu = pkvm_sym(pkvm_early_alloc_contig)(PKVM_PCPU_PAGES); ++ if (!pcpu) ++ return -ENOMEM; ++ ++ /* tmp use host cr3, switch to pkvm owned cr3 after de-privilege */ ++ pcpu->cr3 = __read_cr3(); ++ ++ init_gdt(pcpu); ++ init_idt(pcpu); ++ init_tss(pcpu); ++ ++ pkvm->pcpus[cpu] = pcpu; ++ ++ return 0; ++} ++ ++static int pkvm_host_setup_vcpu(struct pkvm_hyp *pkvm, int cpu) ++{ ++ struct pkvm_host_vcpu *pkvm_host_vcpu; ++ ++ if (cpu >= CONFIG_NR_CPUS) ++ return -ENOMEM; ++ ++ pkvm_host_vcpu = pkvm_sym(pkvm_early_alloc_contig)(PKVM_HOST_VCPU_PAGES); ++ if (!pkvm_host_vcpu) ++ return -ENOMEM; ++ ++ pkvm_host_vcpu->pcpu = pkvm->pcpus[cpu]; ++ pkvm_host_vcpu->vmx.vcpu.cpu = cpu; ++ ++ pkvm->host_vm.host_vcpus[cpu] = pkvm_host_vcpu; ++ ++ return 0; ++} ++ ++static void enable_feature_control(void) ++{ ++ u64 old, test_bits; ++ ++ rdmsrl(MSR_IA32_FEAT_CTL, old); ++ test_bits = FEAT_CTL_LOCKED; ++ test_bits |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; ++ ++ if ((old & test_bits) != test_bits) ++ wrmsrl(MSR_IA32_FEAT_CTL, old | test_bits); ++} ++ ++#define savegpr(gpr, value) \ ++ asm("mov %%" #gpr ",%0":"=r" (value) : : "memory") ++ ++static noinline int pkvm_host_run_vcpu(struct pkvm_host_vcpu *vcpu) ++{ ++ u64 guest_rsp, host_rsp; ++ unsigned long *regs = vcpu->vmx.vcpu.arch.regs; ++ volatile int ret = 0; ++ ++ /* ++ * prepare to RUN vcpu: ++ * ++ * - record gprs in vcpu.arch.regs[]: ++ * ++ * - record below guest vmcs fields: ++ * GUSET_RFLAGS - read from native ++ * ++ * - 
record below guest vmcs fields: ++ * GUSET_RFLAGS - read from native ++ * GUEST_RSP - read from native ++ * GUEST_RIP - vmentry_point ++ * ++ * - switch RSP to host_rsp ++ */ ++ savegpr(rax, regs[__VCPU_REGS_RAX]); ++ savegpr(rcx, regs[__VCPU_REGS_RCX]); ++ savegpr(rdx, regs[__VCPU_REGS_RDX]); ++ savegpr(rbx, regs[__VCPU_REGS_RBX]); ++ savegpr(rbp, regs[__VCPU_REGS_RBP]); ++ savegpr(rsi, regs[__VCPU_REGS_RSI]); ++ savegpr(rdi, regs[__VCPU_REGS_RDI]); ++ savegpr(r8, regs[__VCPU_REGS_R8]); ++ savegpr(r9, regs[__VCPU_REGS_R9]); ++ savegpr(r10, regs[__VCPU_REGS_R10]); ++ savegpr(r11, regs[__VCPU_REGS_R11]); ++ savegpr(r12, regs[__VCPU_REGS_R12]); ++ savegpr(r13, regs[__VCPU_REGS_R13]); ++ savegpr(r14, regs[__VCPU_REGS_R14]); ++ savegpr(r15, regs[__VCPU_REGS_R15]); ++ host_rsp = (u64)vcpu->pcpu->stack + STACK_SIZE; ++ asm volatile( ++ "pushfq\n" ++ "popq %%rax\n" ++ "movq %1, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %%rsp, %%rax\n" ++ "movq %2, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %%rax, %0\n" ++ "movq $vmentry_point, %%rax\n" ++ "movq %3, %%rdx\n" ++ "vmwrite %%rax, %%rdx\n" ++ "movq %4, %%rsp\n" ++ : "=m"(guest_rsp) ++ : "i"(GUEST_RFLAGS), "i"(GUEST_RSP) , "i"(GUEST_RIP), "m"(host_rsp) ++ : "rax", "rdx", "memory"); ++ ++ /* ++ * call pkvm_main to do vmlaunch. ++ * ++ * if pkvm_main return: ++ * vmlaunch fail - switch back to guest_rsp ++ * if pkvm_main not return: ++ * vmlaunch success: guest ret to vmentry_point ++ */ ++ ret = pkvm_sym(pkvm_main)(&vcpu->vmx.vcpu); ++ asm volatile( ++ "movq %0, %%rsp\n" ++ "vmentry_point:\n" ++ : : "m"(guest_rsp) :); ++ ++ return ret; ++} ++ ++static void pkvm_host_deprivilege_cpu(void *data) ++{ ++ struct pkvm_deprivilege_param *p = data; ++ unsigned long flags; ++ int cpu = get_cpu(), ret; ++ struct pkvm_host_vcpu *vcpu = ++ p->pkvm->host_vm.host_vcpus[cpu]; ++ ++ local_irq_save(flags); ++ ++ enable_feature_control(); ++ ++ ret = pkvm_host_init_vmx(vcpu, cpu); ++ if (ret) { ++ pr_err("%s: init vmx failed\n", __func__); ++ goto out; ++ } ++ ++ ret = pkvm_host_run_vcpu(vcpu); ++ if (ret == 0) { ++ pr_info("%s: CPU%d in guest mode\n", __func__, cpu); ++ goto ok; ++ } ++ ++out: ++ p->ret = ret; ++ pkvm_host_deinit_vmx(vcpu); ++ pr_err("%s: failed to deprivilege CPU%d\n", __func__, cpu); ++ ++ok: ++ local_irq_restore(flags); ++ ++ put_cpu(); ++} ++ ++/* ++ * Used in root mode to deprivilege CPUs ++ */ ++static int pkvm_host_deprivilege_cpus(struct pkvm_hyp *pkvm) ++{ ++ struct pkvm_deprivilege_param p = { ++ .pkvm = pkvm, ++ .ret = 0, ++ }; ++ ++ on_each_cpu(pkvm_host_deprivilege_cpu, &p, 1); ++ if (p.ret) { ++ /* ++ * TODO: ++ * We are here because some CPU failed to be deprivileged, so ++ * the failed CPU will stay in root mode. But the others already ++ * in the non-root mode. In this case, we should let non-root mode ++ * CPUs go back to root mode, then the system can still run natively ++ * without pKVM enabled. 
++ */ ++ pr_err("%s: WARNING - failed to deprivilege all CPUs!\n", __func__); ++ } else { ++ pr_info("%s: all cpus are in guest mode!\n", __func__); ++ } ++ ++ return p.ret; ++} ++ ++static int this_cpu_do_finalise_hc(struct pkvm_section *sections, unsigned long size) ++{ ++ int ret; ++ ++ local_irq_disable(); ++ ret = kvm_hypercall2(PKVM_HC_INIT_FINALISE, (unsigned long)sections, size); ++ if (!ret) ++ this_cpu_write(pkvm_enabled, true); ++ local_irq_enable(); ++ return ret; ++} ++ ++static void do_pkvm_finalise(void *data) ++{ ++ this_cpu_do_finalise_hc(NULL, 0); ++} ++ ++static int pkvm_init_finalise(void) ++{ ++ int ret, cpu; ++ int self = get_cpu(); ++ struct pkvm_section sections[] = { ++ /* ++ * NOTE: please ensure kernel section is put at the beginning, ++ * as we do section mapping by the order, while kernel data ++ * sections have overlap with pkvm ones, put the kernel section ++ * after pkvm one will make pkvm section readonly! ++ */ ++ { ++ /* ++ * Kernel section: addr is virtual, needed ++ * for pkvm to access kernel alias symbol ++ */ ++ .type = KERNEL_DATA_SECTIONS, ++ .addr = (unsigned long)_sdata, ++ .size = (unsigned long)(_edata - _sdata), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* ++ * Kernel section: addr is virtual, needed ++ * for pkvm to access kernel alias symbol ++ */ ++ .type = KERNEL_DATA_SECTIONS, ++ .addr = (unsigned long)__start_rodata, ++ .size = (unsigned long)(__end_rodata - __start_rodata), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* PKVM reserved memory: addr is physical */ ++ .type = PKVM_RESERVED_MEMORY, ++ .addr = (unsigned long)pkvm_mem_base, ++ .size = (unsigned long)pkvm_mem_size, ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_text_start, ++ .size = (unsigned long)(__pkvm_text_end - __pkvm_text_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_EXEC), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_rodata_start, ++ .size = (unsigned long)(__pkvm_rodata_end - __pkvm_rodata_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL_RO), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_data_start, ++ .size = (unsigned long)(__pkvm_data_end - __pkvm_data_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ { ++ /* PKVM section: addr is virtual */ ++ .type = PKVM_CODE_DATA_SECTIONS, ++ .addr = (unsigned long)__pkvm_bss_start, ++ .size = (unsigned long)(__pkvm_bss_end - __pkvm_bss_start), ++ .prot = (u64)pgprot_val(PAGE_KERNEL), ++ }, ++ }; ++ ++ /* ++ * First hypercall to recreate the pgtable for pkvm, and init ++ * memory pool for later use, on boot cpu. ++ * Input parameters are only needed for the first hypercall. ++ */ ++ ret = this_cpu_do_finalise_hc(sections, ARRAY_SIZE(sections)); ++ if (ret) { ++ pr_err("%s: pkvm finalise failed!\n", __func__); ++ goto out; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ if (cpu == self) ++ continue; ++ ++ /* ++ * Second hypercall to switch the mmu and ept pgtable ++ * for other cpus other than boot cpu. 
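++		 * They pass NULL/0, as the section list is only consumed by
++		 * the first hypercall on the boot CPU above.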
++ */ ++ ret = smp_call_function_single(cpu, do_pkvm_finalise, ++ NULL, true); ++ } ++ ++ ret = kvm_hypercall0(PKVM_HC_ACTIVATE_IOMMU); ++out: ++ put_cpu(); ++ ++ return ret; ++} ++ ++static int add_device_to_pkvm(struct device *dev, void *data) ++{ ++ struct kvm_protected_vm *pkvm = data; ++ struct pci_dev *pdev; ++ u16 devid; ++ ++ if (!dev_is_pci(dev)) ++ return 0; ++ ++ pdev = to_pci_dev(dev); ++ devid = PCI_DEVID(pdev->bus->number, pdev->devfn); ++ ++ return kvm_hypercall3(PKVM_HC_ADD_PTDEV, pkvm->shadow_vm_handle, devid, 0); ++} ++ ++static int pkvm_init_pci(struct pkvm_hyp *pkvm) ++{ ++ struct pci_mmcfg_region *data, *cfg; ++ int length = 0, max_region_num = PAGE_SIZE / sizeof(struct pci_mmcfg_region); ++ ++ data = pkvm_sym(pkvm_early_alloc_page)(); ++ ++ list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { ++ if (length >= max_region_num) ++ return -ENOMEM; ++ memcpy(&data[length], cfg, sizeof(struct pci_mmcfg_region)); ++ length += 1; ++ } ++ ++ pkvm->host_vm.pci_info.mmcfg_table = data; ++ pkvm->host_vm.pci_info.mmcfg_table_size = length; ++ ++ pkvm_sym(init_pci)(pkvm); ++ ++ return 0; ++} ++ ++int kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp) ++{ ++ int ret = 0; ++ ++ kvm_get_kvm(kvm); ++ ++ if (kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ ret = iommu_group_for_each_dev(grp, &kvm->pkvm, ++ add_device_to_pkvm); ++ ++ kvm_put_kvm(kvm); ++ ++ return ret; ++} ++ ++int pkvm_init_shadow_vm(struct kvm *kvm) ++{ ++ struct kvm_protected_vm *pkvm = &kvm->pkvm; ++ size_t shadow_sz; ++ void *shadow_addr; ++ int ret; ++ ++ INIT_LIST_HEAD(&kvm->pkvm.pinned_pages); ++ ++ shadow_sz = PAGE_ALIGN(PKVM_SHADOW_VM_SIZE); ++ shadow_addr = alloc_pages_exact(shadow_sz, GFP_KERNEL_ACCOUNT); ++ if (!shadow_addr) ++ return -ENOMEM; ++ ++ ret = kvm_hypercall3(PKVM_HC_INIT_SHADOW_VM, (unsigned long)kvm, ++ (unsigned long)__pa(shadow_addr), shadow_sz); ++ if (ret < 0) ++ goto free_page; ++ ++ pkvm->shadow_vm_handle = ret; ++ ++ return 0; ++free_page: ++ free_pages_exact(shadow_addr, shadow_sz); ++ return ret; ++} ++ ++void pkvm_teardown_shadow_vm(struct kvm *kvm) ++{ ++ struct kvm_protected_vm *pkvm = &kvm->pkvm; ++ struct kvm_pinned_page *ppage, *n; ++ unsigned long pa; ++ ++ pa = kvm_hypercall1(PKVM_HC_TEARDOWN_SHADOW_VM, pkvm->shadow_vm_handle); ++ if (!pa) ++ return; ++ ++ free_pages_exact(__va(pa), PAGE_ALIGN(PKVM_SHADOW_VM_SIZE)); ++ ++ if (list_empty(&pkvm->pinned_pages)) ++ return; ++ ++ list_for_each_entry_safe(ppage, n, &pkvm->pinned_pages, list) { ++ list_del(&ppage->list); ++ put_page(ppage->page); ++ kfree(ppage); ++ } ++} ++ ++int pkvm_init_shadow_vcpu(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_protected_vm *pkvm = &vcpu->kvm->pkvm; ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ s64 shadow_vcpu_handle; ++ size_t shadow_sz; ++ void *shadow_addr; ++ ++ shadow_sz = PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE); ++ shadow_addr = alloc_pages_exact(shadow_sz, GFP_KERNEL_ACCOUNT); ++ if (!shadow_addr) ++ return -ENOMEM; ++ ++ shadow_vcpu_handle = kvm_hypercall4(PKVM_HC_INIT_SHADOW_VCPU, ++ pkvm->shadow_vm_handle, (unsigned long)vmx, ++ (unsigned long)__pa(shadow_addr), shadow_sz); ++ if (shadow_vcpu_handle < 0) ++ goto free_page; ++ ++ vcpu->pkvm_shadow_vcpu_handle = shadow_vcpu_handle; ++ ++ return 0; ++ ++free_page: ++ free_pages_exact(shadow_addr, shadow_sz); ++ return -EINVAL; ++} ++ ++void pkvm_teardown_shadow_vcpu(struct kvm_vcpu *vcpu) ++{ ++ unsigned long pa = kvm_hypercall1(PKVM_HC_TEARDOWN_SHADOW_VCPU, ++ vcpu->pkvm_shadow_vcpu_handle); ++ ++ if (!pa) ++ 
return; ++ ++ free_pages_exact(__va(pa), PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE)); ++} ++ ++int pkvm_tlb_remote_flush_with_range(struct kvm *kvm, struct kvm_tlb_range *range) ++{ ++ int shadow_vm_handle = kvm->pkvm.shadow_vm_handle; ++ u64 start_gpa = 0; ++ u64 size = 0; ++ ++ if (shadow_vm_handle <= 0) ++ return -EOPNOTSUPP; ++ ++ if (range) { ++ start_gpa = range->start_gfn << PAGE_SHIFT; ++ size = range->pages * PAGE_SIZE; ++ } ++ ++ return kvm_hypercall3(PKVM_HC_TLB_REMOTE_FLUSH_RANGE, ++ shadow_vm_handle, start_gpa, size); ++} ++ ++int pkvm_tlb_remote_flush(struct kvm *kvm) ++{ ++ return pkvm_tlb_remote_flush_with_range(kvm, NULL); ++} ++ ++int pkvm_set_mmio_ve(struct kvm_vcpu *vcpu, unsigned long gfn) ++{ ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) { ++ kvm_hypercall1(PKVM_HC_SET_MMIO_VE, gfn); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int pkvm_init_io_emulation(struct pkvm_hyp *pkvm) ++{ ++ pkvm->host_vm.io_bitmap = pkvm_sym(pkvm_early_alloc_contig)(2); ++ ++ if (!pkvm->host_vm.io_bitmap) { ++ pr_err("pkvm: %s: No page for io_bitmap\n", __func__); ++ return -ENOMEM; ++ } ++ ++ memset(pkvm->host_vm.io_bitmap, 0, 2 * PAGE_SIZE); ++ ++ return 0; ++} ++ ++int __init pkvm_init(void) ++{ ++ int ret = 0, cpu; ++ ++ if(pkvm_sym(pkvm_hyp)) { ++ pr_err("pkvm hypervisor is running!"); ++ return -EBUSY; ++ } ++ ++ if (!pkvm_mem_base) { ++ pr_err("pkvm required memory not get reseved!"); ++ ret = -ENOMEM; ++ goto out; ++ } ++ pkvm_sym(pkvm_early_alloc_init)(__va(pkvm_mem_base), ++ pkvm_data_struct_pages(PKVM_GLOBAL_PAGES, PKVM_PERCPU_PAGES, ++ num_possible_cpus()) << PAGE_SHIFT); ++ ++ /* pkvm hypervisor keeps same VA mapping as deprivileged host */ ++ pkvm = pkvm_sym(pkvm_hyp) = pkvm_sym(pkvm_early_alloc_contig)(PKVM_PAGES); ++ if (!pkvm) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = pkvm_host_check_and_setup_vmx_cap(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = check_and_init_iommu(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_mmu(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_io_emulation(pkvm); ++ if (ret) ++ goto out; ++ ++ ret = pkvm_init_pci(pkvm); ++ if (ret) ++ goto out; ++ ++ for_each_possible_cpu(cpu) { ++ ret = pkvm_setup_pcpu(pkvm, cpu); ++ if (ret) ++ goto out; ++ ret = pkvm_host_setup_vcpu(pkvm, cpu); ++ if (ret) ++ goto out; ++ } ++ ++ ret = pkvm_host_deprivilege_cpus(pkvm); ++ if (ret) ++ goto out; ++ ++ pkvm->num_cpus = num_possible_cpus(); ++ pkvm_init_debugfs(); ++ ++ return pkvm_init_finalise(); ++ ++out: ++ pkvm_sym(pkvm_hyp) = NULL; ++ return ret; ++} +diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c +index 2251b60920f8..6ab29b869914 100644 +--- a/arch/x86/kvm/vmx/vmcs12.c ++++ b/arch/x86/kvm/vmx/vmcs12.c +@@ -112,6 +112,8 @@ const unsigned short vmcs12_field_offsets[] = { + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), ++ FIELD(PLE_GAP, ple_gap), ++ FIELD(PLE_WINDOW, ple_window), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), +@@ -150,5 +152,9 @@ const unsigned short vmcs12_field_offsets[] = { + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), ++ FIELD(EXIT_IO_RCX, exit_io_rcx), ++ FIELD(EXIT_IO_RSI, exit_io_rsi), ++ FIELD(EXIT_IO_RDI, exit_io_rdi), ++ FIELD(EXIT_IO_RIP, exit_io_rip), + }; + const unsigned int nr_vmcs12_fields = 
ARRAY_SIZE(vmcs12_field_offsets); +diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h +index 746129ddd5ae..86299ccc97e7 100644 +--- a/arch/x86/kvm/vmx/vmcs12.h ++++ b/arch/x86/kvm/vmx/vmcs12.h +@@ -117,7 +117,11 @@ struct __packed vmcs12 { + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; +- natural_width paddingl[8]; /* room for future expansion */ ++ natural_width exit_io_rcx; ++ natural_width exit_io_rsi; ++ natural_width exit_io_rdi; ++ natural_width exit_io_rip; ++ natural_width paddingl[4]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; +@@ -165,7 +169,9 @@ struct __packed vmcs12 { + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; +- u32 padding32[7]; /* room for future expansion */ ++ u32 ple_gap; ++ u32 ple_window; ++ u32 padding32[5]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; +@@ -293,6 +299,10 @@ static inline void vmx_check_vmcs12_offsets(void) + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); ++ CHECK_OFFSET(exit_io_rcx, 680); ++ CHECK_OFFSET(exit_io_rsi, 688); ++ CHECK_OFFSET(exit_io_rdi, 696); ++ CHECK_OFFSET(exit_io_rip, 704); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); +@@ -340,6 +350,8 @@ static inline void vmx_check_vmcs12_offsets(void) + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); ++ CHECK_OFFSET(ple_gap, 932); ++ CHECK_OFFSET(ple_window, 936); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 31a10d774df6..816e3be927f4 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -48,6 +48,7 @@ + #include + #include + #include ++#include + + #include "capabilities.h" + #include "cpuid.h" +@@ -66,6 +67,7 @@ + #include "vmcs12.h" + #include "vmx.h" + #include "x86.h" ++#include "vmx_lib.h" + + MODULE_AUTHOR("Qumranet"); + MODULE_LICENSE("GPL"); +@@ -119,9 +121,6 @@ module_param(nested, bool, S_IRUGO); + bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + +-static bool __read_mostly error_on_inconsistent_vmcs_config = true; +-module_param(error_on_inconsistent_vmcs_config, bool, 0444); +- + static bool __read_mostly dump_invalid_vmcs = 0; + module_param(dump_invalid_vmcs, bool, 0644); + +@@ -2536,6 +2535,11 @@ static void vmx_hardware_disable(void) + intel_pt_handle_vmx(0); + } + ++void free_vmcs(struct vmcs *vmcs) ++{ ++ free_page((unsigned long)vmcs); ++} ++ + /* + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID + * directly instead of going through cpu_has(), to ensure KVM is trapping +@@ -2571,192 +2575,31 @@ static bool cpu_has_perf_global_ctrl_bug(void) + return false; + } + +-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, +- u32 msr, u32 *result) +-{ +- u32 vmx_msr_low, vmx_msr_high; +- u32 ctl = ctl_min | ctl_opt; +- +- rdmsr(msr, vmx_msr_low, vmx_msr_high); +- +- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ +- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ +- +- /* Ensure minimum (required) set of control bits are supported. 
*/ +- if (ctl_min & ~ctl) +- return -EIO; +- +- *result = ctl; +- return 0; +-} +- +-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +-{ +- u64 allowed; +- +- rdmsrl(msr, allowed); +- +- return ctl_opt & allowed; +-} +- + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) + { +- u32 vmx_msr_low, vmx_msr_high; +- u32 _pin_based_exec_control = 0; +- u32 _cpu_based_exec_control = 0; +- u32 _cpu_based_2nd_exec_control = 0; +- u64 _cpu_based_3rd_exec_control = 0; +- u32 _vmexit_control = 0; +- u32 _vmentry_control = 0; +- u64 misc_msr; +- int i; +- +- /* +- * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. +- * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always +- * intercepts writes to PAT and EFER, i.e. never enables those controls. +- */ +- struct { +- u32 entry_control; +- u32 exit_control; +- } const vmcs_entry_exit_pairs[] = { +- { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, +- { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, +- { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, +- { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, +- { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, ++ struct vmcs_config_setting setting = { ++ .cpu_based_exec_ctrl_min = KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, ++ .cpu_based_exec_ctrl_opt = KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, ++ .cpu_based_2nd_exec_ctrl_min = KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, ++ .cpu_based_2nd_exec_ctrl_opt = KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, ++ .pin_based_exec_ctrl_min = KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, ++ .pin_based_exec_ctrl_opt = KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, ++ .vmexit_ctrl_min = KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, ++ .vmexit_ctrl_opt = KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, ++ .vmentry_ctrl_min = KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, ++ .vmentry_ctrl_opt = KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, ++ .has_broken_vmx_preemption_timer = cpu_has_broken_vmx_preemption_timer(), ++ .perf_global_ctrl_workaround = cpu_has_perf_global_ctrl_bug(), + }; +- +- memset(vmcs_conf, 0, sizeof(*vmcs_conf)); +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS, +- &_cpu_based_exec_control)) +- return -EIO; +- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS2, +- &_cpu_based_2nd_exec_control)) +- return -EIO; +- } +-#ifndef CONFIG_X86_64 +- if (!(_cpu_based_2nd_exec_control & +- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) +- _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +-#endif +- +- if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) +- _cpu_based_2nd_exec_control &= ~( +- SECONDARY_EXEC_APIC_REGISTER_VIRT | +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +- +- rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, +- &vmx_cap->ept, &vmx_cap->vpid); +- +- if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && +- vmx_cap->ept) { +- pr_warn_once("EPT CAP should not exist if not support " +- "1-setting enable EPT VM-execution control\n"); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- vmx_cap->ept = 0; +- } +- if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && +- vmx_cap->vpid) { +- pr_warn_once("VPID CAP should not exist if not support " 
+- "1-setting enable VPID VM-execution control\n"); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- vmx_cap->vpid = 0; +- } ++ int ret; + + if (!cpu_has_sgx()) +- _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; +- +- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) +- _cpu_based_3rd_exec_control = +- adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PROCBASED_CTLS3); +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, +- KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, +- MSR_IA32_VMX_EXIT_CTLS, +- &_vmexit_control)) +- return -EIO; +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, +- KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, +- MSR_IA32_VMX_PINBASED_CTLS, +- &_pin_based_exec_control)) +- return -EIO; ++ setting.cpu_based_2nd_exec_ctrl_opt &= ~SECONDARY_EXEC_ENCLS_EXITING; + +- if (cpu_has_broken_vmx_preemption_timer()) +- _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; +- if (!(_cpu_based_2nd_exec_control & +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) +- _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; +- +- if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, +- KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, +- MSR_IA32_VMX_ENTRY_CTLS, +- &_vmentry_control)) +- return -EIO; +- +- for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { +- u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; +- u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; +- +- if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) +- continue; +- +- pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", +- _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); +- +- if (error_on_inconsistent_vmcs_config) +- return -EIO; +- +- _vmentry_control &= ~n_ctrl; +- _vmexit_control &= ~x_ctrl; +- } +- +- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); +- +- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ +- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) +- return -EIO; +- +-#ifdef CONFIG_X86_64 +- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ +- if (vmx_msr_high & (1u<<16)) +- return -EIO; +-#endif +- +- /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ +- if (((vmx_msr_high >> 18) & 15) != 6) +- return -EIO; +- +- rdmsrl(MSR_IA32_VMX_MISC, misc_msr); +- +- vmcs_conf->size = vmx_msr_high & 0x1fff; +- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; +- +- vmcs_conf->revision_id = vmx_msr_low; +- +- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; +- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; +- vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; +- vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; +- vmcs_conf->vmexit_ctrl = _vmexit_control; +- vmcs_conf->vmentry_ctrl = _vmentry_control; +- vmcs_conf->misc = misc_msr; ++ ret = __setup_vmcs_config(vmcs_conf, vmx_cap, &setting); ++ if (ret < 0) ++ return ret; + + return 0; + } +@@ -2784,11 +2627,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) + return vmcs; + } + +-void free_vmcs(struct vmcs *vmcs) +-{ +- free_page((unsigned long)vmcs); +-} +- + /* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +@@ -4847,18 +4685,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + + static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) + { +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); ++ _vmx_enable_irq_window(to_vmx(vcpu)); + } + + static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) + { +- if (!enable_vnmi || +- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { +- vmx_enable_irq_window(vcpu); +- return; +- } +- +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++ _vmx_enable_nmi_window(to_vmx(vcpu), enable_vnmi); + } + + static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +@@ -7319,6 +7151,8 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu) + free_vpid(vmx->vpid); + nested_vmx_free_vcpu(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); ++ ++ pkvm_teardown_shadow_vcpu(vcpu); + } + + static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +@@ -7416,7 +7250,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + +- return 0; ++ return pkvm_init_shadow_vcpu(vcpu); + + free_vmcs: + free_loaded_vmcs(vmx->loaded_vmcs); +@@ -7427,6 +7261,15 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) + return err; + } + ++static bool vmx_is_vm_type_supported(unsigned long type) ++{ ++#ifdef CONFIG_PKVM_INTEL ++ if (type == KVM_X86_PROTECTED_VM) ++ return true; ++#endif ++ return type == KVM_X86_DEFAULT_VM; ++} ++ + #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + +@@ -7458,7 +7301,13 @@ static int vmx_vm_init(struct kvm *kvm) + break; + } + } +- return 0; ++ ++ return pkvm_init_shadow_vm(kvm); ++} ++ ++static void vmx_vm_free(struct kvm *kvm) ++{ ++ pkvm_teardown_shadow_vm(kvm); + } + + static int __init vmx_check_processor_compat(void) +@@ -8106,9 +7955,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { + .hardware_disable = vmx_hardware_disable, + .has_emulated_msr = vmx_has_emulated_msr, + ++ .is_vm_type_supported = vmx_is_vm_type_supported, + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, ++ .vm_free = vmx_vm_free, + + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, +@@ -8385,6 +8236,17 @@ static __init int hardware_setup(void) + } + #endif + ++#if IS_ENABLED(CONFIG_PKVM_INTEL) ++ if (!enable_ept || vmx_x86_ops.tlb_remote_flush || ++ vmx_x86_ops.tlb_remote_flush_with_range) { ++ pr_err_ratelimited("kvm: EPT or tlb_remote_flush ops not available to pKVM-IA\n"); ++ return -EOPNOTSUPP; ++ } ++ vmx_x86_ops.tlb_remote_flush = pkvm_tlb_remote_flush; ++ vmx_x86_ops.tlb_remote_flush_with_range = ++ pkvm_tlb_remote_flush_with_range; ++#endif ++ + if (!cpu_has_vmx_ple()) { + ple_gap = 0; + ple_window = 0; +@@ -8496,6 +8358,9 @@ static __init int hardware_setup(void) + } + + static struct kvm_x86_init_ops vmx_init_ops __initdata = { ++#ifdef CONFIG_PKVM_INTEL ++ .pkvm_init = pkvm_init, ++#endif + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .check_processor_compatibility = vmx_check_processor_compat, +diff --git a/arch/x86/kvm/vmx/vmx_lib.h b/arch/x86/kvm/vmx/vmx_lib.h +new file mode 100644 +index 000000000000..38bae15db417 +--- /dev/null ++++ b/arch/x86/kvm/vmx/vmx_lib.h +@@ -0,0 +1,241 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __KVM_X86_VMX_LIB_H ++#define __KVM_X86_VMX_LIB_H ++ ++static bool __read_mostly error_on_inconsistent_vmcs_config = true; ++module_param(error_on_inconsistent_vmcs_config, bool, 0444); ++ ++#ifndef __PKVM_HYP__ ++struct vmcs_config_setting { ++ u32 cpu_based_exec_ctrl_min; ++ u32 cpu_based_exec_ctrl_opt; ++ u32 cpu_based_2nd_exec_ctrl_min; ++ u32 cpu_based_2nd_exec_ctrl_opt; ++ u32 pin_based_exec_ctrl_min; ++ u32 pin_based_exec_ctrl_opt; ++ u32 vmexit_ctrl_min; ++ u32 vmexit_ctrl_opt; ++ u32 vmentry_ctrl_min; ++ u32 vmentry_ctrl_opt; ++ bool has_broken_vmx_preemption_timer; ++ bool perf_global_ctrl_workaround; ++}; ++ ++ ++static inline u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) ++{ ++ u64 allowed; ++ ++ rdmsrl(msr, allowed); ++ ++ return ctl_opt & allowed; ++} ++ ++__attribute__((optimize(0))) ++static inline int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, ++ u32 msr, u32 *result) ++{ ++ u32 vmx_msr_low, vmx_msr_high; ++ u32 ctl = ctl_min | ctl_opt; ++ ++ rdmsr(msr, vmx_msr_low, vmx_msr_high); ++ ++ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ++ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ ++ ++ /* Ensure minimum (required) set of control bits are supported. 
*/ ++ if (ctl_min & ~ctl) ++ return -EIO; ++ ++ *result = ctl; ++ return 0; ++} ++ ++__attribute__((optimize(1))) ++static inline int __setup_vmcs_config(struct vmcs_config *vmcs_conf, ++ struct vmx_capability *vmx_cap, ++ struct vmcs_config_setting *setting) ++{ ++ u32 vmx_msr_low, vmx_msr_high; ++ u32 min, opt; ++ u32 _pin_based_exec_control = 0; ++ u32 _cpu_based_exec_control = 0; ++ u32 _cpu_based_2nd_exec_control = 0; ++ u64 _cpu_based_3rd_exec_control = 0; ++ u32 _vmexit_control = 0; ++ u32 _vmentry_control = 0; ++ u64 misc_msr; ++ int i; ++ ++ /* ++ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. ++ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always ++ * intercepts writes to PAT and EFER, i.e. never enables those controls. ++ */ ++ struct { ++ u32 entry_control; ++ u32 exit_control; ++ } const vmcs_entry_exit_pairs[] = { ++ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, ++ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, ++ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, ++ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, ++ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, ++ }; ++ ++ memset(vmcs_conf, 0, sizeof(*vmcs_conf)); ++ ++ min = setting->cpu_based_exec_ctrl_min; ++ opt = setting->cpu_based_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, ++ &_cpu_based_exec_control) < 0) ++ return -EIO; ++ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { ++ min = setting->cpu_based_2nd_exec_ctrl_min; ++ opt = setting->cpu_based_2nd_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, ++ MSR_IA32_VMX_PROCBASED_CTLS2, ++ &_cpu_based_2nd_exec_control) < 0) ++ return -EIO; ++ } ++#ifndef CONFIG_X86_64 ++ if (!(_cpu_based_2nd_exec_control & ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) ++ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; ++#endif ++ ++ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) ++ _cpu_based_2nd_exec_control &= ~( ++ SECONDARY_EXEC_APIC_REGISTER_VIRT | ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); ++ ++ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ++ &vmx_cap->ept, &vmx_cap->vpid); ++ ++ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && ++ vmx_cap->ept) { ++ pr_warn_once("EPT CAP should not exist if not support " ++ "1-setting enable EPT VM-execution control\n"); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ vmx_cap->ept = 0; ++ } ++ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && ++ vmx_cap->vpid) { ++ pr_warn_once("VPID CAP should not exist if not support " ++ "1-setting enable VPID VM-execution control\n"); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ vmx_cap->vpid = 0; ++ } ++ ++ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) ++ _cpu_based_3rd_exec_control = ++ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, ++ MSR_IA32_VMX_PROCBASED_CTLS3); ++ ++ ++ min = setting->vmexit_ctrl_min; ++ opt = setting->vmexit_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, ++ &_vmexit_control) < 0) ++ return -EIO; ++ ++ min = setting->pin_based_exec_ctrl_min; ++ opt = setting->pin_based_exec_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, ++ &_pin_based_exec_control) < 0) ++ return -EIO; ++ ++ if (setting->has_broken_vmx_preemption_timer) ++ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; ++ if (!(_cpu_based_2nd_exec_control & ++ 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) ++ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; ++ ++ min = setting->vmentry_ctrl_min; ++ opt = setting->vmentry_ctrl_opt; ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, ++ &_vmentry_control) < 0) ++ return -EIO; ++ ++ if (setting->perf_global_ctrl_workaround) { ++ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ++ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; ++ pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " ++ "does not work properly. Using workaround\n"); ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { ++ u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; ++ u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; ++ ++ if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) ++ continue; ++ ++ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", ++ _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); ++ ++ if (error_on_inconsistent_vmcs_config) ++ return -EIO; ++ ++ _vmentry_control &= ~n_ctrl; ++ _vmexit_control &= ~x_ctrl; ++ } ++ ++ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); ++ ++ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ ++ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) ++ return -EIO; ++ ++#ifdef CONFIG_X86_64 ++ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ ++ if (vmx_msr_high & (1u<<16)) ++ return -EIO; ++#endif ++ ++ /* Require Write-Back (WB) memory type for VMCS accesses. */ ++ if (((vmx_msr_high >> 18) & 15) != 6) ++ return -EIO; ++ ++ rdmsrl(MSR_IA32_VMX_MISC, misc_msr); ++ ++ vmcs_conf->size = vmx_msr_high & 0x1fff; ++ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; ++ ++ vmcs_conf->revision_id = vmx_msr_low; ++ ++ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; ++ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; ++ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; ++ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; ++ vmcs_conf->vmexit_ctrl = _vmexit_control; ++ vmcs_conf->vmentry_ctrl = _vmentry_control; ++ ++ return 0; ++} ++#endif /* !__PKVM_HYP__*/ ++ ++static inline void _vmx_enable_irq_window(struct vcpu_vmx *vmx) ++{ ++ exec_controls_setbit(vmx, CPU_BASED_INTR_WINDOW_EXITING); ++} ++ ++static inline void _vmx_enable_nmi_window(struct vcpu_vmx *vmx, bool vnmi_enabled) ++{ ++ if (!vnmi_enabled || ++ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { ++ _vmx_enable_irq_window(vmx); ++ return; ++ } ++ ++ exec_controls_setbit(vmx, CPU_BASED_NMI_WINDOW_EXITING); ++} ++ ++#endif +diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h +index ec268df83ed6..6ca9ef128033 100644 +--- a/arch/x86/kvm/vmx/vmx_ops.h ++++ b/arch/x86/kvm/vmx/vmx_ops.h +@@ -67,6 +67,12 @@ static __always_inline void vmcs_checkl(unsigned long field) + "Natural width accessor invalid for 32-bit field"); + } + ++#ifdef __PKVM_HYP__ ++ ++#include "pkvm/hyp/vmx_ops.h" ++ ++#else ++ + static __always_inline unsigned long __vmcs_readl(unsigned long field) + { + unsigned long value; +@@ -278,6 +284,7 @@ static inline void vmcs_load(struct vmcs *vmcs) + + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); + } ++#endif /*__PKVM_HYP__*/ + + static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) + { +@@ -312,6 +319,12 @@ static inline void vpid_sync_vcpu_global(void) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + } + ++static inline void ept_sync_global(void) ++{ ++ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); ++} ++ ++#ifndef 
__PKVM_HYP__ + static inline void vpid_sync_context(int vpid) + { + if (cpu_has_vmx_invvpid_single()) +@@ -331,11 +344,6 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) + vpid_sync_context(vpid); + } + +-static inline void ept_sync_global(void) +-{ +- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +-} +- + static inline void ept_sync_context(u64 eptp) + { + if (cpu_has_vmx_invept_context()) +@@ -343,5 +351,6 @@ static inline void ept_sync_context(u64 eptp) + else + ept_sync_global(); + } ++#endif /* __PKVM_HYP__ */ + + #endif /* __KVM_X86_VMX_INSN_H */ +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 4d6baae1ae74..a8f7fb5729bd 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4549,6 +4549,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; ++ case KVM_CAP_VM_TYPES: ++ r = BIT(KVM_X86_DEFAULT_VM); ++ if (static_call(kvm_x86_is_vm_type_supported)(KVM_X86_PROTECTED_VM)) ++ r |= BIT(KVM_X86_PROTECTED_VM); ++ break; + default: + break; + } +@@ -9404,6 +9409,14 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) + return -EEXIST; + } + ++#ifdef CONFIG_PKVM_INTEL ++ r = ops->pkvm_init(); ++ if (r) { ++ pr_err_ratelimited("kvm: pkvm init fail\n"); ++ return r; ++ } ++#endif ++ + if (!ops->cpu_has_kvm_support()) { + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); +@@ -9702,11 +9715,53 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) + return kvm_skip_emulated_instruction(vcpu); + } + ++int kvm_pkvm_hypercall(struct kvm_vcpu *vcpu) ++{ ++ unsigned long val, nr; ++ int size; ++ gpa_t gpa; ++ int ret; ++ ++ nr = kvm_rax_read(vcpu); ++ gpa = kvm_rbx_read(vcpu); ++ size = kvm_rcx_read(vcpu); ++ val = kvm_rdx_read(vcpu); ++ ++ /* ++ * Reuse the sev_es handler to emulate the mmio. ++ */ ++ switch (nr) { ++ case PKVM_GHC_IOREAD: ++ vcpu->mmio_is_write = 0; ++ ret = kvm_sev_es_mmio_read(vcpu, gpa, size, ++ &vcpu->arch.regs[VCPU_REGS_RAX]); ++ break; ++ case PKVM_GHC_IOWRITE: ++ vcpu->mmio_is_write = 1; ++ ret = kvm_sev_es_mmio_write(vcpu, gpa, size, &val); ++ break; ++ default: ++ ret = 1; ++ break; ++ } ++ ++ /* ++ * We assume calling this function will always succeed which will update ++ * the GUEST_RIP to skip the current instruction. 
++ */ ++ static_call(kvm_x86_skip_emulated_instruction)(vcpu); ++ ++ return ret; ++} ++ + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) + { + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + ++ if (vcpu->kvm->arch.vm_type == KVM_X86_PROTECTED_VM) ++ return kvm_pkvm_hypercall(vcpu); ++ + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + +@@ -12446,9 +12501,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + int ret; + unsigned long flags; + +- if (type) ++ if (!static_call(kvm_x86_is_vm_type_supported)(type)) + return -EINVAL; + ++ kvm->arch.vm_type = type; ++ + ret = kvm_page_track_init(kvm); + if (ret) + goto out; +@@ -12641,6 +12698,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) + kvm_page_track_cleanup(kvm); + kvm_xen_destroy_vm(kvm); + kvm_hv_destroy_vm(kvm); ++ static_call_cond(kvm_x86_vm_free)(kvm); + } + + static void memslot_rmap_free(struct kvm_memory_slot *slot) +diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c +index 5f0ce77a259d..0d619878a0aa 100644 +--- a/arch/x86/mm/pat/set_memory.c ++++ b/arch/x86/mm/pat/set_memory.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + #include "../mm_internal.h" + +@@ -2121,6 +2122,9 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) + + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) + { ++ if (pkvm_is_protected_guest()) ++ return pkvm_set_mem_host_visibility(addr, numpages, enc); ++ + if (hv_is_isolation_supported()) + return hv_set_mem_host_visibility(addr, numpages, !enc); + +diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c +index 1f925285104e..30d9954ffb60 100644 +--- a/drivers/iommu/intel/debugfs.c ++++ b/drivers/iommu/intel/debugfs.c +@@ -136,13 +136,13 @@ static int iommu_regset_show(struct seq_file *m, void *unused) + */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) { +- value = dmar_readl(iommu->reg + iommu_regs_32[i].offset); ++ value = dmar_readl(iommu, iommu_regs_32[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_32[i].regs, iommu_regs_32[i].offset, + value); + } + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) { +- value = dmar_readq(iommu->reg + iommu_regs_64[i].offset); ++ value = dmar_readq(iommu, iommu_regs_64[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_64[i].regs, iommu_regs_64[i].offset, + value); +@@ -250,7 +250,7 @@ static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus) + tbl_wlk.ctx_entry = context; + m->private = &tbl_wlk; + +- if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { ++ if (dmar_readq(iommu, DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + pasid_dir_size = get_pasid_dir_size(context); + pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size); +@@ -288,7 +288,7 @@ static int dmar_translation_struct_show(struct seq_file *m, void *unused) + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { +- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); +@@ -441,8 +441,8 @@ static int invalidation_queue_show(struct seq_file *m, void *unused) + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), +- dmar_readq(iommu->reg + DMAR_IQH_REG) >> 
shift, +- dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); ++ dmar_readq(iommu, DMAR_IQH_REG) >> shift, ++ dmar_readq(iommu, DMAR_IQT_REG) >> shift); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); +@@ -523,7 +523,7 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused) + seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n", + iommu->name); + +- sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (iommu->ir_table && (sts & DMA_GSTS_IRES)) { + irta = virt_to_phys(iommu->ir_table->base); + seq_printf(m, " IR table address:%llx\n", irta); +diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c +index f800989ea046..36a22c0a6f30 100644 +--- a/drivers/iommu/intel/dmar.c ++++ b/drivers/iommu/intel/dmar.c +@@ -878,7 +878,7 @@ static int __ref + dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) + { + struct acpi_dmar_hardware_unit *drhd; +- void __iomem *addr; ++ struct intel_iommu iommu; + u64 cap, ecap; + + drhd = (void *)entry; +@@ -887,22 +887,23 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) + return -EINVAL; + } + ++ iommu.reg_phys = drhd->address; + if (arg) +- addr = ioremap(drhd->address, VTD_PAGE_SIZE); ++ iommu.reg = ioremap(drhd->address, VTD_PAGE_SIZE); + else +- addr = early_ioremap(drhd->address, VTD_PAGE_SIZE); +- if (!addr) { ++ iommu.reg = early_ioremap(drhd->address, VTD_PAGE_SIZE); ++ if (!iommu.reg) { + pr_warn("Can't validate DRHD address: %llx\n", drhd->address); + return -EINVAL; + } + +- cap = dmar_readq(addr + DMAR_CAP_REG); +- ecap = dmar_readq(addr + DMAR_ECAP_REG); ++ cap = dmar_readq(&iommu, DMAR_CAP_REG); ++ ecap = dmar_readq(&iommu, DMAR_ECAP_REG); + + if (arg) +- iounmap(addr); ++ iounmap(iommu.reg); + else +- early_iounmap(addr, VTD_PAGE_SIZE); ++ early_iounmap(iommu.reg, VTD_PAGE_SIZE); + + if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) { + warn_invalid_dmar(drhd->address, " returns all ones"); +@@ -981,16 +982,19 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) + goto release; + } + +- iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); +- iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); ++ iommu->cap = dmar_readq(iommu, DMAR_CAP_REG); ++ iommu->ecap = dmar_readq(iommu, DMAR_ECAP_REG); + + if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { + err = -EINVAL; + warn_invalid_dmar(phys_addr, " returns all ones"); + goto unmap; + } ++#ifdef CONFIG_PKVM_INTEL ++ pkvm_update_iommu_virtual_caps(&iommu->cap, &iommu->ecap); ++#endif + if (ecap_vcs(iommu->ecap)) +- iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); ++ iommu->vccap = dmar_readq(iommu, DMAR_VCCAP_REG); + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), +@@ -1087,7 +1091,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) + + iommu->node = NUMA_NO_NODE; + +- ver = readl(iommu->reg + DMAR_VER_REG); ++ ver = dmar_readl(iommu, DMAR_VER_REG); + pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", + iommu->name, + (unsigned long long)drhd->reg_base_addr, +@@ -1096,7 +1100,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) + (unsigned long long)iommu->ecap); + + /* Reflect status in gcmd */ +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (sts & DMA_GSTS_IRES) + iommu->gcmd |= DMA_GCMD_IRE; + if (sts & DMA_GSTS_TES) +@@ -1211,8 +1215,8 @@ static const char 
*qi_type_string(u8 type) + + static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) + { +- unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); +- u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); ++ unsigned int head = dmar_readl(iommu, DMAR_IQH_REG); ++ u64 iqe_err = dmar_readq(iommu, DMAR_IQER_REG); + struct qi_desc *desc = iommu->qi->desc + head; + + if (fault & DMA_FSTS_IQE) +@@ -1250,7 +1254,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + +- fault = readl(iommu->reg + DMAR_FSTS_REG); ++ fault = dmar_readl(iommu, DMAR_FSTS_REG); + if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE)) + qi_dump_fault(iommu, fault); + +@@ -1260,7 +1264,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + * is cleared. + */ + if (fault & DMA_FSTS_IQE) { +- head = readl(iommu->reg + DMAR_IQH_REG); ++ head = dmar_readl(iommu, DMAR_IQH_REG); + if ((head >> shift) == index) { + struct qi_desc *desc = qi->desc + head; + +@@ -1271,7 +1275,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + */ + memcpy(desc, qi->desc + (wait_index << shift), + 1 << shift); +- writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_IQE); + pr_info("Invalidation Queue Error (IQE) cleared\n"); + return -EINVAL; + } +@@ -1282,13 +1286,13 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + * No new descriptors are fetched until the ITE is cleared. + */ + if (fault & DMA_FSTS_ITE) { +- head = readl(iommu->reg + DMAR_IQH_REG); ++ head = dmar_readl(iommu, DMAR_IQH_REG); + head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + head |= 1; +- tail = readl(iommu->reg + DMAR_IQT_REG); ++ tail = dmar_readl(iommu, DMAR_IQT_REG); + tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + +- writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_ITE); + pr_info("Invalidation Time-out Error (ITE) cleared\n"); + + do { +@@ -1302,7 +1306,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) + } + + if (fault & DMA_FSTS_ICE) { +- writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, DMA_FSTS_ICE); + pr_info("Invalidation Completion Error (ICE) cleared\n"); + } + +@@ -1393,7 +1397,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + * update the HW tail register indicating the presence of + * new descriptors. + */ +- writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); ++ dmar_writel(iommu, DMAR_IQT_REG, qi->free_head << shift); + + while (qi->desc_status[wait_index] != QI_DONE) { + /* +@@ -1621,22 +1625,22 @@ void dmar_disable_qi(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_QIES)) + goto end; + + /* + * Give a chance to HW to complete the pending invalidation requests. 
+ */ +- while ((readl(iommu->reg + DMAR_IQT_REG) != +- readl(iommu->reg + DMAR_IQH_REG)) && ++ while ((dmar_readl(iommu, DMAR_IQT_REG) != ++ dmar_readl(iommu, DMAR_IQH_REG)) && + (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) + cpu_relax(); + + iommu->gcmd &= ~DMA_GCMD_QIE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + +- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, + !(sts & DMA_GSTS_QIES), sts); + end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +@@ -1665,15 +1669,15 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + /* write zero to the tail reg */ +- writel(0, iommu->reg + DMAR_IQT_REG); ++ dmar_writel(iommu, DMAR_IQT_REG, 0); + +- dmar_writeq(iommu->reg + DMAR_IQA_REG, val); ++ dmar_writeq(iommu, DMAR_IQA_REG, val); + + iommu->gcmd |= DMA_GCMD_QIE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ +- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts); ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_QIES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1865,9 +1869,9 @@ void dmar_msi_unmask(struct irq_data *data) + + /* unmask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(0, iommu->reg + reg); ++ dmar_writel(iommu, reg, 0); + /* Read a reg to force flush the post write */ +- readl(iommu->reg + reg); ++ dmar_readl(iommu, reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1879,9 +1883,9 @@ void dmar_msi_mask(struct irq_data *data) + + /* mask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(DMA_FECTL_IM, iommu->reg + reg); ++ dmar_writel(iommu, reg, DMA_FECTL_IM); + /* Read a reg to force flush the post write */ +- readl(iommu->reg + reg); ++ dmar_readl(iommu, reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1892,9 +1896,9 @@ void dmar_msi_write(int irq, struct msi_msg *msg) + unsigned long flag; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(msg->data, iommu->reg + reg + 4); +- writel(msg->address_lo, iommu->reg + reg + 8); +- writel(msg->address_hi, iommu->reg + reg + 12); ++ dmar_writel(iommu, reg + 4, msg->data); ++ dmar_writel(iommu, reg + 8, msg->address_lo); ++ dmar_writel(iommu, reg + 12, msg->address_hi); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1905,9 +1909,9 @@ void dmar_msi_read(int irq, struct msi_msg *msg) + unsigned long flag; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- msg->data = readl(iommu->reg + reg + 4); +- msg->address_lo = readl(iommu->reg + reg + 8); +- msg->address_hi = readl(iommu->reg + reg + 12); ++ msg->data = dmar_readl(iommu, reg + 4); ++ msg->address_lo = dmar_readl(iommu, reg + 8); ++ msg->address_hi = dmar_readl(iommu, reg + 12); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + +@@ -1959,7 +1963,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + DEFAULT_RATELIMIT_BURST); + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); ++ fault_status = dmar_readl(iommu, DMAR_FSTS_REG); + if (fault_status && __ratelimit(&rs)) + pr_err("DRHD: handling fault status reg %x\n", fault_status); + +@@ -1981,7 +1985,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + bool pasid_present; + 
+ /* highest 32 bits */ +- data = readl(iommu->reg + reg + ++ data = dmar_readl(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if (!(data & DMA_FRCD_F)) + break; +@@ -1991,19 +1995,19 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + type = dma_frcd_type(data); + + pasid = dma_frcd_pasid_value(data); +- data = readl(iommu->reg + reg + ++ data = dmar_readl(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + pasid_present = dma_frcd_pasid_present(data); +- guest_addr = dmar_readq(iommu->reg + reg + ++ guest_addr = dmar_readq(iommu, reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + } + + /* clear the fault */ +- writel(DMA_FRCD_F, iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN + 12); ++ dmar_writel(iommu, reg + ++ fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + +@@ -2019,8 +2023,8 @@ irqreturn_t dmar_fault(int irq, void *dev_id) + raw_spin_lock_irqsave(&iommu->register_lock, flag); + } + +- writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO, +- iommu->reg + DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, ++ DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO); + + unlock_exit: + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +@@ -2073,8 +2077,8 @@ int __init enable_drhd_fault_handling(void) + * Clear any previous faults. + */ + dmar_fault(iommu->irq, iommu); +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); +- writel(fault_status, iommu->reg + DMAR_FSTS_REG); ++ fault_status = dmar_readl(iommu, DMAR_FSTS_REG); ++ dmar_writel(iommu, DMAR_FSTS_REG, fault_status); + } + + return 0; +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index 5c4f5aa8e87e..f706d7c36207 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -309,7 +309,7 @@ static void init_translation_status(struct intel_iommu *iommu) + { + u32 gsts; + +- gsts = readl(iommu->reg + DMAR_GSTS_REG); ++ gsts = dmar_readl(iommu, DMAR_GSTS_REG); + if (gsts & DMA_GSTS_TES) + iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; + } +@@ -1227,13 +1227,13 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) + addr |= DMA_RTADDR_SMT; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); ++ dmar_writeq(iommu, DMAR_RTADDR_REG, addr); + +- writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_SRTP); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_RTPS), sts); ++ dmar_readl, (sts & DMA_GSTS_RTPS), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + +@@ -1259,11 +1259,11 @@ void iommu_flush_write_buffer(struct intel_iommu *iommu) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_WBF); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (!(val & DMA_GSTS_WBFS)), val); ++ dmar_readl, (!(val & DMA_GSTS_WBFS)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -1293,7 +1293,7 @@ static void __iommu_flush_context(struct intel_iommu *iommu, + val |= DMA_CCMD_ICC; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); +- dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); ++ dmar_writeq(iommu, DMAR_CCMD_REG, val); + + /* 
Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, +@@ -1341,8 +1341,8 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + raw_spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) +- dmar_writeq(iommu->reg + tlb_offset, val_iva); +- dmar_writeq(iommu->reg + tlb_offset + 8, val); ++ dmar_writeq(iommu, tlb_offset, val_iva); ++ dmar_writeq(iommu, tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, +@@ -1619,13 +1619,13 @@ static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- pmen = readl(iommu->reg + DMAR_PMEN_REG); ++ pmen = dmar_readl(iommu, DMAR_PMEN_REG); + pmen &= ~DMA_PMEN_EPM; +- writel(pmen, iommu->reg + DMAR_PMEN_REG); ++ dmar_writel(iommu, DMAR_PMEN_REG, pmen); + + /* wait for the protected region status bit to clear */ + IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, +- readl, !(pmen & DMA_PMEN_PRS), pmen); ++ dmar_readl, !(pmen & DMA_PMEN_PRS), pmen); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1637,11 +1637,11 @@ static void iommu_enable_translation(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_TES), sts); ++ dmar_readl, (sts & DMA_GSTS_TES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + } +@@ -1657,11 +1657,11 @@ static void iommu_disable_translation(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (!(sts & DMA_GSTS_TES)), sts); ++ dmar_readl, (!(sts & DMA_GSTS_TES)), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -2764,7 +2764,7 @@ static int copy_translation_tables(struct intel_iommu *iommu) + int bus, ret; + bool new_ext, ext; + +- rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); ++ rtaddr_reg = dmar_readq(iommu, DMAR_RTADDR_REG); + ext = !!(rtaddr_reg & DMA_RTADDR_SMT); + new_ext = !!sm_supported(iommu); + +@@ -3171,13 +3171,13 @@ static int iommu_suspend(void) + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + iommu->iommu_state[SR_DMAR_FECTL_REG] = +- readl(iommu->reg + DMAR_FECTL_REG); ++ dmar_readl(iommu, DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = +- readl(iommu->reg + DMAR_FEDATA_REG); ++ dmar_readl(iommu, DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = +- readl(iommu->reg + DMAR_FEADDR_REG); ++ dmar_readl(iommu, DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = +- readl(iommu->reg + DMAR_FEUADDR_REG); ++ dmar_readl(iommu, DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -3202,14 +3202,14 @@ static void iommu_resume(void) + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + +- writel(iommu->iommu_state[SR_DMAR_FECTL_REG], +- iommu->reg + DMAR_FECTL_REG); +- writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], +- iommu->reg + DMAR_FEDATA_REG); +- writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], +- iommu->reg + DMAR_FEADDR_REG); +- 
writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], +- iommu->reg + DMAR_FEUADDR_REG); ++ dmar_writel(iommu, DMAR_FECTL_REG, ++ iommu->iommu_state[SR_DMAR_FECTL_REG]); ++ dmar_writel(iommu, DMAR_FEDATA_REG, ++ iommu->iommu_state[SR_DMAR_FEDATA_REG]); ++ dmar_writel(iommu, DMAR_FEADDR_REG, ++ iommu->iommu_state[SR_DMAR_FEADDR_REG]); ++ dmar_writel(iommu, DMAR_FEUADDR_REG, ++ iommu->iommu_state[SR_DMAR_FEUADDR_REG]); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +@@ -3785,7 +3785,7 @@ static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) + { + struct intel_iommu *iommu = dev_to_intel_iommu(dev); +- u32 ver = readl(iommu->reg + DMAR_VER_REG); ++ u32 ver = dmar_readl(iommu, DMAR_VER_REG); + return sprintf(buf, "%d:%d\n", + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); + } +diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h +index c99cb715bd9a..0cc852fa9713 100644 +--- a/drivers/iommu/intel/iommu.h ++++ b/drivers/iommu/intel/iommu.h +@@ -135,10 +135,18 @@ + + #define OFFSET_STRIDE (9) + +-#define dmar_readq(a) readq(a) +-#define dmar_writeq(a,v) writeq(v,a) +-#define dmar_readl(a) readl(a) +-#define dmar_writel(a, v) writel(v, a) ++#ifdef CONFIG_PKVM_INTEL ++#include ++#define dmar_readq(iommu, o) pkvm_readq((iommu)->reg, (iommu)->reg_phys, o) ++#define dmar_writeq(iommu, o, v) pkvm_writeq((iommu)->reg, (iommu)->reg_phys, o, v) ++#define dmar_readl(iommu, o) pkvm_readl((iommu)->reg, (iommu)->reg_phys, o) ++#define dmar_writel(iommu, o, v) pkvm_writel((iommu)->reg, (iommu)->reg_phys, o, v) ++#else ++#define dmar_readq(iommu, o) readq((iommu)->reg + o) ++#define dmar_writeq(iommu, o, v) writeq(v, (iommu)->reg + o) ++#define dmar_readl(iommu, o) readl((iommu)->reg + o) ++#define dmar_writel(iommu, o, v) writel(v, (iommu)->reg + o) ++#endif + + #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) + #define DMAR_VER_MINOR(v) ((v) & 0x0f) +@@ -313,7 +321,7 @@ + do { \ + cycles_t start_time = get_cycles(); \ + while (1) { \ +- sts = op(iommu->reg + offset); \ ++ sts = op(iommu, offset); \ + if (cond) \ + break; \ + if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ +diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c +index 5962bb5027d0..65cff7667ee4 100644 +--- a/drivers/iommu/intel/irq_remapping.c ++++ b/drivers/iommu/intel/irq_remapping.c +@@ -97,7 +97,7 @@ static void init_ir_status(struct intel_iommu *iommu) + { + u32 gsts; + +- gsts = readl(iommu->reg + DMAR_GSTS_REG); ++ gsts = dmar_readl(iommu, DMAR_GSTS_REG); + if (gsts & DMA_GSTS_IRES) + iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED; + } +@@ -437,7 +437,7 @@ static int iommu_load_old_irte(struct intel_iommu *iommu) + u64 irta; + + /* Check whether the old ir-table has the same size as ours */ +- irta = dmar_readq(iommu->reg + DMAR_IRTA_REG); ++ irta = dmar_readq(iommu, DMAR_IRTA_REG); + if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK) + != INTR_REMAP_TABLE_REG_SIZE) + return -EINVAL; +@@ -480,14 +480,14 @@ static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- dmar_writeq(iommu->reg + DMAR_IRTA_REG, ++ dmar_writeq(iommu, DMAR_IRTA_REG, + (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); + + /* Set interrupt-remapping table pointer */ +- writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd | DMA_GCMD_SIRTP); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_IRTPS), sts); ++ 
dmar_readl, (sts & DMA_GSTS_IRTPS), sts); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + /* +@@ -507,16 +507,16 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu) + + /* Enable interrupt-remapping */ + iommu->gcmd |= DMA_GCMD_IRE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, (sts & DMA_GSTS_IRES), sts); ++ dmar_readl, (sts & DMA_GSTS_IRES), sts); + + /* Block compatibility-format MSIs */ + if (sts & DMA_GSTS_CFIS) { + iommu->gcmd &= ~DMA_GCMD_CFI; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, !(sts & DMA_GSTS_CFIS), sts); ++ dmar_readl, !(sts & DMA_GSTS_CFIS), sts); + } + + /* +@@ -686,15 +686,15 @@ static void iommu_disable_irq_remapping(struct intel_iommu *iommu) + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + +- sts = readl(iommu->reg + DMAR_GSTS_REG); ++ sts = dmar_readl(iommu, DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_IRES)) + goto end; + + iommu->gcmd &= ~DMA_GCMD_IRE; +- writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ dmar_writel(iommu, DMAR_GCMD_REG, iommu->gcmd); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +- readl, !(sts & DMA_GSTS_IRES), sts); ++ dmar_readl, !(sts & DMA_GSTS_IRES), sts); + + end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c +index 3f03039e5cce..dcf0b02187e7 100644 +--- a/drivers/iommu/intel/pasid.c ++++ b/drivers/iommu/intel/pasid.c +@@ -34,7 +34,7 @@ int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); ++ dmar_writeq(iommu, DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +@@ -64,7 +64,7 @@ void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); +- dmar_writeq(iommu->reg + DMAR_VCMD_REG, ++ dmar_writeq(iommu, DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); +diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c +index 03b25358946c..326ff8f03f68 100644 +--- a/drivers/iommu/intel/svm.c ++++ b/drivers/iommu/intel/svm.c +@@ -107,9 +107,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) + iommu->name); + goto free_iopfq; + } +- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); ++ dmar_writeq(iommu, DMAR_PQH_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQT_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + +@@ -130,9 +130,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) + + int intel_svm_finish_prq(struct intel_iommu *iommu) + { +- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); +- dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQH_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQT_REG, 0ULL); ++ dmar_writeq(iommu, DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); +@@ -536,8 +536,8 @@ static void 
intel_svm_drain_prq(struct device *dev, u32 pasid) + */ + prq_retry: + reinit_completion(&iommu->prq_complete); +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + +@@ -585,7 +585,7 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid) + qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); +- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { ++ if (dmar_readl(iommu, DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +@@ -702,10 +702,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ +- writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); ++ dmar_writel(iommu, DMAR_PRS_REG, DMA_PRS_PPR); + +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; +@@ -762,20 +762,20 @@ static irqreturn_t prq_event_thread(int irq, void *d) + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + +- dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); ++ dmar_writeq(iommu, DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ +- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { ++ if (dmar_readl(iommu, DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); +- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; +- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; ++ head = dmar_readq(iommu, DMAR_PQH_REG) & PRQ_RING_MASK; ++ tail = dmar_readq(iommu, DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); +- writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); ++ dmar_writel(iommu, DMAR_PRS_REG, DMA_PRS_PRO); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 7ad6f51b3d91..17ae144de43d 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -439,6 +439,22 @@ + __end_ro_after_init = .; + #endif + ++#ifdef CONFIG_PKVM_INTEL ++#include ++#define PKVM_RODATA \ ++ PKVM_SECTION_NAME(.rodata) : \ ++ AT(ADDR(PKVM_SECTION_NAME(.rodata)) - LOAD_OFFSET) { \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_rodata_start = .; \ ++ *(PKVM_SECTION_NAME(.rodata)) \ ++ *(PKVM_SECTION_NAME(.data..ro_after_init)) \ ++ . = ALIGN(PAGE_SIZE); \ ++ __pkvm_rodata_end = .; \ ++ } ++#else ++#define PKVM_RODATA ++#endif ++ + /* + * .kcfi_traps contains a list KCFI trap locations. + */ +diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h +new file mode 100644 +index 000000000000..81da7107e3bd +--- /dev/null ++++ b/include/linux/intel-iommu.h +@@ -0,0 +1,863 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++ * Copyright © 2006-2015, Intel Corporation. 
++ * ++ * Authors: Ashok Raj ++ * Anil S Keshavamurthy ++ * David Woodhouse ++ */ ++ ++#ifndef _INTEL_IOMMU_H_ ++#define _INTEL_IOMMU_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* ++ * VT-d hardware uses 4KiB page size regardless of host page size. ++ */ ++#define VTD_PAGE_SHIFT (12) ++#define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) ++#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) ++#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) ++ ++#define VTD_STRIDE_SHIFT (9) ++#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) ++ ++#define DMA_PTE_READ BIT_ULL(0) ++#define DMA_PTE_WRITE BIT_ULL(1) ++#define DMA_PTE_LARGE_PAGE BIT_ULL(7) ++#define DMA_PTE_SNP BIT_ULL(11) ++ ++#define DMA_FL_PTE_PRESENT BIT_ULL(0) ++#define DMA_FL_PTE_US BIT_ULL(2) ++#define DMA_FL_PTE_ACCESS BIT_ULL(5) ++#define DMA_FL_PTE_DIRTY BIT_ULL(6) ++#define DMA_FL_PTE_XD BIT_ULL(63) ++ ++#define ADDR_WIDTH_5LEVEL (57) ++#define ADDR_WIDTH_4LEVEL (48) ++ ++#define CONTEXT_TT_MULTI_LEVEL 0 ++#define CONTEXT_TT_DEV_IOTLB 1 ++#define CONTEXT_TT_PASS_THROUGH 2 ++#define CONTEXT_PASIDE BIT_ULL(3) ++ ++/* ++ * Intel IOMMU register specification per version 1.0 public spec. ++ */ ++#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ ++#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ ++#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ ++#define DMAR_GCMD_REG 0x18 /* Global command register */ ++#define DMAR_GSTS_REG 0x1c /* Global status register */ ++#define DMAR_RTADDR_REG 0x20 /* Root entry table */ ++#define DMAR_CCMD_REG 0x28 /* Context command reg */ ++#define DMAR_FSTS_REG 0x34 /* Fault Status register */ ++#define DMAR_FECTL_REG 0x38 /* Fault control register */ ++#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ ++#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ ++#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ ++#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ ++#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ ++#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ ++#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ ++#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ ++#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ ++#define DMAR_IQH_REG 0x80 /* Invalidation queue head register */ ++#define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ ++#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ ++#define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ ++#define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ ++#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ ++#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ ++#define DMAR_PQH_REG 0xc0 /* Page request queue head register */ ++#define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ ++#define DMAR_PQA_REG 0xd0 /* Page request queue address register */ ++#define DMAR_PRS_REG 0xdc /* Page request status register */ ++#define DMAR_PECTL_REG 0xe0 /* Page request event control register */ ++#define DMAR_PEDATA_REG 0xe4 /* Page request event interrupt data register */ ++#define DMAR_PEADDR_REG 0xe8 /* Page request event interrupt addr register */ ++#define DMAR_PEUADDR_REG 0xec /* Page request event Upper address register */ ++#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability register */ ++#define 
DMAR_MTRRDEF_REG 0x108 /* MTRR default type register */ ++#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */ ++#define DMAR_MTRR_FIX16K_80000_REG 0x128 ++#define DMAR_MTRR_FIX16K_A0000_REG 0x130 ++#define DMAR_MTRR_FIX4K_C0000_REG 0x138 ++#define DMAR_MTRR_FIX4K_C8000_REG 0x140 ++#define DMAR_MTRR_FIX4K_D0000_REG 0x148 ++#define DMAR_MTRR_FIX4K_D8000_REG 0x150 ++#define DMAR_MTRR_FIX4K_E0000_REG 0x158 ++#define DMAR_MTRR_FIX4K_E8000_REG 0x160 ++#define DMAR_MTRR_FIX4K_F0000_REG 0x168 ++#define DMAR_MTRR_FIX4K_F8000_REG 0x170 ++#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */ ++#define DMAR_MTRR_PHYSMASK0_REG 0x188 ++#define DMAR_MTRR_PHYSBASE1_REG 0x190 ++#define DMAR_MTRR_PHYSMASK1_REG 0x198 ++#define DMAR_MTRR_PHYSBASE2_REG 0x1a0 ++#define DMAR_MTRR_PHYSMASK2_REG 0x1a8 ++#define DMAR_MTRR_PHYSBASE3_REG 0x1b0 ++#define DMAR_MTRR_PHYSMASK3_REG 0x1b8 ++#define DMAR_MTRR_PHYSBASE4_REG 0x1c0 ++#define DMAR_MTRR_PHYSMASK4_REG 0x1c8 ++#define DMAR_MTRR_PHYSBASE5_REG 0x1d0 ++#define DMAR_MTRR_PHYSMASK5_REG 0x1d8 ++#define DMAR_MTRR_PHYSBASE6_REG 0x1e0 ++#define DMAR_MTRR_PHYSMASK6_REG 0x1e8 ++#define DMAR_MTRR_PHYSBASE7_REG 0x1f0 ++#define DMAR_MTRR_PHYSMASK7_REG 0x1f8 ++#define DMAR_MTRR_PHYSBASE8_REG 0x200 ++#define DMAR_MTRR_PHYSMASK8_REG 0x208 ++#define DMAR_MTRR_PHYSBASE9_REG 0x210 ++#define DMAR_MTRR_PHYSMASK9_REG 0x218 ++#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ ++#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ ++#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ ++ ++#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) ++#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) ++#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) ++ ++#define OFFSET_STRIDE (9) ++ ++#define dmar_readq(a) readq(a) ++#define dmar_writeq(a,v) writeq(v,a) ++#define dmar_readl(a) readl(a) ++#define dmar_writel(a, v) writel(v, a) ++ ++#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) ++#define DMAR_VER_MINOR(v) ((v) & 0x0f) ++ ++/* ++ * Decoding Capability Register ++ */ ++#define cap_5lp_support(c) (((c) >> 60) & 1) ++#define cap_pi_support(c) (((c) >> 59) & 1) ++#define cap_fl1gp_support(c) (((c) >> 56) & 1) ++#define cap_read_drain(c) (((c) >> 55) & 1) ++#define cap_write_drain(c) (((c) >> 54) & 1) ++#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) ++#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) ++#define cap_pgsel_inv(c) (((c) >> 39) & 1) ++ ++#define cap_super_page_val(c) (((c) >> 34) & 0xf) ++#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ ++ * OFFSET_STRIDE) + 21) ++ ++#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) ++#define cap_max_fault_reg_offset(c) \ ++ (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) ++ ++#define cap_zlr(c) (((c) >> 22) & 1) ++#define cap_isoch(c) (((c) >> 23) & 1) ++#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) ++#define cap_sagaw(c) (((c) >> 8) & 0x1f) ++#define cap_caching_mode(c) (((c) >> 7) & 1) ++#define cap_phmr(c) (((c) >> 6) & 1) ++#define cap_plmr(c) (((c) >> 5) & 1) ++#define cap_rwbf(c) (((c) >> 4) & 1) ++#define cap_afl(c) (((c) >> 3) & 1) ++#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) ++/* ++ * Extended Capability Register ++ */ ++ ++#define ecap_rps(e) (((e) >> 49) & 0x1) ++#define ecap_smpwc(e) (((e) >> 48) & 0x1) ++#define ecap_flts(e) (((e) >> 47) & 0x1) ++#define ecap_slts(e) (((e) >> 46) & 0x1) ++#define ecap_slads(e) (((e) >> 45) & 0x1) 
++#define ecap_vcs(e) (((e) >> 44) & 0x1) ++#define ecap_smts(e) (((e) >> 43) & 0x1) ++#define ecap_dit(e) (((e) >> 41) & 0x1) ++#define ecap_pds(e) (((e) >> 42) & 0x1) ++#define ecap_pasid(e) (((e) >> 40) & 0x1) ++#define ecap_pss(e) (((e) >> 35) & 0x1f) ++#define ecap_eafs(e) (((e) >> 34) & 0x1) ++#define ecap_nwfs(e) (((e) >> 33) & 0x1) ++#define ecap_srs(e) (((e) >> 31) & 0x1) ++#define ecap_ers(e) (((e) >> 30) & 0x1) ++#define ecap_prs(e) (((e) >> 29) & 0x1) ++#define ecap_broken_pasid(e) (((e) >> 28) & 0x1) ++#define ecap_dis(e) (((e) >> 27) & 0x1) ++#define ecap_nest(e) (((e) >> 26) & 0x1) ++#define ecap_mts(e) (((e) >> 25) & 0x1) ++#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) ++#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) ++#define ecap_coherent(e) ((e) & 0x1) ++#define ecap_qis(e) ((e) & 0x2) ++#define ecap_pass_through(e) (((e) >> 6) & 0x1) ++#define ecap_eim_support(e) (((e) >> 4) & 0x1) ++#define ecap_ir_support(e) (((e) >> 3) & 0x1) ++#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) ++#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) ++#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ ++ ++/* Virtual command interface capability */ ++#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ ++ ++/* IOTLB_REG */ ++#define DMA_TLB_FLUSH_GRANU_OFFSET 60 ++#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) ++#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) ++#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) ++#define DMA_TLB_IIRG(type) ((type >> 60) & 3) ++#define DMA_TLB_IAIG(val) (((val) >> 57) & 3) ++#define DMA_TLB_READ_DRAIN (((u64)1) << 49) ++#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) ++#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) ++#define DMA_TLB_IVT (((u64)1) << 63) ++#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) ++#define DMA_TLB_MAX_SIZE (0x3f) ++ ++/* INVALID_DESC */ ++#define DMA_CCMD_INVL_GRANU_OFFSET 61 ++#define DMA_ID_TLB_GLOBAL_FLUSH (((u64)1) << 4) ++#define DMA_ID_TLB_DSI_FLUSH (((u64)2) << 4) ++#define DMA_ID_TLB_PSI_FLUSH (((u64)3) << 4) ++#define DMA_ID_TLB_READ_DRAIN (((u64)1) << 7) ++#define DMA_ID_TLB_WRITE_DRAIN (((u64)1) << 6) ++#define DMA_ID_TLB_DID(id) (((u64)((id & 0xffff) << 16))) ++#define DMA_ID_TLB_IH_NONLEAF (((u64)1) << 6) ++#define DMA_ID_TLB_ADDR(addr) (addr) ++#define DMA_ID_TLB_ADDR_MASK(mask) (mask) ++ ++/* PMEN_REG */ ++#define DMA_PMEN_EPM (((u32)1)<<31) ++#define DMA_PMEN_PRS (((u32)1)<<0) ++ ++/* GCMD_REG */ ++#define DMA_GCMD_TE (((u32)1) << 31) ++#define DMA_GCMD_SRTP (((u32)1) << 30) ++#define DMA_GCMD_SFL (((u32)1) << 29) ++#define DMA_GCMD_EAFL (((u32)1) << 28) ++#define DMA_GCMD_WBF (((u32)1) << 27) ++#define DMA_GCMD_QIE (((u32)1) << 26) ++#define DMA_GCMD_SIRTP (((u32)1) << 24) ++#define DMA_GCMD_IRE (((u32) 1) << 25) ++#define DMA_GCMD_CFI (((u32) 1) << 23) ++ ++/* GSTS_REG */ ++#define DMA_GSTS_TES (((u32)1) << 31) ++#define DMA_GSTS_RTPS (((u32)1) << 30) ++#define DMA_GSTS_FLS (((u32)1) << 29) ++#define DMA_GSTS_AFLS (((u32)1) << 28) ++#define DMA_GSTS_WBFS (((u32)1) << 27) ++#define DMA_GSTS_QIES (((u32)1) << 26) ++#define DMA_GSTS_IRTPS (((u32)1) << 24) ++#define DMA_GSTS_IRES (((u32)1) << 25) ++#define DMA_GSTS_CFIS (((u32)1) << 23) ++ ++/* DMA_RTADDR_REG */ ++#define DMA_RTADDR_SMT (((u64)1) << 10) ++ ++/* CCMD_REG */ ++#define DMA_CCMD_ICC (((u64)1) << 63) ++#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) ++#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) ++#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) ++#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) ++#define 
DMA_CCMD_MASK_NOBIT 0 ++#define DMA_CCMD_MASK_1BIT 1 ++#define DMA_CCMD_MASK_2BIT 2 ++#define DMA_CCMD_MASK_3BIT 3 ++#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) ++#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) ++ ++/* FECTL_REG */ ++#define DMA_FECTL_IM (((u32)1) << 31) ++ ++/* FSTS_REG */ ++#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */ ++#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */ ++#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */ ++#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */ ++#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */ ++#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */ ++#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) ++ ++/* FRCD_REG, 32 bits access */ ++#define DMA_FRCD_F (((u32)1) << 31) ++#define dma_frcd_type(d) ((d >> 30) & 1) ++#define dma_frcd_fault_reason(c) (c & 0xff) ++#define dma_frcd_source_id(c) (c & 0xffff) ++#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff) ++#define dma_frcd_pasid_present(c) (((c) >> 31) & 1) ++/* low 64 bit */ ++#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT)) ++ ++/* PRS_REG */ ++#define DMA_PRS_PPR ((u32)1) ++#define DMA_PRS_PRO ((u32)2) ++ ++#define DMA_VCS_PAS ((u64)1) ++ ++#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ ++do { \ ++ cycles_t start_time = get_cycles(); \ ++ while (1) { \ ++ sts = op(iommu->reg + offset); \ ++ if (cond) \ ++ break; \ ++ if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ ++ panic("DMAR hardware is malfunctioning\n"); \ ++ cpu_relax(); \ ++ } \ ++} while (0) ++ ++#define QI_LENGTH 256 /* queue length */ ++ ++enum { ++ QI_FREE, ++ QI_IN_USE, ++ QI_DONE, ++ QI_ABORT ++}; ++ ++#define QI_CC_TYPE 0x1 ++#define QI_IOTLB_TYPE 0x2 ++#define QI_DIOTLB_TYPE 0x3 ++#define QI_IEC_TYPE 0x4 ++#define QI_IWD_TYPE 0x5 ++#define QI_EIOTLB_TYPE 0x6 ++#define QI_PC_TYPE 0x7 ++#define QI_DEIOTLB_TYPE 0x8 ++#define QI_PGRP_RESP_TYPE 0x9 ++#define QI_PSTRM_RESP_TYPE 0xa ++ ++#define QI_IEC_SELECTIVE (((u64)1) << 4) ++#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32)) ++#define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27)) ++ ++#define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) ++#define QI_IWD_STATUS_WRITE (((u64)1) << 5) ++#define QI_IWD_FENCE (((u64)1) << 6) ++#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) ++ ++#define QI_IOTLB_DID(did) (((u64)did) << 16) ++#define QI_IOTLB_DR(dr) (((u64)dr) << 7) ++#define QI_IOTLB_DW(dw) (((u64)dw) << 6) ++#define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) ++#define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) ++#define QI_IOTLB_IH(ih) (((u64)ih) << 6) ++#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) ++ ++#define QI_CC_FM(fm) (((u64)fm) << 48) ++#define QI_CC_SID(sid) (((u64)sid) << 32) ++#define QI_CC_DID(did) (((u64)did) << 16) ++#define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4)) ++ ++#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) ++#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) ++#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) ++#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ ++ ((u64)((pfsid >> 4) & 0xfff) << 52)) ++#define QI_DEV_IOTLB_SIZE 1 ++#define QI_DEV_IOTLB_MAX_INVS 32 ++ ++#define QI_PC_PASID(pasid) (((u64)pasid) << 32) ++#define QI_PC_DID(did) (((u64)did) << 16) ++#define QI_PC_GRAN(gran) (((u64)gran) << 4) ++ ++/* PASID cache invalidation granu */ ++#define QI_PC_ALL_PASIDS 0 ++#define QI_PC_PASID_SEL 1 ++#define QI_PC_GLOBAL 3 ++ ++#define 
QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) ++#define QI_EIOTLB_IH(ih) (((u64)ih) << 6) ++#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) ++#define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) ++#define QI_EIOTLB_DID(did) (((u64)did) << 16) ++#define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) ++ ++/* QI Dev-IOTLB inv granu */ ++#define QI_DEV_IOTLB_GRAN_ALL 1 ++#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 ++ ++#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) ++#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) ++#define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) ++#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) ++#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) ++#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ ++ ((u64)((pfsid >> 4) & 0xfff) << 52)) ++#define QI_DEV_EIOTLB_MAX_INVS 32 ++ ++/* Page group response descriptor QW0 */ ++#define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) ++#define QI_PGRP_PDP(p) (((u64)(p)) << 5) ++#define QI_PGRP_RESP_CODE(res) (((u64)(res)) << 12) ++#define QI_PGRP_DID(rid) (((u64)(rid)) << 16) ++#define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) ++ ++/* Page group response descriptor QW1 */ ++#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) ++#define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) ++ ++ ++#define QI_RESP_SUCCESS 0x0 ++#define QI_RESP_INVALID 0x1 ++#define QI_RESP_FAILURE 0xf ++ ++#define QI_GRAN_NONG_PASID 2 ++#define QI_GRAN_PSI_PASID 3 ++ ++#define qi_shift(iommu) (DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap)) ++ ++struct qi_desc { ++ u64 qw0; ++ u64 qw1; ++ u64 qw2; ++ u64 qw3; ++}; ++ ++struct q_inval { ++ raw_spinlock_t q_lock; ++ void *desc; /* invalidation queue */ ++ int *desc_status; /* desc status */ ++ int free_head; /* first free entry */ ++ int free_tail; /* last free entry */ ++ int free_cnt; ++}; ++ ++struct dmar_pci_notify_info; ++ ++#ifdef CONFIG_IRQ_REMAP ++/* 1MB - maximum possible interrupt remapping table size */ ++#define INTR_REMAP_PAGE_ORDER 8 ++#define INTR_REMAP_TABLE_REG_SIZE 0xf ++#define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf ++ ++#define INTR_REMAP_TABLE_ENTRIES 65536 ++ ++struct irq_domain; ++ ++struct ir_table { ++ struct irte *base; ++ unsigned long *bitmap; ++}; ++ ++void intel_irq_remap_add_device(struct dmar_pci_notify_info *info); ++#else ++static inline void ++intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { } ++#endif ++ ++struct iommu_flush { ++ void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, ++ u8 fm, u64 type); ++ void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type); ++}; ++ ++enum { ++ SR_DMAR_FECTL_REG, ++ SR_DMAR_FEDATA_REG, ++ SR_DMAR_FEADDR_REG, ++ SR_DMAR_FEUADDR_REG, ++ MAX_SR_DMAR_REGS ++}; ++ ++#define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) ++#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) ++#define VTD_FLAG_SVM_CAPABLE (1 << 2) ++ ++extern int intel_iommu_sm; ++extern spinlock_t device_domain_lock; ++ ++#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) ++#define pasid_supported(iommu) (sm_supported(iommu) && \ ++ ecap_pasid((iommu)->ecap)) ++ ++struct pasid_entry; ++struct pasid_state_entry; ++struct page_req_dsc; ++ ++/* ++ * 0: Present ++ * 1-11: Reserved ++ * 12-63: Context Ptr (12 - (haw-1)) ++ * 64-127: Reserved ++ */ ++struct root_entry { ++ u64 lo; ++ u64 hi; ++}; ++ ++/* ++ * low 64 bits: ++ * 0: present ++ * 1: fault processing disable ++ * 2-3: translation type ++ * 12-63: address space root ++ * high 64 bits: ++ * 0-2: address width ++ * 3-6: aval ++ * 8-23: domain 
id ++ */ ++struct context_entry { ++ u64 lo; ++ u64 hi; ++}; ++ ++/* si_domain contains mulitple devices */ ++#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) ++ ++/* ++ * When VT-d works in the scalable mode, it allows DMA translation to ++ * happen through either first level or second level page table. This ++ * bit marks that the DMA translation for the domain goes through the ++ * first level page table, otherwise, it goes through the second level. ++ */ ++#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) ++ ++/* ++ * Domain represents a virtual machine which demands iommu nested ++ * translation mode support. ++ */ ++#define DOMAIN_FLAG_NESTING_MODE BIT(2) ++ ++struct dmar_domain { ++ int nid; /* node id */ ++ ++ unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED]; ++ /* Refcount of devices per iommu */ ++ ++ ++ u16 iommu_did[DMAR_UNITS_SUPPORTED]; ++ /* Domain ids per IOMMU. Use u16 since ++ * domain ids are 16 bit wide according ++ * to VT-d spec, section 9.3 */ ++ ++ u8 has_iotlb_device: 1; ++ u8 iommu_coherency: 1; /* indicate coherency of iommu access */ ++ u8 iommu_snooping: 1; /* indicate snooping control feature */ ++ ++ struct list_head devices; /* all devices' list */ ++ struct list_head subdevices; /* all subdevices' list */ ++ struct iova_domain iovad; /* iova's that belong to this domain */ ++ ++ struct dma_pte *pgd; /* virtual address */ ++ int gaw; /* max guest address width */ ++ ++ /* adjusted guest address width, 0 is level 2 30-bit */ ++ int agaw; ++ ++ int flags; /* flags to find out type of domain */ ++ int iommu_superpage;/* Level of superpages supported: ++ 0 == 4KiB (no superpages), 1 == 2MiB, ++ 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ ++ u64 max_addr; /* maximum mapped address */ ++ ++ u32 default_pasid; /* ++ * The default pasid used for non-SVM ++ * traffic on mediated devices. ++ */ ++ ++ struct iommu_domain domain; /* generic domain data structure for ++ iommu core */ ++}; ++ ++struct intel_iommu { ++ void __iomem *reg; /* Pointer to hardware regs, virtual addr */ ++ u64 reg_phys; /* physical address of hw register set */ ++ u64 reg_size; /* size of hw register set */ ++ u64 cap; ++ u64 ecap; ++ u64 vccap; ++ u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ ++ raw_spinlock_t register_lock; /* protect register handling */ ++ int seq_id; /* sequence id of the iommu */ ++ int agaw; /* agaw of this iommu */ ++ int msagaw; /* max sagaw of this iommu */ ++ unsigned int irq, pr_irq; ++ u16 segment; /* PCI segment# */ ++ unsigned char name[13]; /* Device Name */ ++ ++#ifdef CONFIG_INTEL_IOMMU ++ unsigned long *domain_ids; /* bitmap of domains */ ++ struct dmar_domain ***domains; /* ptr to domains */ ++ unsigned long *copied_tables; /* bitmap of copied tables */ ++ spinlock_t lock; /* protect context, domain ids */ ++ struct root_entry *root_entry; /* virtual address */ ++ ++ struct iommu_flush flush; ++#endif ++#ifdef CONFIG_INTEL_IOMMU_SVM ++ struct page_req_dsc *prq; ++ unsigned char prq_name[16]; /* Name for PRQ interrupt */ ++ struct completion prq_complete; ++ struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ ++#endif ++ struct iopf_queue *iopf_queue; ++ unsigned char iopfq_name[16]; ++ struct q_inval *qi; /* Queued invalidation info */ ++ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ ++ ++#ifdef CONFIG_IRQ_REMAP ++ struct ir_table *ir_table; /* Interrupt remapping info */ ++ struct irq_domain *ir_domain; ++ struct irq_domain *ir_msi_domain; ++#endif ++ struct iommu_device iommu; /* IOMMU core code handle */ ++ int node; ++ u32 flags; /* Software defined flags */ ++ ++ struct dmar_drhd_unit *drhd; ++ void *perf_statistic; ++}; ++ ++/* Per subdevice private data */ ++struct subdev_domain_info { ++ struct list_head link_phys; /* link to phys device siblings */ ++ struct list_head link_domain; /* link to domain siblings */ ++ struct device *pdev; /* physical device derived from */ ++ struct dmar_domain *domain; /* aux-domain */ ++ int users; /* user count */ ++}; ++ ++/* PCI domain-device relationship */ ++struct device_domain_info { ++ struct list_head link; /* link to domain siblings */ ++ struct list_head global; /* link to global list */ ++ struct list_head table; /* link to pasid table */ ++ struct list_head subdevices; /* subdevices sibling */ ++ u32 segment; /* PCI segment number */ ++ u8 bus; /* PCI bus number */ ++ u8 devfn; /* PCI devfn number */ ++ u16 pfsid; /* SRIOV physical function source ID */ ++ u8 pasid_supported:3; ++ u8 pasid_enabled:1; ++ u8 pri_supported:1; ++ u8 pri_enabled:1; ++ u8 ats_supported:1; ++ u8 ats_enabled:1; ++ u8 auxd_enabled:1; /* Multiple domains per device */ ++ u8 ats_qdep; ++ struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ ++ struct intel_iommu *iommu; /* IOMMU used by this device */ ++ struct dmar_domain *domain; /* pointer to domain */ ++ struct pasid_table *pasid_table; /* pasid table */ ++}; ++ ++static inline void __iommu_flush_cache( ++ struct intel_iommu *iommu, void *addr, int size) ++{ ++ if (!ecap_coherent(iommu->ecap)) ++ clflush_cache_range(addr, size); ++} ++ ++/* Convert generic struct iommu_domain to private struct dmar_domain */ ++static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) ++{ ++ return container_of(dom, struct dmar_domain, domain); ++} ++ ++/* ++ * 0: readable ++ * 1: writable ++ * 2-6: reserved ++ * 7: super page ++ * 8-10: available ++ * 11: snoop behavior ++ * 12-63: Host physical address ++ */ ++struct dma_pte { ++ u64 val; ++}; ++ ++static inline void dma_clear_pte(struct dma_pte *pte) ++{ ++ pte->val = 0; ++} ++ ++static inline u64 dma_pte_addr(struct dma_pte *pte) ++{ ++#ifdef CONFIG_64BIT ++ return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); ++#else ++ /* Must 
have a full atomic 64-bit read */ ++ return __cmpxchg64(&pte->val, 0ULL, 0ULL) & ++ VTD_PAGE_MASK & (~DMA_FL_PTE_XD); ++#endif ++} ++ ++static inline bool dma_pte_present(struct dma_pte *pte) ++{ ++ return (pte->val & 3) != 0; ++} ++ ++static inline bool dma_pte_superpage(struct dma_pte *pte) ++{ ++ return (pte->val & DMA_PTE_LARGE_PAGE); ++} ++ ++static inline int first_pte_in_page(struct dma_pte *pte) ++{ ++ return !((unsigned long)pte & ~VTD_PAGE_MASK); ++} ++ ++static inline bool context_present(struct context_entry *context) ++{ ++ return (context->lo & 1); ++} ++ ++extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); ++extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); ++ ++extern int dmar_enable_qi(struct intel_iommu *iommu); ++extern void dmar_disable_qi(struct intel_iommu *iommu); ++extern int dmar_reenable_qi(struct intel_iommu *iommu); ++extern void qi_global_iec(struct intel_iommu *iommu); ++ ++extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, ++ u8 fm, u64 type); ++extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, ++ unsigned int size_order, u64 type); ++extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, ++ u16 qdep, u64 addr, unsigned mask); ++ ++void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, ++ unsigned long npages, bool ih); ++ ++void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, ++ u32 pasid, u16 qdep, u64 addr, ++ unsigned int size_order); ++void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, ++ u32 pasid); ++ ++int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, ++ unsigned int count, unsigned long options); ++/* ++ * Options used in qi_submit_sync: ++ * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. 
++ */ ++#define QI_OPT_WAIT_DRAIN BIT(0) ++ ++extern int dmar_ir_support(void); ++ ++void *alloc_pgtable_page(int node); ++void free_pgtable_page(void *vaddr); ++struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); ++int for_each_device_domain(int (*fn)(struct device_domain_info *info, ++ void *data), void *data); ++void iommu_flush_write_buffer(struct intel_iommu *iommu); ++int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); ++struct dmar_domain *find_domain(struct device *dev); ++struct device_domain_info *get_domain_info(struct device *dev); ++struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); ++ ++#ifdef CONFIG_INTEL_IOMMU_SVM ++extern void intel_svm_check(struct intel_iommu *iommu); ++extern int intel_svm_enable_prq(struct intel_iommu *iommu); ++extern int intel_svm_finish_prq(struct intel_iommu *iommu); ++int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, ++ struct iommu_gpasid_bind_data *data); ++int intel_svm_unbind_gpasid(struct device *dev, u32 pasid); ++struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, ++ void *drvdata); ++void intel_svm_unbind(struct iommu_sva *handle); ++u32 intel_svm_get_pasid(struct iommu_sva *handle); ++int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, ++ struct iommu_page_response *msg); ++ ++struct intel_svm_dev { ++ struct list_head list; ++ struct rcu_head rcu; ++ struct device *dev; ++ struct intel_iommu *iommu; ++ struct iommu_sva sva; ++ unsigned long prq_seq_number; ++ u32 pasid; ++ int users; ++ u16 did; ++ u16 dev_iotlb:1; ++ u16 sid, qdep; ++}; ++ ++struct intel_svm { ++ struct mmu_notifier notifier; ++ struct mm_struct *mm; ++ ++ unsigned int flags; ++ u32 pasid; ++ int gpasid; /* In case that guest PASID is different from host PASID */ ++ struct list_head devs; ++}; ++#else ++static inline void intel_svm_check(struct intel_iommu *iommu) {} ++#endif ++ ++#ifdef CONFIG_INTEL_IOMMU_DEBUGFS ++void intel_iommu_debugfs_init(void); ++#else ++static inline void intel_iommu_debugfs_init(void) {} ++#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ ++ ++extern const struct attribute_group *intel_iommu_groups[]; ++struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, ++ u8 devfn, int alloc); ++ ++#ifdef CONFIG_INTEL_IOMMU ++extern int iommu_calculate_agaw(struct intel_iommu *iommu); ++extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); ++extern int dmar_disabled; ++extern int intel_iommu_enabled; ++extern int intel_iommu_gfx_mapped; ++#else ++static inline int iommu_calculate_agaw(struct intel_iommu *iommu) ++{ ++ return 0; ++} ++static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) ++{ ++ return 0; ++} ++#define dmar_disabled (1) ++#define intel_iommu_enabled (0) ++#endif ++ ++static inline const char *decode_prq_descriptor(char *str, size_t size, ++ u64 dw0, u64 dw1, u64 dw2, u64 dw3) ++{ ++ char *buf = str; ++ int bytes; ++ ++ bytes = snprintf(buf, size, ++ "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx", ++ FIELD_GET(GENMASK_ULL(31, 16), dw0), ++ FIELD_GET(GENMASK_ULL(63, 12), dw1), ++ dw1 & BIT_ULL(0) ? 'r' : '-', ++ dw1 & BIT_ULL(1) ? 'w' : '-', ++ dw0 & BIT_ULL(52) ? 'x' : '-', ++ dw0 & BIT_ULL(53) ? 'p' : '-', ++ dw1 & BIT_ULL(2) ? 
'l' : '-', ++ FIELD_GET(GENMASK_ULL(51, 32), dw0), ++ FIELD_GET(GENMASK_ULL(11, 3), dw1)); ++ ++ /* Private Data */ ++ if (dw0 & BIT_ULL(9)) { ++ size -= bytes; ++ buf += bytes; ++ snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3); ++ } ++ ++ return str; ++} ++ ++#endif +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 637a60607c7d..453d4ee759fd 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -379,6 +379,12 @@ struct kvm_vcpu { + */ + struct kvm_memory_slot *last_used_slot; + u64 last_used_slot_gen; ++ ++ /* ++ * Save the handle returned from the pkvm when init a shadow vcpu. This ++ * will be used when teardown this shadow vcpu. ++ */ ++ s64 pkvm_shadow_vcpu_handle; + }; + + /* +@@ -686,6 +692,18 @@ struct kvm_memslots { + int node_idx; + }; + ++struct kvm_pinned_page { ++ struct list_head list; ++ struct page *page; ++}; ++ ++struct kvm_protected_vm { ++ int shadow_vm_handle; ++ ++ struct list_head pinned_pages; ++ spinlock_t pinned_page_lock; ++}; ++ + struct kvm { + #ifdef KVM_HAVE_MMU_RWLOCK + rwlock_t mmu_lock; +@@ -786,6 +804,8 @@ struct kvm { + struct notifier_block pm_notifier; + #endif + char stats_id[KVM_STATS_NAME_SIZE]; ++ ++ struct kvm_protected_vm pkvm; + }; + + #define kvm_err(fmt, ...) \ +@@ -1358,6 +1378,8 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target); + void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); + + void kvm_flush_remote_tlbs(struct kvm *kvm); ++int kvm_flush_remote_tlbs_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range); + + #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE + int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); +@@ -1490,6 +1512,14 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) + } + #endif + ++#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB_WITH_RANGE ++static inline int kvm_arch_flush_remote_tlb_with_range(struct kvm *kvm, ++ struct kvm_tlb_range *range) ++{ ++ return -ENOTSUPP; ++} ++#endif ++ + #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA + void kvm_arch_register_noncoherent_dma(struct kvm *kvm); + void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); +@@ -1848,7 +1878,8 @@ struct _kvm_stats_desc { + + #define KVM_GENERIC_VM_STATS() \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ +- STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) ++ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests), \ ++ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_with_range) + + #define KVM_GENERIC_VCPU_STATS() \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ +@@ -2282,4 +2313,5 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + /* Max number of entries allowed for each kvm dirty ring */ + #define KVM_DIRTY_RING_MAX_ENTRIES 65536 + ++int kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp); + #endif +diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h +index 3ca3db020e0e..e556fcf5036a 100644 +--- a/include/linux/kvm_types.h ++++ b/include/linux/kvm_types.h +@@ -104,6 +104,7 @@ struct kvm_mmu_memory_cache { + struct kvm_vm_stat_generic { + u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; ++ u64 remote_tlb_flush_with_range; + }; + + struct kvm_vcpu_stat_generic { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0d5d4419139a..22dcc759fd07 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_ZPCI_OP 221 + #define KVM_CAP_S390_CPU_TOPOLOGY 222 + 
#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 ++#define KVM_CAP_VM_TYPES 224 + + #ifdef KVM_CAP_IRQ_ROUTING + +diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h +index 960c7e93d1a9..a06f4a9e5033 100644 +--- a/include/uapi/linux/kvm_para.h ++++ b/include/uapi/linux/kvm_para.h +@@ -31,6 +31,17 @@ + #define KVM_HC_SCHED_YIELD 11 + #define KVM_HC_MAP_GPA_RANGE 12 + ++#define KVM_HC_PKVM_OP 20 ++ ++/* PKVM provided hypercalls for guest use. */ ++#define PKVM_GHC_NUM(x) (x + KVM_HC_PKVM_OP) ++ ++#define PKVM_GHC_SHARE_MEM PKVM_GHC_NUM(1) ++#define PKVM_GHC_UNSHARE_MEM PKVM_GHC_NUM(2) ++#define PKVM_GHC_IOREAD PKVM_GHC_NUM(3) ++#define PKVM_GHC_IOWRITE PKVM_GHC_NUM(4) ++#define PKVM_GHC_GET_VE_INFO PKVM_GHC_NUM(5) ++ + /* + * hypercalls use architecture specific + */ +diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h +index 46de10a809ec..f513dc0ae610 100644 +--- a/tools/arch/x86/include/uapi/asm/kvm.h ++++ b/tools/arch/x86/include/uapi/asm/kvm.h +@@ -532,4 +532,7 @@ struct kvm_pmu_event_filter { + #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ + #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + ++#define KVM_X86_DEFAULT_VM 0 ++#define KVM_X86_PROTECTED_VM 1 ++ + #endif /* _ASM_X86_KVM_H */ +diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h +index 0d5d4419139a..22dcc759fd07 100644 +--- a/tools/include/uapi/linux/kvm.h ++++ b/tools/include/uapi/linux/kvm.h +@@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_ZPCI_OP 221 + #define KVM_CAP_S390_CPU_TOPOLOGY 222 + #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 ++#define KVM_CAP_VM_TYPES 224 + + #ifdef KVM_CAP_IRQ_ROUTING + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 8123f4d15930..1a1cc36d20a4 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -378,6 +378,33 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) + EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); + #endif + ++int kvm_flush_remote_tlbs_with_range(struct kvm *kvm, struct kvm_tlb_range *range) ++{ ++ int ret; ++ ++ ret = kvm_arch_flush_remote_tlb_with_range(kvm, range); ++ if (!ret) ++ ++kvm->stat.generic.remote_tlb_flush_with_range; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs_with_range); ++ ++static bool kvm_try_flush_remote_tlbs_with_range(struct kvm *kvm, ++ struct kvm_gfn_range *gfn_range) ++{ ++#ifdef CONFIG_PKVM_INTEL ++ struct kvm_tlb_range tlb_range = { ++ .start_gfn = gfn_range->start, ++ .pages = gfn_range->end - gfn_range->start, ++ }; ++ ++ return !!kvm_flush_remote_tlbs_with_range(kvm, &tlb_range); ++#else ++ return true; ++#endif ++} ++ + static void kvm_flush_shadow_all(struct kvm *kvm) + { + kvm_arch_flush_shadow_all(kvm); +@@ -578,7 +605,7 @@ static void kvm_null_fn(void) + static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_hva_range *range) + { +- bool ret = false, locked = false; ++ bool ret = false, locked = false, need_global_flush = false; + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; +@@ -633,10 +660,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + break; + } + ret |= range->handler(kvm, &gfn_range); ++ if (range->flush_on_ret && ret) ++ need_global_flush |= ++ kvm_try_flush_remote_tlbs_with_range(kvm, &gfn_range); ++ + } + } + +- if (range->flush_on_ret && ret) ++ if (range->flush_on_ret && ret && need_global_flush) + kvm_flush_remote_tlbs(kvm); + + if (locked) { 
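
/*
 * Illustrative userspace sketch, not part of the patch: with the
 * KVM_CAP_VM_TYPES / KVM_X86_PROTECTED_VM additions above, a VMM would
 * be expected to pass the VM type as the KVM_CREATE_VM argument. The
 * capability-check convention shown here (non-zero return means typed
 * VMs are supported) is an assumption made for the example.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int create_vm_fd(int kvm_fd)
{
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES) > 0)
		return ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_PROTECTED_VM);

	/* fall back to a normal VM when protected VMs are unavailable */
	return ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_DEFAULT_VM);
}
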
+diff --git a/virt/kvm/pkvm/buddy_memory.h b/virt/kvm/pkvm/buddy_memory.h +new file mode 100644 +index 000000000000..56ae67a1c294 +--- /dev/null ++++ b/virt/kvm/pkvm/buddy_memory.h +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef __PKVM_BUDDY_MEMORY_H ++#define __PKVM_BUDDY_MEMORY_H ++ ++#include ++#include ++ ++#include ++ ++struct pkvm_page { ++ unsigned short refcount; ++ unsigned short order; ++}; ++ ++extern u64 __pkvm_vmemmap; ++#define pkvm_vmemmap ((struct pkvm_page *)__pkvm_vmemmap) ++ ++#define pkvm_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) ++#define pkvm_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) ++#define pkvm_phys_to_page(phys) (&pkvm_vmemmap[pkvm_phys_to_pfn(phys)]) ++#define pkvm_virt_to_page(virt) pkvm_phys_to_page(__pkvm_pa(virt)) ++#define pkvm_virt_to_pfn(virt) pkvm_phys_to_pfn(__pkvm_pa(virt)) ++ ++#define pkvm_page_to_pfn(page) ((struct pkvm_page *)(page) - pkvm_vmemmap) ++#define pkvm_page_to_phys(page) pkvm_pfn_to_phys((pkvm_page_to_pfn(page))) ++#define pkvm_page_to_virt(page) __pkvm_va(pkvm_page_to_phys(page)) ++#define pkvm_page_to_pool(page) (((struct pkvm_page *)page)->pool) ++ ++static inline int pkvm_page_count(void *addr) ++{ ++ struct pkvm_page *p = pkvm_virt_to_page(addr); ++ ++ return p->refcount; ++} ++ ++#endif /* __PKVM_BUDDY_MEMORY_H */ +diff --git a/virt/kvm/pkvm/gfp.h b/virt/kvm/pkvm/gfp.h +new file mode 100644 +index 000000000000..47351de0522e +--- /dev/null ++++ b/virt/kvm/pkvm/gfp.h +@@ -0,0 +1,35 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef __PKVM_GFP_H ++#define __PKVM_GFP_H ++ ++#include ++#include ++ ++#include ++#include ++ ++#define PKVM_NO_ORDER USHRT_MAX ++ ++struct pkvm_pool { ++ /* ++ * Spinlock protecting concurrent changes to the memory pool as well as ++ * the struct pkvm_page of the pool's pages until we have a proper atomic ++ * API at hypervisor. ++ */ ++ pkvm_spinlock_t lock; ++ struct list_head free_area[MAX_ORDER]; ++ phys_addr_t range_start; ++ phys_addr_t range_end; ++ unsigned short max_order; ++}; ++ ++/* Allocation */ ++void *pkvm_alloc_pages(struct pkvm_pool *pool, unsigned short order); ++void pkvm_split_page(struct pkvm_page *page); ++void pkvm_get_page(struct pkvm_pool *pool, void *addr); ++void pkvm_put_page(struct pkvm_pool *pool, void *addr); ++ ++/* Used pages cannot be freed */ ++int pkvm_pool_init(struct pkvm_pool *pool, u64 pfn, unsigned int nr_pages, ++ unsigned int reserved_pages); ++#endif /* __PKVM_GFP_H */ +diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/virt/kvm/pkvm/page_alloc.c +similarity index 56% +rename from arch/arm64/kvm/hyp/nvhe/page_alloc.c +rename to virt/kvm/pkvm/page_alloc.c +index d40f0b30b534..83a61c55cf0f 100644 +--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c ++++ b/virt/kvm/pkvm/page_alloc.c +@@ -4,13 +4,12 @@ + * Author: Quentin Perret + */ + +-#include +-#include ++#include + +-u64 __hyp_vmemmap; ++u64 __pkvm_vmemmap; + + /* +- * Index the hyp_vmemmap to find a potential buddy page, but make no assumption ++ * Index the pkvm_vmemmap to find a potential buddy page, but make no assumption + * about its current state. 
+ * + * Example buddy-tree for a 4-pages physically contiguous pool: +@@ -30,30 +29,30 @@ u64 __hyp_vmemmap; + * __find_buddy_nocheck(pool, page 1, order 0) => page 0 + * __find_buddy_nocheck(pool, page 2, order 0) => page 3 + */ +-static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__find_buddy_nocheck(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- phys_addr_t addr = hyp_page_to_phys(p); ++ phys_addr_t addr = pkvm_page_to_phys(p); + + addr ^= (PAGE_SIZE << order); + + /* + * Don't return a page outside the pool range -- it belongs to +- * something else and may not be mapped in hyp_vmemmap. ++ * something else and may not be mapped in pkvm_vmemmap. + */ + if (addr < pool->range_start || addr >= pool->range_end) + return NULL; + +- return hyp_phys_to_page(addr); ++ return pkvm_phys_to_page(addr); + } + + /* Find a buddy page currently available for allocation */ +-static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__find_buddy_avail(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); ++ struct pkvm_page *buddy = __find_buddy_nocheck(pool, p, order); + + if (!buddy || buddy->order != order || buddy->refcount) + return NULL; +@@ -65,46 +64,46 @@ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, + /* + * Pages that are available for allocation are tracked in free-lists, so we use + * the pages themselves to store the list nodes to avoid wasting space. As the +- * allocator always returns zeroed pages (which are zeroed on the hyp_put_page() ++ * allocator always returns zeroed pages (which are zeroed on the pkvm_put_page() + * path to optimize allocation speed), we also need to clean-up the list node in + * each page when we take it out of the list. + */ +-static inline void page_remove_from_list(struct hyp_page *p) ++static inline void page_remove_from_list(struct pkvm_page *p) + { +- struct list_head *node = hyp_page_to_virt(p); ++ struct list_head *node = pkvm_page_to_virt(p); + + __list_del_entry(node); + memset(node, 0, sizeof(*node)); + } + +-static inline void page_add_to_list(struct hyp_page *p, struct list_head *head) ++static inline void page_add_to_list(struct pkvm_page *p, struct list_head *head) + { +- struct list_head *node = hyp_page_to_virt(p); ++ struct list_head *node = pkvm_page_to_virt(p); + + INIT_LIST_HEAD(node); + list_add_tail(node, head); + } + +-static inline struct hyp_page *node_to_page(struct list_head *node) ++static inline struct pkvm_page *node_to_page(struct list_head *node) + { +- return hyp_virt_to_page(node); ++ return pkvm_virt_to_page(node); + } + +-static void __hyp_attach_page(struct hyp_pool *pool, +- struct hyp_page *p) ++static void __pkvm_attach_page(struct pkvm_pool *pool, ++ struct pkvm_page *p) + { + unsigned short order = p->order; +- struct hyp_page *buddy; ++ struct pkvm_page *buddy; + +- memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); ++ memset(pkvm_page_to_virt(p), 0, PAGE_SIZE << p->order); + + /* +- * Only the first struct hyp_page of a high-order page (otherwise known ++ * Only the first struct pkvm_page of a high-order page (otherwise known + * as the 'head') should have p->order set. The non-head pages should +- * have p->order = HYP_NO_ORDER. Here @p may no longer be the head +- * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
++ * have p->order = PKVM_NO_ORDER. Here @p may no longer be the head ++ * after coalescing, so make sure to mark it PKVM_NO_ORDER proactively. + */ +- p->order = HYP_NO_ORDER; ++ p->order = PKVM_NO_ORDER; + for (; (order + 1) < pool->max_order; order++) { + buddy = __find_buddy_avail(pool, p, order); + if (!buddy) +@@ -112,7 +111,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, + + /* Take the buddy out of its list, and coalesce with @p */ + page_remove_from_list(buddy); +- buddy->order = HYP_NO_ORDER; ++ buddy->order = PKVM_NO_ORDER; + p = min(p, buddy); + } + +@@ -121,16 +120,16 @@ static void __hyp_attach_page(struct hyp_pool *pool, + page_add_to_list(p, &pool->free_area[order]); + } + +-static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, +- struct hyp_page *p, ++static struct pkvm_page *__pkvm_extract_page(struct pkvm_pool *pool, ++ struct pkvm_page *p, + unsigned short order) + { +- struct hyp_page *buddy; ++ struct pkvm_page *buddy; + + page_remove_from_list(p); + while (p->order > order) { + /* +- * The buddy of order n - 1 currently has HYP_NO_ORDER as it ++ * The buddy of order n - 1 currently has PKVM_NO_ORDER as it + * is covered by a higher-level page (whose head is @p). Use + * __find_buddy_nocheck() to find it and inject it in the + * free_list[n - 1], effectively splitting @p in half. +@@ -144,103 +143,103 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, + return p; + } + +-static inline void hyp_page_ref_inc(struct hyp_page *p) ++static inline void pkvm_page_ref_inc(struct pkvm_page *p) + { + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; + } + +-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) ++static inline int pkvm_page_ref_dec_and_test(struct pkvm_page *p) + { + BUG_ON(!p->refcount); + p->refcount--; + return (p->refcount == 0); + } + +-static inline void hyp_set_page_refcounted(struct hyp_page *p) ++static inline void pkvm_set_page_refcounted(struct pkvm_page *p) + { + BUG_ON(p->refcount); + p->refcount = 1; + } + +-static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) ++static void __pkvm_put_page(struct pkvm_pool *pool, struct pkvm_page *p) + { +- if (hyp_page_ref_dec_and_test(p)) +- __hyp_attach_page(pool, p); ++ if (pkvm_page_ref_dec_and_test(p)) ++ __pkvm_attach_page(pool, p); + } + + /* +- * Changes to the buddy tree and page refcounts must be done with the hyp_pool ++ * Changes to the buddy tree and page refcounts must be done with the pkvm_pool + * lock held. If a refcount change requires an update to the buddy tree (e.g. +- * hyp_put_page()), both operations must be done within the same critical ++ * pkvm_put_page()), both operations must be done within the same critical + * section to guarantee transient states (e.g. a page with null refcount but + * not yet attached to a free list) can't be observed by well-behaved readers. 
+ */ +-void hyp_put_page(struct hyp_pool *pool, void *addr) ++void pkvm_put_page(struct pkvm_pool *pool, void *addr) + { +- struct hyp_page *p = hyp_virt_to_page(addr); ++ struct pkvm_page *p = pkvm_virt_to_page(addr); + +- hyp_spin_lock(&pool->lock); +- __hyp_put_page(pool, p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); ++ __pkvm_put_page(pool, p); ++ pkvm_spin_unlock(&pool->lock); + } + +-void hyp_get_page(struct hyp_pool *pool, void *addr) ++void pkvm_get_page(struct pkvm_pool *pool, void *addr) + { +- struct hyp_page *p = hyp_virt_to_page(addr); ++ struct pkvm_page *p = pkvm_virt_to_page(addr); + +- hyp_spin_lock(&pool->lock); +- hyp_page_ref_inc(p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); ++ pkvm_page_ref_inc(p); ++ pkvm_spin_unlock(&pool->lock); + } + +-void hyp_split_page(struct hyp_page *p) ++void pkvm_split_page(struct pkvm_page *p) + { + unsigned short order = p->order; + unsigned int i; + + p->order = 0; + for (i = 1; i < (1 << order); i++) { +- struct hyp_page *tail = p + i; ++ struct pkvm_page *tail = p + i; + + tail->order = 0; +- hyp_set_page_refcounted(tail); ++ pkvm_set_page_refcounted(tail); + } + } + +-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) ++void *pkvm_alloc_pages(struct pkvm_pool *pool, unsigned short order) + { + unsigned short i = order; +- struct hyp_page *p; ++ struct pkvm_page *p; + +- hyp_spin_lock(&pool->lock); ++ pkvm_spin_lock(&pool->lock); + + /* Look for a high-enough-order page */ + while (i < pool->max_order && list_empty(&pool->free_area[i])) + i++; + if (i >= pool->max_order) { +- hyp_spin_unlock(&pool->lock); ++ pkvm_spin_unlock(&pool->lock); + return NULL; + } + + /* Extract it from the tree at the right order */ + p = node_to_page(pool->free_area[i].next); +- p = __hyp_extract_page(pool, p, order); ++ p = __pkvm_extract_page(pool, p, order); + +- hyp_set_page_refcounted(p); +- hyp_spin_unlock(&pool->lock); ++ pkvm_set_page_refcounted(p); ++ pkvm_spin_unlock(&pool->lock); + +- return hyp_page_to_virt(p); ++ return pkvm_page_to_virt(p); + } + +-int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, ++int pkvm_pool_init(struct pkvm_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages) + { +- phys_addr_t phys = hyp_pfn_to_phys(pfn); +- struct hyp_page *p; ++ phys_addr_t phys = pkvm_pfn_to_phys(pfn); ++ struct pkvm_page *p; + int i; + +- hyp_spin_lock_init(&pool->lock); ++ pkvm_spinlock_init(&pool->lock); + pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); + for (i = 0; i < pool->max_order; i++) + INIT_LIST_HEAD(&pool->free_area[i]); +@@ -248,15 +247,15 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + pool->range_end = phys + (nr_pages << PAGE_SHIFT); + + /* Init the vmemmap portion */ +- p = hyp_phys_to_page(phys); ++ p = pkvm_phys_to_page(phys); + for (i = 0; i < nr_pages; i++) { + p[i].order = 0; +- hyp_set_page_refcounted(&p[i]); ++ pkvm_set_page_refcounted(&p[i]); + } + + /* Attach the unused pages to the buddy tree */ + for (i = reserved_pages; i < nr_pages; i++) +- __hyp_put_page(pool, &p[i]); ++ __pkvm_put_page(pool, &p[i]); + + return 0; + } +diff --git a/virt/kvm/pkvm/pkvm.c b/virt/kvm/pkvm/pkvm.c +new file mode 100644 +index 000000000000..03894f4ca24c +--- /dev/null ++++ b/virt/kvm/pkvm/pkvm.c +@@ -0,0 +1,85 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2020 - Google LLC ++ * Author: Quentin Perret ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ 
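
/*
 * Illustrative sketch only, not part of the patch: typical use of the
 * buddy allocator moved above (virt/kvm/pkvm/page_alloc.c) through the
 * pkvm_pool API declared in gfp.h. The pool/function names below and
 * the zero reserved_pages are assumptions for the example; real
 * callers size the pool from the reserved pkvm memory region set up by
 * pkvm_reserve() further down.
 */
static struct pkvm_pool example_pool;

static int example_pool_smoke_test(u64 base_pfn, unsigned int nr_pages)
{
	void *page;

	/* every page becomes allocatable; none are kept reserved here */
	if (pkvm_pool_init(&example_pool, base_pfn, nr_pages, 0))
		return -1;

	/* order-0 allocation: returned zeroed, with refcount == 1 */
	page = pkvm_alloc_pages(&example_pool, 0);
	if (!page)
		return -1;

	pkvm_get_page(&example_pool, page);	/* refcount 1 -> 2 */
	pkvm_put_page(&example_pool, page);	/* refcount 2 -> 1 */
	pkvm_put_page(&example_pool, page);	/* back onto the free lists */

	return 0;
}
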
++static struct memblock_region *_pkvm_memory = pkvm_sym(pkvm_memory); ++static unsigned int *pkvm_memblock_nr_ptr = &pkvm_sym(pkvm_memblock_nr); ++ ++phys_addr_t pkvm_mem_base; ++phys_addr_t pkvm_mem_size; ++ ++static int cmp_pkvm_memblock(const void *p1, const void *p2) ++{ ++ const struct memblock_region *r1 = p1; ++ const struct memblock_region *r2 = p2; ++ ++ return r1->base < r2->base ? -1 : (r1->base > r2->base); ++} ++ ++static void __init sort_memblock_regions(void) ++{ ++ sort(_pkvm_memory, ++ *pkvm_memblock_nr_ptr, ++ sizeof(struct memblock_region), ++ cmp_pkvm_memblock, ++ NULL); ++} ++ ++static int __init register_memblock_regions(void) ++{ ++ struct memblock_region *reg; ++ ++ for_each_mem_region(reg) { ++ if (*pkvm_memblock_nr_ptr >= PKVM_MEMBLOCK_REGIONS) ++ return -ENOMEM; ++ ++ _pkvm_memory[*pkvm_memblock_nr_ptr] = *reg; ++ (*pkvm_memblock_nr_ptr)++; ++ } ++ sort_memblock_regions(); ++ ++ return 0; ++} ++ ++void __init pkvm_reserve(void) ++{ ++ int ret; ++ ++ if (pkvm_pre_reserve_check() < 0) ++ return; ++ ++ ret = register_memblock_regions(); ++ if (ret) { ++ *pkvm_memblock_nr_ptr = 0; ++ kvm_err("Failed to register pkvm memblocks: %d\n", ret); ++ return; ++ } ++ ++ /* ++ * Try to allocate a PMD-aligned region to reduce TLB pressure once ++ * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. ++ */ ++ pkvm_mem_size = pkvm_total_reserve_pages() << PAGE_SHIFT; ++ pkvm_mem_base = memblock_phys_alloc(ALIGN(pkvm_mem_size, PMD_SIZE), ++ PMD_SIZE); ++ if (!pkvm_mem_base) ++ pkvm_mem_base = memblock_phys_alloc(pkvm_mem_size, PAGE_SIZE); ++ else ++ pkvm_mem_size = ALIGN(pkvm_mem_size, PMD_SIZE); ++ ++ if (!pkvm_mem_base) { ++ kvm_err("Failed to reserve pkvm memory\n"); ++ return; ++ } ++ ++ kvm_info("Reserved %lld MiB at 0x%llx\n", pkvm_mem_size >> 20, ++ pkvm_mem_base); ++} +diff --git a/virt/kvm/pkvm/pkvm_spinlock.h b/virt/kvm/pkvm/pkvm_spinlock.h +new file mode 100644 +index 000000000000..d234ed1188e2 +--- /dev/null ++++ b/virt/kvm/pkvm/pkvm_spinlock.h +@@ -0,0 +1,47 @@ ++/* ++ * SPDX-License-Identifier: GPL-2.0 ++ * Copyright (C) 2022 Intel Corporation ++ * ++ * pkvm runs in a self-contained environment ++ * and requires a self-contained spinlock implementation ++ * which doesn't rely on any other external symbols. ++ * ++ * This is a common interface with wrapping the arch ++ * specific implementation. 
++ * */ ++#ifndef __PKVM_SPINLOCK_H ++#define __PKVM_SPINLOCK_H ++ ++#include ++ ++typedef struct pkvm_spinlock { ++ arch_pkvm_spinlock_t pkvm_lock; ++} pkvm_spinlock_t; ++ ++#define __PKVM_SPINLOCK_INITIALIZER \ ++ { .pkvm_lock = __ARCH_PKVM_SPINLOCK_UNLOCKED } ++ ++#define __PKVM_SPINLOCK_UNLOCKED \ ++ ((pkvm_spinlock_t) __PKVM_SPINLOCK_INITIALIZER) ++ ++#define pkvm_spinlock_init(l) \ ++do { \ ++ *(l) = __PKVM_SPINLOCK_UNLOCKED; \ ++} while (0); ++ ++static __always_inline void pkvm_spin_lock(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_spin_lock(&lock->pkvm_lock); ++} ++ ++static __always_inline void pkvm_spin_unlock(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_spin_unlock(&lock->pkvm_lock); ++} ++ ++static __always_inline void pkvm_assert_lock_held(pkvm_spinlock_t *lock) ++{ ++ arch_pkvm_assert_lock_held(&lock->pkvm_lock); ++} ++ ++#endif +diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c +index 365d30779768..c43b975c4729 100644 +--- a/virt/kvm/vfio.c ++++ b/virt/kvm/vfio.c +@@ -144,10 +144,16 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) + mutex_unlock(&kv->lock); + } + ++int __weak kvm_arch_add_device_to_pkvm(struct kvm *kvm, struct iommu_group *grp) ++{ ++ return 0; ++} ++ + static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + { + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_file *kvf; ++ struct iommu_group *iommu_grp; + struct file *filp; + int ret; + +@@ -177,6 +183,11 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + } + + kvf->file = filp; ++ ++ ret = kvm_arch_add_device_to_pkvm(dev->kvm, iommu_grp); ++ if (ret) ++ goto free_kvf; ++ + list_add_tail(&kvf->node, &kv->file_list); + + kvm_arch_start_assignment(dev->kvm); +@@ -187,6 +198,8 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) + kvm_vfio_update_coherency(dev); + + return 0; ++free_kvf: ++ kfree(kvf); + err_unlock: + mutex_unlock(&kv->lock); + err_fput: +-- +2.34.1 + diff --git a/targets/lenovo-x1-carbon.nix b/targets/lenovo-x1-carbon.nix index 7282e98c9..d88e4e5b0 100644 --- a/targets/lenovo-x1-carbon.nix +++ b/targets/lenovo-x1-carbon.nix @@ -197,6 +197,8 @@ ghaf = { host.kernel_hardening.enable = false; + host.hypervisor_hardening.enable = false; + hardware.x86_64.common.enable = true; virtualization.microvm-host.enable = true;