diff mbox series

[v4,2/3] kvm: arm64: set io memory s2 pte as normalnc for vfio pci devices

Message ID 20231218090719.22250-3-ankita@nvidia.com (mailing list archive)
State New
Headers show
Series kvm: arm64: allow the VM to select DEVICE_* and NORMAL_NC for IO memory | expand

Commit Message

Ankit Agrawal Dec. 18, 2023, 9:07 a.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

To provide VM with the ability to get device IO memory with NormalNC
property, map device MMIO in KVM for ARM64 at stage2 as NormalNC.
Having NormalNC S2 default puts guests in control (based on [1],
"Combining stage 1 and stage 2 memory type attributes") of device
MMIO regions memory mappings. The rules are summarized below:
([(S1) - stage1], [(S2) - stage 2])

S1           |  S2           | Result
NORMAL-WB    |  NORMAL-NC    | NORMAL-NC
NORMAL-WT    |  NORMAL-NC    | NORMAL-NC
NORMAL-NC    |  NORMAL-NC    | NORMAL-NC
DEVICE<attr> |  NORMAL-NC    | DEVICE<attr>

Generalizing this to non PCI devices may be problematic. E.g. GICv2
vCPU interface, which is effectively a shared peripheral, can allow
a guest to affect another guest's interrupt distribution. The issue
may be solved by limiting the relaxation to mappings that have a user
VMA. Still There is insufficient information and uncertainity in the
behavior of non PCI driver. Hence caution is maintained and the change
is restricted to the VFIO-PCI devices. PCIe on the other hand is safe
because the PCI bridge does not generate errors, and thus do not cause
uncontained failures.

A new flag VM_VFIO_ALLOW_WC to indicate KVM that the device is WC capable.
KVM use this flag to activate the code.

This could be extended to other devices in the future once that
is deemed safe.

[1] section D8.5.5 of DDI0487J_a_a-profile_architecture_reference_manual.pdf

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Ankit Agrawal <ankita@nvidia.com>
---
 arch/arm64/kvm/mmu.c | 18 ++++++++++++++----
 include/linux/mm.h   | 13 +++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

Comments

Marc Zyngier Dec. 20, 2023, 2:16 p.m. UTC | #1
On Mon, 18 Dec 2023 09:07:18 +0000,
<ankita@nvidia.com> wrote:
> 
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> To provide VM with the ability to get device IO memory with NormalNC
> property, map device MMIO in KVM for ARM64 at stage2 as NormalNC.
> Having NormalNC S2 default puts guests in control (based on [1],
> "Combining stage 1 and stage 2 memory type attributes") of device
> MMIO regions memory mappings. The rules are summarized below:
> ([(S1) - stage1], [(S2) - stage 2])
> 
> S1           |  S2           | Result
> NORMAL-WB    |  NORMAL-NC    | NORMAL-NC
> NORMAL-WT    |  NORMAL-NC    | NORMAL-NC
> NORMAL-NC    |  NORMAL-NC    | NORMAL-NC
> DEVICE<attr> |  NORMAL-NC    | DEVICE<attr>
> 
> Generalizing this to non PCI devices may be problematic. E.g. GICv2
> vCPU interface, which is effectively a shared peripheral, can allow
> a guest to affect another guest's interrupt distribution. The issue
> may be solved by limiting the relaxation to mappings that have a user
> VMA. Still There is insufficient information and uncertainity in the
> behavior of non PCI driver. Hence caution is maintained and the change
> is restricted to the VFIO-PCI devices. PCIe on the other hand is safe
> because the PCI bridge does not generate errors, and thus do not cause
> uncontained failures.
> 
> A new flag VM_VFIO_ALLOW_WC to indicate KVM that the device is WC capable.
> KVM use this flag to activate the code.
> 
> This could be extended to other devices in the future once that
> is deemed safe.
> 
> [1] section D8.5.5 of DDI0487J_a_a-profile_architecture_reference_manual.pdf
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
> Acked-by: Jason Gunthorpe <jgg@nvidia.com>
> Tested-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  arch/arm64/kvm/mmu.c | 18 ++++++++++++++----
>  include/linux/mm.h   | 13 +++++++++++++
>  2 files changed, 27 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d14504821b79..e1e6847a793b 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1381,7 +1381,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	int ret = 0;
>  	bool write_fault, writable, force_pte = false;
>  	bool exec_fault, mte_allowed;
> -	bool device = false;
> +	bool device = false, vfio_allow_wc = false;
>  	unsigned long mmu_seq;
>  	struct kvm *kvm = vcpu->kvm;
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> @@ -1472,6 +1472,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	gfn = fault_ipa >> PAGE_SHIFT;
>  	mte_allowed = kvm_vma_mte_allowed(vma);
>  
> +	vfio_allow_wc = (vma->vm_flags & VM_VFIO_ALLOW_WC);
> +
>  	/* Don't use the VMA after the unlock -- it may have vanished */
>  	vma = NULL;
>  
> @@ -1557,10 +1559,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (exec_fault)
>  		prot |= KVM_PGTABLE_PROT_X;
>  
> -	if (device)
> -		prot |= KVM_PGTABLE_PROT_DEVICE;
> -	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
> +	if (device) {
> +		/*
> +		 * To provide VM with the ability to get device IO memory
> +		 * with NormalNC property, map device MMIO as NormalNC in S2.
> +		 */
> +		if (vfio_allow_wc)
> +			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
> +		else
> +			prot |= KVM_PGTABLE_PROT_DEVICE;
> +	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
>  		prot |= KVM_PGTABLE_PROT_X;
> +	}
>  
>  	/*
>  	 * Under the premise of getting a FSC_PERM fault, we just need to relax
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2bea89dc0bdf..d2f0f969875c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -391,6 +391,19 @@ extern unsigned int kobjsize(const void *objp);
>  # define VM_UFFD_MINOR		VM_NONE
>  #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
>  
> +/* This flag is used to connect VFIO to arch specific KVM code. It
> + * indicates that the memory under this VMA is safe for use with any
> + * non-cachable memory type inside KVM. Some VFIO devices, on some
> + * platforms, are thought to be unsafe and can cause machine crashes if
> + * KVM does not lock down the memory type.
> + */

Comment format.

> +#ifdef CONFIG_64BIT
> +#define VM_VFIO_ALLOW_WC_BIT	39
> +#define VM_VFIO_ALLOW_WC	BIT(VM_VFIO_ALLOW_WC_BIT)
> +#else
> +#define VM_VFIO_ALLOW_WC	VM_NONE
> +#endif
> +
>  /* Bits set in the VMA until the stack is in its final location */
>  #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

The mm.h change should be standalone, separate from the KVM stuff.

	M.
diff mbox series

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d14504821b79..e1e6847a793b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1381,7 +1381,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	int ret = 0;
 	bool write_fault, writable, force_pte = false;
 	bool exec_fault, mte_allowed;
-	bool device = false;
+	bool device = false, vfio_allow_wc = false;
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1472,6 +1472,8 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	gfn = fault_ipa >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
 
+	vfio_allow_wc = (vma->vm_flags & VM_VFIO_ALLOW_WC);
+
 	/* Don't use the VMA after the unlock -- it may have vanished */
 	vma = NULL;
 
@@ -1557,10 +1559,18 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (exec_fault)
 		prot |= KVM_PGTABLE_PROT_X;
 
-	if (device)
-		prot |= KVM_PGTABLE_PROT_DEVICE;
-	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+	if (device) {
+		/*
+		 * To provide VM with the ability to get device IO memory
+		 * with NormalNC property, map device MMIO as NormalNC in S2.
+		 */
+		if (vfio_allow_wc)
+			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
+		else
+			prot |= KVM_PGTABLE_PROT_DEVICE;
+	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
 		prot |= KVM_PGTABLE_PROT_X;
+	}
 
 	/*
 	 * Under the premise of getting a FSC_PERM fault, we just need to relax
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2bea89dc0bdf..d2f0f969875c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -391,6 +391,19 @@  extern unsigned int kobjsize(const void *objp);
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 
+/* This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes if
+ * KVM does not lock down the memory type.
+ */
+#ifdef CONFIG_64BIT
+#define VM_VFIO_ALLOW_WC_BIT	39
+#define VM_VFIO_ALLOW_WC	BIT(VM_VFIO_ALLOW_WC_BIT)
+#else
+#define VM_VFIO_ALLOW_WC	VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)