@@ -160,7 +160,8 @@ void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
- phys_addr_t pa, unsigned long size, bool writable);
+ phys_addr_t pa, unsigned long size, bool writable,
+ bool writecombine);
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
@@ -35,6 +35,7 @@ struct kvm_pgtable {
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ * @KVM_PGTABLE_PROT_WC: Normal non-cacheable (WC).
*/
enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_X = BIT(0),
@@ -42,6 +43,7 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
+ KVM_PGTABLE_PROT_WC = BIT(4),
};
#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
@@ -144,13 +144,15 @@
* Memory types for Stage-2 translation
*/
#define MT_S2_NORMAL 0xf
+#define MT_S2_WRITE_COMBINE 5
#define MT_S2_DEVICE_nGnRE 0x1
/*
* Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
- * Stage-2 enforces Normal-WB and Device-nGnRE
+ * Stage-2 enforces Normal-WB, Normal-NC and Device-nGnRE
*/
#define MT_S2_FWB_NORMAL 6
+#define MT_S2_FWB_WRITE_COMBINE 5
#define MT_S2_FWB_DEVICE_nGnRE 1
#ifdef CONFIG_ARM64_4K_PAGES
@@ -444,9 +444,14 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
struct stage2_map_data *data)
{
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
- kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
- PAGE_S2_MEMATTR(NORMAL);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ kvm_pte_t attr = PAGE_S2_MEMATTR(NORMAL);
+
+ if (device) {
+ attr = (prot & KVM_PGTABLE_PROT_WC) ?
+ PAGE_S2_MEMATTR(WRITE_COMBINE) :
+ PAGE_S2_MEMATTR(DEVICE_nGnRE);
+ }
if (!(prot & KVM_PGTABLE_PROT_X))
attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
@@ -487,6 +487,16 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
}
}
+/**
+ * is_vma_write_combine - check if VMA is mapped with writecombine or not
+ * Return: true if the VMA is mapped with MT_NORMAL_NC, otherwise false
+ */
+static inline bool is_vma_write_combine(struct vm_area_struct *vma)
+{
+ pteval_t pteval = pgprot_val(vma->vm_page_prot);
+ return ((pteval & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_NC));
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -495,9 +505,11 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
* @pa: The physical address of the device
* @size: The size of the mapping
* @writable: Whether or not to create a writable mapping
+ * @writecombine: Whether or not to create a writecombine mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
- phys_addr_t pa, unsigned long size, bool writable)
+ phys_addr_t pa, unsigned long size, bool writable,
+ bool writecombine)
{
phys_addr_t addr;
int ret = 0;
@@ -505,6 +517,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_R |
+ (writecombine ? KVM_PGTABLE_PROT_WC : 0) |
(writable ? KVM_PGTABLE_PROT_W : 0);
size += offset_in_page(guest_ipa);
@@ -891,7 +904,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
+ prot |= KVM_PGTABLE_PROT_DEVICE |
+ (is_vma_write_combine(vma) ? KVM_PGTABLE_PROT_WC : 0);
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
@@ -1357,7 +1371,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
vm_end - vm_start,
- writable);
+ writable,
+ is_vma_write_combine(vma));
if (ret)
break;
}
@@ -336,7 +336,7 @@ int vgic_v2_map_resources(struct kvm *kvm)
if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
kvm_vgic_global_state.vcpu_base,
- KVM_VGIC_V2_CPU_SIZE, true);
+ KVM_VGIC_V2_CPU_SIZE, true, false);
if (ret) {
kvm_err("Unable to remap VGIC CPU to VCPU\n");
return ret;
In the current implementation, device memory is always mapped as DEVICE_nGnRE in stage-2. In the host kernel, device drivers have the flexibility to choose either the Device memory type or write-combine (Normal non-cacheable), depending on the use case. The PCI specification has the concept of a prefetchable BAR, where multiple writes can be combined and reads have no side effects. This provides a huge performance improvement and also allows unaligned accesses. NVIDIA GPU PCIe devices have 3 BAR regions. Two regions are mapped to video/compute memory and marked as prefetchable. The GPU driver takes advantage of the write-combine feature for higher performance. The same driver has no issues in the host kernel but crashes inside a virtual machine because of unaligned accesses. This patch looks up the PTE attributes of the device memory in the VMA and updates the stage-2 attribute to NORMAL_NC for write-combine regions, keeping the default type DEVICE_nGnRE for non-WC regions. Change-Id: Ibaea69c7a301df3c86609e871f6d066728391080 Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com> --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- arch/arm64/include/asm/kvm_pgtable.h | 2 ++ arch/arm64/include/asm/memory.h | 4 +++- arch/arm64/kvm/hyp/pgtable.c | 9 +++++++-- arch/arm64/kvm/mmu.c | 21 ++++++++++++++++++--- arch/arm64/kvm/vgic/vgic-v2.c | 2 +- 6 files changed, 33 insertions(+), 8 deletions(-)