@@ -33,10 +33,9 @@
#define KVM_VCPU_MAX_FEATURES 1
-/* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES 1
-#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
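+/*
+ * Stage-2 huge pages are level 2 (pmd) sections: the shift for level 2 is
+ * (2 - 1) * 21 = 21, so with 4K pages KVM_HPAGE_SIZE is 2MB and
+ * KVM_PAGES_PER_HPAGE is 512.
+ */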
+#define KVM_HPAGE_GFN_SHIFT(_level) (((_level) - 1) * 21)
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_GFN_SHIFT(2))
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
#include <kvm/arm_vgic.h>
@@ -105,7 +105,8 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
struct kvm;
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+ unsigned long size)
{
/*
* If we are going to insert an instruction page and the icache is
@@ -120,8 +121,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
* need any kind of flushing (DDI 0406C.b - Page B3-1392).
*/
if (icache_is_pipt()) {
- unsigned long hva = gfn_to_hva(kvm, gfn);
- __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
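+ /* Callers now pass the full mapped extent, possibly a huge page. */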
+ __cpuc_coherent_user_range(hva, hva + size);
} else if (!icache_is_vivt_asid_tagged()) {
/* any kind of VIPT cache */
__flush_icache_all();
@@ -19,6 +19,7 @@
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
+#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
@@ -87,19 +88,27 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
- pmd_t *pmd_table = pmd_offset(pud, 0);
- pud_clear(pud);
- kvm_tlb_flush_vmid_ipa(kvm, addr);
- pmd_free(NULL, pmd_table);
+ if (pud_huge(*pud)) {
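+ /*
+ * A huge pud maps the range directly: there is no pmd
+ * table underneath it to free.
+ */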
+ pud_clear(pud);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ pmd_t *pmd_table = pmd_offset(pud, 0);
+ pud_clear(pud);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ pmd_free(NULL, pmd_table);
+ }
put_page(virt_to_page(pud));
}
static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(kvm, addr);
- pte_free_kernel(NULL, pte_table);
+ if (pmd_huge(*pmd)) {
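+ /*
+ * A huge pmd is a section mapping: there is no pte
+ * table underneath it to free.
+ */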
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ pte_free_kernel(NULL, pte_table);
+ }
put_page(virt_to_page(pmd));
}
@@ -142,12 +151,34 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
continue;
}
+ if (pud_huge(*pud)) {
+ /*
+ * If we are dealing with a huge pud, just clear it and
+ * move on.
+ */
+ clear_pud_entry(kvm, pud, addr);
+ addr += PUD_SIZE;
+ continue;
+ }
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
addr += PMD_SIZE;
continue;
}
+ if (pmd_huge(*pmd)) {
+ /*
+ * If we are dealing with a huge pmd, just clear it and
+ * walk back up the ladder.
+ */
+ clear_pmd_entry(kvm, pmd, addr);
+ if (pmd_empty(pmd))
+ clear_pud_entry(kvm, pud, addr);
+ addr += PMD_SIZE;
+ continue;
+ }
+
pte = pte_offset_kernel(pmd, addr);
clear_pte_entry(kvm, pte, addr);
range = PAGE_SIZE;
@@ -432,7 +463,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
{
pgd_t *pgd;
pud_t *pud;
- pmd_t *pmd;
+ pmd_t *pmd, old_pmd;
pte_t *pte, old_pte;
/* Create 2nd stage page table mapping - Level 1 */
@@ -448,7 +479,22 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
pmd = pmd_offset(pud, addr);
- /* Create 2nd stage page table mapping - Level 2 */
+ /* Create 2nd stage section mappings (huge pages) - Level 2 */
+ if (pte_huge(*new_pte) || pmd_huge(*pmd)) {
+ pte_t *huge_pte = (pte_t *)pmd;
+ VM_BUG_ON(pmd_present(*pmd) && !pmd_huge(*pmd));
+
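+ /*
+ * If an entry was already present, its stale translations must
+ * be flushed; a brand new entry instead takes a reference on the
+ * pmd table page, balancing the put_page() in clear_pmd_entry().
+ */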
+ old_pmd = *pmd;
+ kvm_set_pte(huge_pte, *new_pte); /* new_pte really new_pmd */
+ if (pmd_present(old_pmd))
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ else
+ get_page(virt_to_page(pmd));
+ return 0;
+ }
+
+ /* Create 2nd stage page mappings - Level 2 */
+ BUG_ON(pmd_present(*pmd) && pmd_huge(*pmd));
if (pmd_none(*pmd)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
@@ -514,16 +560,55 @@ out:
return ret;
}
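+/*
+ * If the faulting pfn lies inside a transparent huge page, adjust *pfnp and
+ * *ipap down to the head of that THP so the fault can be mapped with a
+ * stage-2 section. Returns true when a huge mapping is possible.
+ */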
+static bool transparent_hugepage_adjust(struct kvm *kvm, pfn_t *pfnp,
+ phys_addr_t *ipap)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+ if (PageTransCompound(pfn_to_page(pfn))) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't be split out from
+ * under us, and in turn __split_huge_page_refcount()
+ * can't run either, so we can safely transfer the
+ * refcount from PG_tail to PG_head as we switch the
+ * pfn from tail to head.
+ */
+ mask = KVM_PAGES_PER_HPAGE - 1;
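+ /* e.g. 2MB THPs: mask = 511, a page's offset within its THP. */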
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *ipap &= ~(KVM_HPAGE_SIZE - 1);
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ kvm_get_pfn(pfn);
+ *pfnp = pfn;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- gfn_t gfn, struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot *memslot,
unsigned long fault_status)
{
- pte_t new_pte;
- pfn_t pfn;
int ret;
- bool write_fault, writable;
+ bool write_fault, writable, hugetlb = false, force_pte = false;
unsigned long mmu_seq;
+ gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+ unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+ struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+ struct vm_area_struct *vma;
+ pfn_t pfn;
+ pte_t new_pte;
+ unsigned long psize;
write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
if (fault_status == FSC_PERM && !write_fault) {
@@ -531,6 +616,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
+ /* Let's check if we will get back a huge page backed by hugetlbfs */
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma_intersection(current->mm, hva, hva + 1);
+ if (is_vm_hugetlb_page(vma)) {
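+ /* Align hva and gfn to the 2MB section so the whole huge page is mapped. */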
+ hugetlb = true;
+ hva &= PMD_MASK;
+ gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+ psize = PMD_SIZE;
+ } else {
+ psize = PAGE_SIZE;
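+ /*
+ * Pages in VMAs not aligned to PMD size cannot be mapped
+ * with block descriptors: a block would cover more than
+ * a single THP and break unmap/update atomicity.
+ */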
+ if (vma->vm_start & ~PMD_MASK)
+ force_pte = true;
+ }
+ up_read(&current->mm->mmap_sem);
+
/* We need minimum second+third level pages */
ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
if (ret)
@@ -548,26 +654,24 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
*/
smp_rmb();
- pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+ pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
if (is_error_pfn(pfn))
return -EFAULT;
- new_pte = pfn_pte(pfn, PAGE_S2);
- coherent_icache_guest_page(vcpu->kvm, gfn);
+ coherent_icache_guest_page(kvm, hva, psize);
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
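+ /*
+ * Only now, under mmu_lock with mmu_notifier_retry checked, is it
+ * safe to look for a THP and retreat to its head page.
+ */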
+ if (!hugetlb && !force_pte)
+ hugetlb = transparent_hugepage_adjust(kvm, &pfn, &fault_ipa);
+ new_pte = pfn_pte(pfn, PAGE_S2);
+ if (hugetlb)
+ new_pte = pte_mkhuge(new_pte);
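+ /* pte_mkhuge makes stage2_set_pte take the section path above. */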
if (writable) {
kvm_set_s2pte_writable(&new_pte);
kvm_set_pfn_dirty(pfn);
}
- stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+ ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ spin_unlock(&kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return ret;
}
/**
@@ -636,7 +740,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
memslot = gfn_to_memslot(vcpu->kvm, gfn);
- ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+ ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
if (ret == 0)
ret = 1;
out_unlock: