diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -47,6 +47,7 @@ config KVM_X86
select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
select KVM_WERROR if WERROR
+ select HAVE_KVM_USERFAULT
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4292,14 +4292,19 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
- u8 max_level, int gmem_order)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ kvm_pfn_t pfn, u8 max_level,
+ int gmem_order)
{
u8 req_max_level;
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
+ if (kvm_memslot_userfault(slot))
+ return PG_LEVEL_4K;
+
max_level = min(kvm_max_level_for_order(gmem_order), max_level);
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
@@ -4336,8 +4341,10 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
- fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
- fault->max_level, max_order);
+ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->slot,
+ fault->pfn,
+ fault->max_level,
+ max_order);
return RET_PF_CONTINUE;
}
@@ -4346,6 +4353,18 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
unsigned int foll = fault->write ? FOLL_WRITE : 0;
+ int userfault;
+
+ userfault = kvm_gfn_userfault(vcpu->kvm, fault->slot, fault->gfn);
+ if (userfault < 0)
+ return userfault;
+ if (userfault) {
+ kvm_mmu_prepare_userfault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (kvm_memslot_userfault(fault->slot))
+ fault->max_level = PG_LEVEL_4K;
if (fault->is_private)
return kvm_mmu_faultin_pfn_private(vcpu, fault);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -282,12 +282,26 @@ enum {
RET_PF_SPURIOUS,
};
-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static inline void __kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ bool is_userfault)
{
kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
+ fault->is_private,
+ is_userfault);
+}
+
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, false);
+}
+
+static inline void kvm_mmu_prepare_userfault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, true);
}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13053,12 +13053,36 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
u32 new_flags = new ? new->flags : 0;
bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
+ /*
+ * When toggling KVM Userfault on, zap all sptes so that userfault-ness
+ * will be respected at refault time. All new faults will only install
+ * small sptes. Therefore, when toggling it off, recover hugepages.
+ *
+ * For MOVE and DELETE, there will be nothing to do, as the old
+ * mappings will have already been deleted by
+ * kvm_arch_flush_shadow_memslot().
+ *
+ * For CREATE, no mappings will have been created yet.
+ */
+ if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
+ (change == KVM_MR_FLAGS_ONLY)) {
+ if (old_flags & KVM_MEM_USERFAULT)
+ kvm_mmu_recover_huge_pages(kvm, new);
+ else
+ kvm_arch_flush_shadow_memslot(kvm, old);
+ }
+
+ /*
+ * Nothing more to do if dirty logging isn't being toggled.
+ */
+ if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))
+ return;
+
/*
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
- if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
- kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
+ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
* Nothing more to do for RO slots (which can't be dirtied and can't be
@@ -13078,14 +13102,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
- /*
- * READONLY and non-flags changes were filtered out above, and the only
- * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
- * logging isn't being toggled on or off.
- */
- if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
- return;
-
if (!log_dirty_pages) {
/*
* Recover huge page mappings in the slot now that dirty logging
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2465,7 +2465,8 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
gpa_t gpa, gpa_t size,
bool is_write, bool is_exec,
- bool is_private)
+ bool is_private,
+ bool is_userfault)
{
vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
vcpu->run->memory_fault.gpa = gpa;
@@ -2475,6 +2476,8 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
vcpu->run->memory_fault.flags = 0;
if (is_private)
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
+ if (is_userfault)
+ vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
Adhering to the requirements of KVM Userfault:

1. Zap all sptes for the memslot when KVM_MEM_USERFAULT is toggled on
   with kvm_arch_flush_shadow_memslot().
2. Only allow PAGE_SIZE sptes while KVM_MEM_USERFAULT is enabled (for
   both normal/GUP memory and guest_memfd memory).
3. Reconstruct huge mappings when KVM_MEM_USERFAULT is toggled off
   with kvm_mmu_recover_huge_pages().

With the new logic in kvm_mmu_slot_apply_flags(), I've simplified the
two dirty-logging-toggle checks into one and dropped the WARN_ON_ONCE()
that was there.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu/mmu.c          | 27 +++++++++++++++++++++----
 arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++---
 arch/x86/kvm/x86.c              | 36 ++++++++++++++++++++++++---------
 include/linux/kvm_host.h        |  5 ++++-
 5 files changed, 71 insertions(+), 18 deletions(-)
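
For illustration only, not part of the patch: toggling KVM_MEM_USERFAULT
is a flags-only memslot update, i.e. the KVM_MR_FLAGS_ONLY path in
kvm_mmu_slot_apply_flags() above. A minimal userspace sketch, assuming
KVM_MEM_USERFAULT is defined earlier in the series and that the passed-in
region struct already describes the existing slot:

	#include <stdbool.h>
	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/*
	 * Flip KVM_MEM_USERFAULT on a slot whose geometry is unchanged, so
	 * the update is KVM_MR_FLAGS_ONLY: enabling zaps the slot's sptes so
	 * refaults observe userfault-ness; disabling recovers huge mappings.
	 * @r must be identical to the existing slot in every other field.
	 */
	static int set_userfault(int vm_fd,
				 struct kvm_userspace_memory_region2 *r,
				 bool enable)
	{
		if (enable)
			r->flags |= KVM_MEM_USERFAULT;
		else
			r->flags &= ~KVM_MEM_USERFAULT;

		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, r);
	}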
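
Similarly, a sketch of the userspace side of the new exit.
KVM_EXIT_MEMORY_FAULT is delivered with a -1/EFAULT return from KVM_RUN,
and with this patch memory_fault.flags carries
KVM_MEMORY_EXIT_FLAG_USERFAULT; resolve_userfault() is a hypothetical VMM
helper that populates the page and clears the gfn's userfault state (the
per-gfn state itself is set up elsewhere in the series):

	#include <errno.h>
	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Hypothetical: populate the page and clear its userfault state. */
	extern void resolve_userfault(__u64 gpa, __u64 size);

	static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
	{
		for (;;) {
			int ret = ioctl(vcpu_fd, KVM_RUN, 0);

			if (ret < 0 && errno == EFAULT &&
			    run->exit_reason == KVM_EXIT_MEMORY_FAULT &&
			    (run->memory_fault.flags &
			     KVM_MEMORY_EXIT_FLAG_USERFAULT)) {
				/* size is PAGE_SIZE, per
				 * kvm_mmu_prepare_userfault_exit() */
				resolve_userfault(run->memory_fault.gpa,
						  run->memory_fault.size);
				continue; /* re-enter; the vCPU refaults */
			}
			/* other exit reasons / errors handled elsewhere */
			break;
		}
	}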