@@ -2347,7 +2347,9 @@
KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
disabled, all huge pages in a memslot will be eagerly
split when dirty logging is enabled on that memslot. If
- enabled, huge pages will not be eagerly split.
+ enabled, eager page splitting will be performed during
+			the KVM_CLEAR_DIRTY_LOG ioctl, and only for the pages being
+ cleared.
Eager page splitting currently only supports splitting
huge pages mapped by the TDP MMU.
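
For reference, a minimal userspace sketch (an illustration only, not part of this patch; vm_fd is assumed to be an already-created VM file descriptor) of how a VMM opts into KVM_DIRTY_LOG_INITIALLY_SET, the mode that defers eager page splitting to KVM_CLEAR_DIRTY_LOG:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Hypothetical helper: enable manual dirty-log protection with
     * KVM_DIRTY_LOG_INITIALLY_SET on a VM. In this mode, enabling dirty
     * logging on a memslot neither write-protects nor eagerly splits its
     * huge pages; both happen later, per range, in KVM_CLEAR_DIRTY_LOG.
     */
    static int enable_initially_all_set(int vm_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
            cap.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
                          KVM_DIRTY_LOG_INITIALLY_SET;

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
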
@@ -1582,6 +1582,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot,
int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -1360,6 +1360,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
/* Cross two large pages? */
@@ -5834,16 +5837,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+ target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same reasons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
- const struct kvm_memory_slot *memslot,
- int target_level)
+ const struct kvm_memory_slot *memslot,
+ int target_level)
{
u64 start = memslot->base_gfn;
u64 end = start + memslot->npages;
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
read_unlock(&kvm->mmu_lock);
}
@@ -943,27 +943,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
*
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @sp: The new TDP page table to install.
* @account_nx: True if this page table is being installed to split a
* non-executable huge page.
+ * @shared: True if the MMU lock is held in read mode, false if in write mode.
*
* Returns: 0 if the new page table was installed. Non-0 if the page table
* could not be installed (e.g. the atomic compare-exchange failed).
*/
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool account_nx,
+ bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
- int ret;
+ int ret = 0;
- ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
- if (ret)
- return ret;
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_set_spte(kvm, iter, spte);
+ }
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1031,7 +1037,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
- if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+ if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
tdp_mmu_free_sp(sp);
break;
}
@@ -1262,12 +1268,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
}
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
- struct tdp_iter *iter)
+ struct tdp_iter *iter,
+ bool shared)
{
struct kvm_mmu_page *sp;
- lockdep_assert_held_read(&kvm->mmu_lock);
-
/*
* Since we are allocating while under the MMU lock we have to be
* careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1282,20 +1287,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
return sp;
rcu_read_unlock();
- read_unlock(&kvm->mmu_lock);
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
iter->yielded = true;
sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
- read_lock(&kvm->mmu_lock);
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
rcu_read_lock();
return sp;
}
-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
{
const u64 huge_spte = iter->old_spte;
const int level = iter->level;
@@ -1318,7 +1330,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
* correctness standpoint since the translation will be the same either
* way.
*/
- ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+ ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
if (ret)
return ret;
@@ -1335,7 +1347,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
struct kvm_mmu_page *root,
gfn_t start, gfn_t end,
- int target_level)
+ int target_level, bool shared)
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
@@ -1356,14 +1368,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
*/
for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
continue;
if (!sp) {
- sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
if (!sp) {
ret = -ENOMEM;
break;
@@ -1373,7 +1385,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
continue;
}
- if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
goto retry;
sp = NULL;
@@ -1393,23 +1405,24 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
return ret;
}
+
/*
* Try to split all huge pages mapped by the TDP MMU down to the target level.
*/
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t start, gfn_t end,
- int target_level)
+ int target_level, bool shared)
{
struct kvm_mmu_page *root;
int r = 0;
- lockdep_assert_held_read(&kvm->mmu_lock);
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
- r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
if (r) {
- kvm_tdp_mmu_put_root(kvm, root, true);
+ kvm_tdp_mmu_put_root(kvm, root, shared);
break;
}
}
@@ -74,7 +74,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t start, gfn_t end,
- int target_level);
+ int target_level, bool shared);
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
/*
@@ -352,6 +352,8 @@ extern int pi_inject_timer;
extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not
write-protected when dirty logging is enabled on the memslot. Instead
they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for
the first time and only for the specific sub-region being cleared.

Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to
write-protecting to avoid causing write-protection faults on vCPU
threads. This also allows userspace to smear the cost of huge page
splitting across multiple ioctls rather than splitting the entire
memslot when not using initially-all-set.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 .../admin-guide/kernel-parameters.txt |  4 +-
 arch/x86/include/asm/kvm_host.h       |  4 ++
 arch/x86/kvm/mmu/mmu.c                | 25 ++++++-
 arch/x86/kvm/mmu/tdp_mmu.c            | 67 +++++++++++--------
 arch/x86/kvm/mmu/tdp_mmu.h            |  2 +-
 arch/x86/kvm/x86.c                    |  2 +-
 arch/x86/kvm/x86.h                    |  2 +
 7 files changed, 73 insertions(+), 33 deletions(-)
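
To illustrate the userspace flow this change optimizes, a hedged sketch (not from this series; vm_fd, slot, and dirty_bitmap are assumed to already exist and the memslot to use manual dirty-log protection) of clearing one aligned chunk of a memslot's dirty log. After this patch, each such call also eagerly splits the huge pages backing only that chunk before write-protecting it:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Hypothetical helper: clear one chunk of a memslot's dirty bitmap.
     * KVM_CLEAR_DIRTY_LOG write-protects (and, with eager_page_split=Y,
     * first splits the huge pages backing) only the range
     * [first_page, first_page + num_pages). first_page must be a multiple
     * of 64, and num_pages a multiple of 64 unless the chunk reaches the
     * end of the memslot.
     */
    static int clear_dirty_chunk(int vm_fd, __u32 slot, __u64 first_page,
                                 __u32 num_pages, void *dirty_bitmap)
    {
            struct kvm_clear_dirty_log clear = {
                    .slot = slot,
                    .num_pages = num_pages,
                    .first_page = first_page,
                    .dirty_bitmap = dirty_bitmap,
            };

            return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
    }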