@@ -1305,6 +1305,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
+
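+/*
+ * Dirty logging for the direct MMU can be driven in one of two modes:
+ * write protection, where the first write to a page after logging is
+ * enabled takes a fault which marks the page dirty, or PML, where
+ * hardware logs the GPA of each page whose dirty bit it sets.
+ */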
+#define KVM_DIRTY_LOG_MODE_WRPROT 1
+#define KVM_DIRTY_LOG_MODE_PML 2
+
+void kvm_mmu_zap_collapsible_direct_ptes(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void reset_direct_mmu_dirty_logging(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ int dirty_log_mode,
+ bool record_dirty_pages);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -790,6 +790,18 @@ static bool is_accessed_direct_pte(u64 pte, int level)
return pte & shadow_acc_track_mask;
}
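+
+/*
+ * Check whether a PTE counts as dirty under the given dirty logging mode:
+ * with wrprot based logging a present, writable PTE implies the page may
+ * have been written, while with PML based logging the dirty bit is checked
+ * directly.
+ */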
+static bool is_dirty_direct_pte(u64 pte, int dlog_mode)
+{
+ /* If the pte is non-present, the entry cannot have been dirtied. */
+ if (!is_present_direct_pte(pte))
+ return false;
+
+ if (dlog_mode == KVM_DIRTY_LOG_MODE_WRPROT)
+ return pte & PT_WRITABLE_MASK;
+
+ return pte & shadow_dirty_mask;
+}
+
static bool is_accessed_spte(u64 spte)
{
u64 accessed_mask = spte_shadow_accessed_mask(spte);
@@ -1743,6 +1755,38 @@ static void handle_changed_pte_acc_track(u64 old_pte, u64 new_pte, int level)
kvm_set_pfn_accessed(spte_to_pfn(old_pte));
}
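+
+/*
+ * Update dirty logging state in response to a PTE change: if a last level
+ * PTE is becoming writable, record its gfn in the memslot's dirty bitmap.
+ */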
+static void handle_changed_pte_dlog(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_pte, u64 new_pte, int level)
+{
+ bool pfn_changed = spte_to_pfn(old_pte) != spte_to_pfn(new_pte);
+ bool was_wrprot_dirty = is_dirty_direct_pte(old_pte,
+ KVM_DIRTY_LOG_MODE_WRPROT);
+ bool is_wrprot_dirty = is_dirty_direct_pte(new_pte,
+ KVM_DIRTY_LOG_MODE_WRPROT);
+ bool wrprot_dirty = (!was_wrprot_dirty || pfn_changed) &&
+ is_wrprot_dirty;
+ struct kvm_memory_slot *slot;
+
+ if (level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ /*
+ * Mark the page dirty if the PTE is becoming writable, or now maps a
+ * new pfn writably, regardless of which dirty logging mode is in use.
+ * Marking pages dirty when the dbit is cleared would instead create
+ * spurious dirty pages whenever dbit dirty logging isn't enabled, e.g.
+ * from zapping PTEs during VM teardown. Conversely, marking pages dirty
+ * on becoming writable only while wrprot dirty logging is enabled could
+ * lose dirty pages when switching from dbit to wrprot dirty logging.
+ */
+ if (wrprot_dirty) {
+ slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
+ mark_page_dirty_in_slot(slot, gfn);
+ }
+}
+
/*
* Takes a snapshot of, and clears, the direct MMU disconnected pt list. Once
* TLBs have been flushed, this snapshot can be transferred to the direct MMU
@@ -1873,6 +1917,8 @@ static void mark_pte_disconnected(struct kvm *kvm, int as_id, gfn_t gfn,
handle_changed_pte(kvm, as_id, gfn, old_pte, DISCONNECTED_PTE, level,
vm_teardown, disconnected_pts);
handle_changed_pte_acc_track(old_pte, DISCONNECTED_PTE, level);
+ handle_changed_pte_dlog(kvm, as_id, gfn, old_pte, DISCONNECTED_PTE,
+ level);
}
/**
@@ -1964,6 +2010,14 @@ static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
bool was_present = is_present_direct_pte(old_pte);
bool is_present = is_present_direct_pte(new_pte);
bool was_leaf = was_present && is_last_spte(old_pte, level);
+ bool was_dirty = is_dirty_direct_pte(old_pte,
+ KVM_DIRTY_LOG_MODE_WRPROT) ||
+ is_dirty_direct_pte(old_pte,
+ KVM_DIRTY_LOG_MODE_PML);
+ bool is_dirty = is_dirty_direct_pte(new_pte,
+ KVM_DIRTY_LOG_MODE_WRPROT) ||
+ is_dirty_direct_pte(new_pte,
+ KVM_DIRTY_LOG_MODE_PML);
bool pfn_changed = spte_to_pfn(old_pte) != spte_to_pfn(new_pte);
int child_level;
@@ -1990,6 +2044,9 @@ static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
return;
}
+ if (was_leaf && was_dirty && (!is_dirty || pfn_changed))
+ kvm_set_pfn_dirty(spte_to_pfn(old_pte));
+
if (was_present && !was_leaf && (pfn_changed || !is_present)) {
/*
* The level of the page table being freed is one level lower
@@ -2439,7 +2496,8 @@ static bool cmpxchg_pte(u64 *ptep, u64 old_pte, u64 new_pte, int level, u64 gfn)
}
static bool direct_walk_iterator_set_pte_raw(struct direct_walk_iterator *iter,
- u64 new_pte, bool handle_acc_track)
+ u64 new_pte, bool handle_acc_track,
+ bool handle_dlog)
{
bool r;
@@ -2464,6 +2522,11 @@ static bool direct_walk_iterator_set_pte_raw(struct direct_walk_iterator *iter,
if (handle_acc_track)
handle_changed_pte_acc_track(iter->old_pte, new_pte,
iter->level);
+ if (handle_dlog)
+ handle_changed_pte_dlog(iter->kvm, iter->as_id,
+ iter->pte_gfn_start,
+ iter->old_pte, new_pte,
+ iter->level);
if (iter->lock_mode & (MMU_WRITE_LOCK | MMU_READ_LOCK))
iter->tlbs_dirty++;
@@ -2476,13 +2539,19 @@ static bool direct_walk_iterator_set_pte_raw(struct direct_walk_iterator *iter,
static bool direct_walk_iterator_set_pte_no_acc_track(
struct direct_walk_iterator *iter, u64 new_pte)
{
- return direct_walk_iterator_set_pte_raw(iter, new_pte, false);
+ return direct_walk_iterator_set_pte_raw(iter, new_pte, false, true);
+}
+
+static bool direct_walk_iterator_set_pte_no_dlog(
+ struct direct_walk_iterator *iter, u64 new_pte)
+{
+ return direct_walk_iterator_set_pte_raw(iter, new_pte, true, false);
}
static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
u64 new_pte)
{
- return direct_walk_iterator_set_pte_raw(iter, new_pte, true);
+ return direct_walk_iterator_set_pte_raw(iter, new_pte, true, true);
}
static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
@@ -2500,6 +2569,83 @@ static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
return pte;
}
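+
+/*
+ * Clear the bit in a PTE which marks it as dirty under the given dirty
+ * logging mode: the writable bit for wrprot based logging, so the next
+ * write faults, or the dirty bit for PML based logging, so the next write
+ * is logged by hardware.
+ */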
+static u64 mark_direct_pte_for_dirty_track(u64 pte, int dlog_mode)
+{
+ if (dlog_mode == KVM_DIRTY_LOG_MODE_WRPROT)
+ pte &= ~PT_WRITABLE_MASK;
+ else
+ pte &= ~shadow_dirty_mask;
+
+ return pte;
+}
+
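+/**
+ * reset_direct_mmu_dirty_logging - reset a memslot's PTEs for dirty logging
+ * @kvm: kvm instance
+ * @slot: the memory slot whose mappings should be reset
+ * @dirty_log_mode: whether to clear writable bits or dirty bits
+ * @record_dirty_pages: whether PTEs which were dirty should be recorded in
+ *     the dirty bitmap as their dirty state is cleared
+ *
+ * Clears the dirty state of the present leaf PTEs mapping the slot so that
+ * subsequent writes will be caught by dirty logging, flushing remote TLBs
+ * if any PTEs were changed.
+ */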
+void reset_direct_mmu_dirty_logging(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ int dirty_log_mode, bool record_dirty_pages)
+{
+ struct direct_walk_iterator iter;
+ u64 new_pte;
+ bool pte_set;
+
+ write_lock(&kvm->mmu_lock);
+
+ direct_walk_iterator_setup_walk(&iter, kvm, slot->as_id, slot->base_gfn,
+ slot->base_gfn + slot->npages,
+ MMU_WRITE_LOCK);
+ while (direct_walk_iterator_next_present_leaf_pte(&iter)) {
+ if (iter.level == PT_PAGE_TABLE_LEVEL &&
+ !is_dirty_direct_pte(iter.old_pte, dirty_log_mode))
+ continue;
+
+ new_pte = mark_direct_pte_for_dirty_track(iter.old_pte,
+ dirty_log_mode);
+
+ if (record_dirty_pages)
+ pte_set = direct_walk_iterator_set_pte(&iter, new_pte);
+ else
+ pte_set = direct_walk_iterator_set_pte_no_dlog(&iter,
+ new_pte);
+ if (!pte_set)
+ continue;
+ }
+ if (direct_walk_iterator_end_traversal(&iter))
+ kvm_flush_remote_tlbs(kvm);
+ write_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(reset_direct_mmu_dirty_logging);
+
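+/*
+ * Clear the dirty state of the last level PTEs selected by @mask, relative
+ * to @gfn, e.g. bit 2 of @mask selects the PTE mapping @gfn + 2. Bits are
+ * cleared from @mask as the corresponding PTEs are reset, and the walk
+ * stops early once the mask is empty. Returns true if any PTE was changed
+ * and a TLB flush is required.
+ */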
+static bool clear_direct_dirty_log_gfn_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask,
+ int dirty_log_mode, enum mmu_lock_mode lock_mode)
+{
+ struct direct_walk_iterator iter;
+ u64 new_pte;
+
+ direct_walk_iterator_setup_walk(&iter, kvm, slot->as_id,
+ gfn + __ffs(mask), gfn + BITS_PER_LONG, lock_mode);
+ while (mask && direct_walk_iterator_next_present_leaf_pte(&iter)) {
+ if (iter.level > PT_PAGE_TABLE_LEVEL) {
+ BUG_ON(iter.old_pte & PT_WRITABLE_MASK);
+ continue;
+ }
+
+ if (!is_dirty_direct_pte(iter.old_pte, dirty_log_mode))
+ continue;
+
+ if (!(mask & (1UL << (iter.pte_gfn_start - gfn))))
+ continue;
+
+ new_pte = mark_direct_pte_for_dirty_track(iter.old_pte,
+ dirty_log_mode);
+
+ if (!direct_walk_iterator_set_pte_no_dlog(&iter, new_pte))
+ continue;
+
+ mask &= ~(1UL << (iter.pte_gfn_start - gfn));
+ }
+ return direct_walk_iterator_end_traversal(&iter);
+}
+
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -2509,12 +2655,24 @@ static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
*
* Used when we do not need to care about huge page mappings: e.g. during dirty
* logging we do not have any such mappings.
+ *
+ * We don't need to worry about flushing TLBs here as they are flushed
+ * unconditionally at a higher level. See the comments on
+ * kvm_vm_ioctl_get_dirty_log and kvm_mmu_slot_remove_write_access.
*/
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
struct kvm_rmap_head *rmap_head;
+ gfn_t gfn = slot->base_gfn + gfn_offset;
+
+ if (kvm->arch.direct_mmu_enabled)
+ clear_direct_dirty_log_gfn_masked(kvm, slot, gfn, mask,
+ KVM_DIRTY_LOG_MODE_WRPROT,
+ MMU_WRITE_LOCK);
+ if (kvm->arch.pure_direct_mmu)
+ return;
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
@@ -2541,6 +2699,16 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask)
{
struct kvm_rmap_head *rmap_head;
+ gfn_t gfn = slot->base_gfn + gfn_offset;
+
+ if (!mask)
+ return;
+
+ if (kvm->arch.direct_mmu_enabled)
+ clear_direct_dirty_log_gfn_masked(kvm, slot, gfn, mask,
+ KVM_DIRTY_LOG_MODE_PML, MMU_WRITE_LOCK);
+ if (kvm->arch.pure_direct_mmu)
+ return;
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
@@ -3031,6 +3199,7 @@ static int age_direct_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
iter.old_pte);
new_pte |= shadow_acc_track_value;
}
+ new_pte &= ~shadow_dirty_mask;
/*
* We've created a new pte with the accessed state cleared.
@@ -7293,11 +7462,17 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- bool flush;
+ bool flush = false;
+
+ if (kvm->arch.direct_mmu_enabled)
+ reset_direct_mmu_dirty_logging(kvm, memslot,
+ KVM_DIRTY_LOG_MODE_WRPROT, false);
write_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
- false);
+ if (!kvm->arch.pure_direct_mmu)
+ flush = slot_handle_all_level(kvm, memslot,
+ slot_rmap_write_protect,
+ false);
write_unlock(&kvm->mmu_lock);
/*
@@ -7367,8 +7542,42 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
write_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
- kvm_mmu_zap_collapsible_spte, true);
+ if (!kvm->arch.pure_direct_mmu)
+ slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+ kvm_mmu_zap_collapsible_spte, true);
+ write_unlock(&kvm->mmu_lock);
+}
+
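+/*
+ * Zap the leaf PTEs in a memslot which map pages backed by transparent
+ * huge pages, so that the direct MMU can recreate the mappings with large
+ * PTEs now that dirty logging no longer requires 4K granularity.
+ */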
+void kvm_mmu_zap_collapsible_direct_ptes(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ struct direct_walk_iterator iter;
+ kvm_pfn_t pfn;
+
+ if (!kvm->arch.direct_mmu_enabled)
+ return;
+
+ write_lock(&kvm->mmu_lock);
+
+ direct_walk_iterator_setup_walk(&iter, kvm, memslot->as_id,
+ memslot->base_gfn,
+ memslot->base_gfn + memslot->npages,
+ MMU_READ_LOCK | MMU_LOCK_MAY_RESCHED);
+ while (direct_walk_iterator_next_present_leaf_pte(&iter)) {
+ pfn = spte_to_pfn(iter.old_pte);
+ if (kvm_is_reserved_pfn(pfn) ||
+ !PageTransCompoundMap(pfn_to_page(pfn)))
+ continue;
+ /*
+ * If the compare / exchange succeeds, then we will continue on
+ * to the next pte. If it fails, the next iteration will repeat
+ * the current pte. We'll handle both cases in the same way, so
+ * we don't need to check the result here.
+ */
+ direct_walk_iterator_set_pte(&iter, 0);
+ }
+ direct_walk_iterator_end_traversal(&iter);
+
write_unlock(&kvm->mmu_lock);
}
@@ -7414,18 +7623,46 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
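+
+/*
+ * Set the dirty bit on every present PTE mapping the given memslot, so
+ * that all of its pages will be reported as dirty. Returns true if any
+ * PTE was changed and a TLB flush is required.
+ */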
+static bool slot_set_dirty_direct(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ struct direct_walk_iterator iter;
+ u64 new_pte;
+
+ direct_walk_iterator_setup_walk(&iter, kvm, memslot->as_id,
+ memslot->base_gfn, memslot->base_gfn + memslot->npages,
+ MMU_WRITE_LOCK | MMU_LOCK_MAY_RESCHED);
+ while (direct_walk_iterator_next_present_pte(&iter)) {
+ new_pte = iter.old_pte | shadow_dirty_mask;
+
+ if (!direct_walk_iterator_set_pte(&iter, new_pte))
+ continue;
+ }
+ return direct_walk_iterator_end_traversal(&iter);
+}
+
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- bool flush;
+ bool flush = false;
write_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ if (kvm->arch.direct_mmu_enabled)
+ flush |= slot_set_dirty_direct(kvm, memslot);
+
+ if (!kvm->arch.pure_direct_mmu)
+ flush |= slot_handle_all_level(kvm, memslot, __rmap_set_dirty,
+ false);
write_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
- /* see kvm_mmu_slot_leaf_clear_dirty */
+ /*
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
memslot->npages);
@@ -7221,8 +7221,14 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
- kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+ if (kvm->arch.direct_mmu_enabled)
+ reset_direct_mmu_dirty_logging(kvm, slot,
+ KVM_DIRTY_LOG_MODE_PML, false);
+
+ if (!kvm->arch.pure_direct_mmu) {
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+ }
}
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
@@ -9776,8 +9776,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
*/
if (change == KVM_MR_FLAGS_ONLY &&
(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
kvm_mmu_zap_collapsible_sptes(kvm, new);
+ kvm_mmu_zap_collapsible_direct_ptes(kvm, new);
+ }
/*
* Set up write protection and/or dirty logging for the new slot.
Add functions for handling changes to the dirty state of PTEs and
functions for enabling / resetting dirty logging which use a paging
structure iterator.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h |  10 ++
 arch/x86/kvm/mmu.c              | 259 ++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/vmx.c          |  10 +-
 arch/x86/kvm/x86.c              |   4 +-
 4 files changed, 269 insertions(+), 14 deletions(-)
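
For illustration, a minimal sketch of how a caller might select between the
two modes when dirty logging is enabled on a slot. The helper name
enable_dirty_logging_for_slot() and the pml_enabled parameter are
hypothetical; the real entry points in this patch are
kvm_mmu_slot_remove_write_access() and vmx_slot_enable_log_dirty() above.

	/* Hypothetical caller, not part of this patch. */
	static void enable_dirty_logging_for_slot(struct kvm *kvm,
						  struct kvm_memory_slot *slot,
						  bool pml_enabled)
	{
		/*
		 * With PML, leave PTEs writable but clear their dirty bits so
		 * that hardware logs the GPA of each page the guest dirties.
		 * Without PML, clear the writable bits so that the first
		 * write to each page faults and marks the page dirty.
		 */
		int mode = pml_enabled ? KVM_DIRTY_LOG_MODE_PML :
					 KVM_DIRTY_LOG_MODE_WRPROT;

		if (kvm->arch.direct_mmu_enabled)
			reset_direct_mmu_dirty_logging(kvm, slot, mode, false);
	}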