@@ -305,6 +305,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+int kvm_mmu_map_gpa(struct kvm_vcpu *vcpu, gfn_t *startp, gfn_t end);
+
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
@@ -6119,6 +6119,62 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
+int kvm_mmu_map_gpa(struct kvm_vcpu *vcpu, gfn_t *startp, gfn_t end)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ gfn_t start = *startp;
+ bool allow_private;
+ int ret;
+
+ if (!kvm_gfn_stolen_mask(kvm))
+ return -EOPNOTSUPP;
+
+ ret = mmu_topup_memory_caches(vcpu, false);
+ if (ret)
+ return ret;
+
+ allow_private = kvm_is_private_gfn(kvm, start);
+ start = kvm_gfn_unalias(kvm, start);
+ end = kvm_gfn_unalias(kvm, end);
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ slots = __kvm_memslots(kvm, 0 /* only normal ram. not SMM. */);
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+ struct kvm_memory_slot *memslot = iter.slot;
+ gfn_t s = max(start, memslot->base_gfn);
+ gfn_t e = min(end, memslot->base_gfn + memslot->npages);
+
+ if (WARN_ON_ONCE(s >= e))
+ continue;
+ if (is_tdp_mmu_enabled(kvm)) {
+ ret = kvm_tdp_mmu_map_gpa(vcpu, &s, e, allow_private);
+ if (ret) {
+ start = s;
+ break;
+ }
+ } else {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (ret == -EAGAIN) {
+ if (allow_private)
+ *startp = kvm_gfn_private(kvm, start);
+ else
+ *startp = kvm_gfn_shared(kvm, start);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_map_gpa);
+
static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -533,6 +533,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
WARN_ON(sp->gfn != gfn);
}
+ /*
+ * SPTE_PRIVATE_PROHIBIT is only changed by map_gpa that obtains
+ * write lock of mmu_lock.
+ */
+ WARN_ON(shared &&
+ (is_private_prohibit_spte(old_spte) !=
+ is_private_prohibit_spte(new_spte)));
static_call(kvm_x86_handle_changed_private_spte)(
kvm, gfn, level,
old_pfn, was_present, was_leaf,
@@ -1751,6 +1758,207 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
return spte_set;
}
+typedef void (*update_spte_t)(
+ struct kvm *kvm, struct tdp_iter *iter, bool allow_private);
+
+static int kvm_tdp_mmu_update_range(struct kvm_vcpu *vcpu, bool is_private,
+ gfn_t start, gfn_t end, gfn_t *nextp,
+ update_spte_t fn, bool allow_private)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+ tdp_mmu_for_each_pte(iter, vcpu->arch.mmu, is_private, start, end) {
+ if (iter.level == PG_LEVEL_4K) {
+ fn(kvm, &iter, allow_private);
+ continue;
+ }
+
+ /*
+ * Which GPA is allowed, private or shared, is recorded in the
+ * granular of 4K in private leaf spte as SPTE_PRIVATE_PROHIBIT.
+ * Break large page into 4K.
+ */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ is_large_pte(iter.old_spte)) {
+ /*
+ * TODO: large page support.
+ * Doesn't support large page for TDX now
+ */
+ WARN_ON_ONCE(true);
+ tdp_mmu_set_spte(kvm, &iter, shadow_init_value);
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte)) {
+ /*
+ * Guarantee that alloc_tdp_mmu_page() succees which
+ * assumes page allocation from cache always successes.
+ */
+ if (vcpu->arch.mmu_page_header_cache.nobjs == 0 ||
+ vcpu->arch.mmu_shadow_page_cache.nobjs == 0 ||
+ vcpu->arch.mmu_private_sp_cache.nobjs == 0) {
+ ret = -EAGAIN;
+ break;
+ }
+ /*
+ * write lock of mmu_lock is held. No other thread
+ * freezes SPTE.
+ */
+ if (!tdp_mmu_populate_nonleaf(
+ vcpu, &iter, is_private, false)) {
+ /* As write lock is held, this case sholdn't happen. */
+ WARN_ON_ONCE(true);
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ if (ret == -EAGAIN)
+ *nextp = iter.next_last_level_gfn;
+
+ return ret;
+}
+
+static void kvm_tdp_mmu_update_shared_spte(
+ struct kvm *kvm, struct tdp_iter *iter, bool allow_private)
+{
+ u64 new_spte;
+
+ WARN_ON(kvm_is_private_gfn(kvm, iter->gfn));
+ if (allow_private) {
+ /* Zap SPTE and clear PRIVATE_PROHIBIT */
+ new_spte = shadow_init_value;
+ if (new_spte != iter->old_spte)
+ tdp_mmu_set_spte(kvm, iter, new_spte);
+ } else {
+ new_spte = iter->old_spte | SPTE_PRIVATE_PROHIBIT;
+ /* No side effect is needed */
+ if (new_spte != iter->old_spte)
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ }
+}
+
+static void kvm_tdp_mmu_update_private_spte(
+ struct kvm *kvm, struct tdp_iter *iter, bool allow_private)
+{
+ u64 new_spte;
+
+ WARN_ON(!kvm_is_private_gfn(kvm, iter->gfn));
+ if (allow_private) {
+ new_spte = iter->old_spte & ~SPTE_PRIVATE_PROHIBIT;
+ /* No side effect is needed */
+ if (new_spte != iter->old_spte)
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ } else {
+ if (is_shadow_present_pte(iter->old_spte)) {
+ /* Zap SPTE */
+ new_spte = shadow_init_value | SPTE_PRIVATE_PROHIBIT;
+ tdp_mmu_set_spte(kvm, iter, new_spte);
+ } else {
+ new_spte = iter->old_spte | SPTE_PRIVATE_PROHIBIT;
+ /* No side effect is needed */
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ }
+ }
+}
+
+/*
+ * Whether GPA is allowed to map private or shared is recorded in both private
+ * and shared leaf spte entry as SPTE_PRIVATE_PROHIBIT bit. They must match.
+ * private leaf spte entry
+ * - present: private mapping is allowed. (already mapped)
+ * - non-present: private mapping is allowed.
+ * - present | PRIVATE_PROHIBIT: invalid state.
+ * - non-present | SPTE_PRIVATE_PROHIBIT: shared mapping is allowed.
+ * may or may not be mapped as shared.
+ * shared leaf spte entry
+ * - present: invalid state
+ * - non-present: private mapping is allowed.
+ * - present | PRIVATE_PROHIBIT: shared mapping is allowed (already mapped)
+ * - non-present | PRIVATE_PROHIBIT: shared mapping is allowed.
+ *
+ * state change of private spte:
+ * map_gpa(private):
+ * private EPT entry: clear PRIVATE_PROHIBIT
+ * present: nop
+ * non-present: nop
+ * non-present | PRIVATE_PROHIBIT -> non-present
+ * share EPT entry: zap and clear PRIVATE_PROHIBIT
+ * any -> non-present
+ * map_gpa(shared):
+ * private EPT entry: zap and set PRIVATE_PROHIBIT
+ * present -> non-present | PRIVATE_PROHIBIT
+ * non-present -> non-present | PRIVATE_PROHIBIT
+ * non-present | PRIVATE_PROHIBIT: nop
+ * shared EPT entry: set PRIVATE_PROHIBIT
+ * present | PRIVATE_PROHIBIT: nop
+ * non-present -> non-present | PRIVATE_PROHIBIT
+ * non-present | PRIVATE_PROHIBIT: nop
+ * map(private GPA):
+ * private EPT entry: try to populate
+ * present: nop
+ * non-present -> present
+ * non-present | PRIVATE_PROHIBIT: nop. looping on EPT violation
+ * shared EPT entry: nop
+ * map(shared GPA):
+ * private EPT entry: nop
+ * shared EPT entry: populate
+ * present | PRIVATE_PROHIBIT: nop
+ * non-present | PRIVATE_PROHIBIT -> present | PRIVATE_PROHIBIT
+ * non-present: nop. looping on EPT violation
+ * zap(private GPA):
+ * private EPT entry: zap and keep PRIVATE_PROHIBIT
+ * present | PRIVATE_PROHIBIT -> non-present | PRIVATE_PROHIBIT
+ * non-present: nop as is_shadow_prsent_pte() is checked
+ * non-present | PRIVATE_PROHIBIT: nop by is_shadow_present_pte()
+ * shared EPT entry: nop
+ * zap(shared GPA):
+ * private EPT entry: nop
+ * shared EPT entry: zap and keep PRIVATE_PROHIBIT
+ * present | PRIVATE_PROHIBIT -> non-present | PRIVATE_PROHIBIT
+ * non-present | PRIVATE_PROHIBIT: nop
+ * non-present: nop.
+ */
+int kvm_tdp_mmu_map_gpa(struct kvm_vcpu *vcpu,
+ gfn_t *startp, gfn_t end, bool allow_private)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t start = *startp;
+ gfn_t next;
+ int ret = 0;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ WARN_ON(start & kvm_gfn_stolen_mask(kvm));
+ WARN_ON(end & kvm_gfn_stolen_mask(kvm));
+
+ if (!VALID_PAGE(mmu->root_hpa) || !VALID_PAGE(mmu->private_root_hpa))
+ return -EINVAL;
+
+ next = end;
+ ret = kvm_tdp_mmu_update_range(
+ vcpu, false, kvm_gfn_shared(kvm, start), kvm_gfn_shared(kvm, end),
+ &next, kvm_tdp_mmu_update_shared_spte, allow_private);
+ if (ret) {
+ kvm_flush_remote_tlbs_with_address(kvm, start, next - start);
+ return ret;
+ }
+
+ ret = kvm_tdp_mmu_update_range(vcpu, true, start, end, &next,
+ kvm_tdp_mmu_update_private_spte, allow_private);
+ if (ret == -EAGAIN) {
+ *startp = next;
+ end = *startp;
+ }
+ kvm_flush_remote_tlbs_with_address(kvm, start, end - start);
+ return ret;
+}
+
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
@@ -74,6 +74,9 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+int kvm_tdp_mmu_map_gpa(struct kvm_vcpu *vcpu,
+ gfn_t *startp, gfn_t end, bool is_private);
+
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
rcu_read_lock();