[v2,02/20] kvm: x86/mmu: Introduce tdp_iter

Message ID 20201014182700.2888246-3-bgardon@google.com
State New, archived
Series Introduce the TDP MMU

Commit Message

Ben Gardon Oct. 14, 2020, 6:26 p.m. UTC
The TDP iterator implements a pre-order traversal of a TDP paging
structure. This iterator will be used in future patches to create
an efficient implementation of the KVM MMU for the TDP case.

Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
machine. This series introduced no new failures.

This series can be viewed in Gerrit at:
	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/Makefile           |   3 +-
 arch/x86/kvm/mmu/mmu.c          |  66 ------------
 arch/x86/kvm/mmu/mmu_internal.h |  66 ++++++++++++
 arch/x86/kvm/mmu/tdp_iter.c     | 176 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_iter.h     |  56 ++++++++++
 5 files changed, 300 insertions(+), 67 deletions(-)
 create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
 create mode 100644 arch/x86/kvm/mmu/tdp_iter.h

Comments

Yu Zhang Oct. 21, 2020, 2:58 p.m. UTC | #1
On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> The TDP iterator implements a pre-order traversal of a TDP paging
> structure. This iterator will be used in future patches to create
> an efficient implementation of the KVM MMU for the TDP case.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
> 	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/Makefile           |   3 +-
>  arch/x86/kvm/mmu/mmu.c          |  66 ------------
>  arch/x86/kvm/mmu/mmu_internal.h |  66 ++++++++++++
>  arch/x86/kvm/mmu/tdp_iter.c     | 176 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_iter.h     |  56 ++++++++++
>  5 files changed, 300 insertions(+), 67 deletions(-)
>  create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
>  create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 7f86a14aed0e9..4525c1151bf99 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>  
>  kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
>  			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> -			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
> +			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
> +			   mmu/tdp_iter.o
>  
>  kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
>  			   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6c9db349600c8..6d82784ed5679 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -121,28 +121,6 @@ module_param(dbg, bool, 0644);
>  
>  #define PTE_PREFETCH_NUM		8
>  
> -#define PT_FIRST_AVAIL_BITS_SHIFT 10
> -#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> -
> -/*
> - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
> - * Access Tracking SPTEs.
> - */
> -#define SPTE_SPECIAL_MASK (3ULL << 52)
> -#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> -#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> -#define SPTE_MMIO_MASK (3ULL << 52)
> -
> -#define PT64_LEVEL_BITS 9
> -
> -#define PT64_LEVEL_SHIFT(level) \
> -		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> -
> -#define PT64_INDEX(address, level)\
> -	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> -
> -
>  #define PT32_LEVEL_BITS 10
>  
>  #define PT32_LEVEL_SHIFT(level) \
> @@ -155,19 +133,6 @@ module_param(dbg, bool, 0644);
>  #define PT32_INDEX(address, level)\
>  	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
>  
> -
> -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> -#else
> -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> -#endif
> -#define PT64_LVL_ADDR_MASK(level) \
> -	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> -						* PT64_LEVEL_BITS))) - 1))
> -#define PT64_LVL_OFFSET_MASK(level) \
> -	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> -						* PT64_LEVEL_BITS))) - 1))
> -
>  #define PT32_BASE_ADDR_MASK PAGE_MASK
>  #define PT32_DIR_BASE_ADDR_MASK \
>  	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
> @@ -192,8 +157,6 @@ module_param(dbg, bool, 0644);
>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>  
> -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> -
>  /* make pte_list_desc fit well in cache line */
>  #define PTE_LIST_EXT 3
>  
> @@ -349,11 +312,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> -static bool is_mmio_spte(u64 spte)
> -{
> -	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> -}
> -
>  static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
>  {
>  	return sp->role.ad_disabled;
> @@ -626,35 +584,11 @@ static int is_nx(struct kvm_vcpu *vcpu)
>  	return vcpu->arch.efer & EFER_NX;
>  }
>  
> -static int is_shadow_present_pte(u64 pte)
> -{
> -	return (pte != 0) && !is_mmio_spte(pte);
> -}
> -
> -static int is_large_pte(u64 pte)
> -{
> -	return pte & PT_PAGE_SIZE_MASK;
> -}
> -
> -static int is_last_spte(u64 pte, int level)
> -{
> -	if (level == PG_LEVEL_4K)
> -		return 1;
> -	if (is_large_pte(pte))
> -		return 1;
> -	return 0;
> -}
> -
>  static bool is_executable_pte(u64 spte)
>  {
>  	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>  }
>  
> -static kvm_pfn_t spte_to_pfn(u64 pte)
> -{
> -	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> -}
> -
>  static gfn_t pse36_gfn_delta(u32 gpte)
>  {
>  	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 3acf3b8eb469d..74ccbf001a42e 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -2,6 +2,8 @@
>  #ifndef __KVM_X86_MMU_INTERNAL_H
>  #define __KVM_X86_MMU_INTERNAL_H
>  
> +#include "mmu.h"
> +
>  #include <linux/types.h>
>  
>  #include <asm/kvm_host.h>
> @@ -60,4 +62,68 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
>  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>  				    struct kvm_memory_slot *slot, u64 gfn);
>  
> +#define PT64_LEVEL_BITS 9
> +
> +#define PT64_LEVEL_SHIFT(level) \
> +		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> +
> +#define PT64_INDEX(address, level)\
> +	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +
> +#define PT_FIRST_AVAIL_BITS_SHIFT 10
> +#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> +
> +/*
> + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
> + * Access Tracking SPTEs.
> + */
> +#define SPTE_SPECIAL_MASK (3ULL << 52)
> +#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> +#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> +#define SPTE_MMIO_MASK (3ULL << 52)
> +
> +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> +#else
> +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> +#endif
> +#define PT64_LVL_ADDR_MASK(level) \
> +	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> +						* PT64_LEVEL_BITS))) - 1))
> +#define PT64_LVL_OFFSET_MASK(level) \
> +	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> +						* PT64_LEVEL_BITS))) - 1))
> +
> +/* Functions for interpreting SPTEs */
> +static inline bool is_mmio_spte(u64 spte)
> +{
> +	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> +}
> +
> +static inline int is_shadow_present_pte(u64 pte)
> +{
> +	return (pte != 0) && !is_mmio_spte(pte);
> +}
> +
> +static inline int is_large_pte(u64 pte)
> +{
> +	return pte & PT_PAGE_SIZE_MASK;
> +}
> +
> +static inline int is_last_spte(u64 pte, int level)
> +{
> +	if (level == PG_LEVEL_4K)
> +		return 1;
> +	if (is_large_pte(pte))
> +		return 1;
> +	return 0;
> +}
> +
> +static inline kvm_pfn_t spte_to_pfn(u64 pte)
> +{
> +	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> +}
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> new file mode 100644
> index 0000000000000..b07e9f0c5d4aa
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -0,0 +1,176 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "mmu_internal.h"
> +#include "tdp_iter.h"
> +
> +/*
> + * Recalculates the pointer to the SPTE for the current GFN and level and
> + * rereads the SPTE.
> + */
> +static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
> +{
> +	iter->sptep = iter->pt_path[iter->level - 1] +
> +		SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
> +	iter->old_spte = READ_ONCE(*iter->sptep);
> +}
> +
> +static gfn_t round_gfn_for_level(gfn_t gfn, int level)
> +{
> +	return gfn - (gfn % KVM_PAGES_PER_HPAGE(level));

Instead of the modulo operator, how about we use:
	return gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 
here? :)
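
(Both forms round gfn down to a KVM_PAGES_PER_HPAGE(level) boundary, and they
agree whenever that value is a power of two, as it is here. A minimal
standalone sketch of the equivalence — pages_per_hpage below is an invented
stand-in for the real macro, not KVM code:)

        #include <assert.h>
        #include <stdint.h>

        /* Stand-in for KVM_PAGES_PER_HPAGE(level): 2^(9 * (level - 1)) GFNs. */
        static uint64_t pages_per_hpage(int level)
        {
                return 1ULL << (9 * (level - 1));
        }

        int main(void)
        {
                uint64_t gfn = 0x12345;
                int level = 2;

                /* Modulo form, as in the patch. */
                uint64_t mod_form  = gfn - (gfn % pages_per_hpage(level));
                /* Mask form, as suggested. */
                uint64_t mask_form = gfn & ~(pages_per_hpage(level) - 1);

                assert(mod_form == mask_form); /* both yield 0x12200 */
                return 0;
        }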

> +}
> +
> +/*
> + * Sets a TDP iterator to walk a pre-order traversal of the paging structure
> + * rooted at root_pt, starting with the walk to translate goal_gfn.
> + */
> +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> +		    int min_level, gfn_t goal_gfn)
> +{
> +	WARN_ON(root_level < 1);
> +	WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
> +
> +	iter->goal_gfn = goal_gfn;
> +	iter->root_level = root_level;
> +	iter->min_level = min_level;
> +	iter->level = root_level;
> +	iter->pt_path[iter->level - 1] = root_pt;
> +
> +	iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
> +	tdp_iter_refresh_sptep(iter);
> +
> +	iter->valid = true;
> +}
> +
> +/*
> + * Given an SPTE and its level, returns a pointer containing the host virtual
> + * address of the child page table referenced by the SPTE. Returns null if
> + * there is no such entry.
> + */
> +u64 *spte_to_child_pt(u64 spte, int level)
> +{
> +	/*
> +	 * There's no child entry if this entry isn't present or is a
> +	 * last-level entry.
> +	 */
> +	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
> +		return NULL;
> +
> +	return __va(spte_to_pfn(spte) << PAGE_SHIFT);
> +}
> +
> +/*
> + * Steps down one level in the paging structure towards the goal GFN. Returns
> + * true if the iterator was able to step down a level, false otherwise.
> + */
> +static bool try_step_down(struct tdp_iter *iter)
> +{
> +	u64 *child_pt;
> +
> +	if (iter->level == iter->min_level)
> +		return false;
> +
> +	/*
> +	 * Reread the SPTE before stepping down to avoid traversing into page
> +	 * tables that are no longer linked from this entry.
> +	 */
> +	iter->old_spte = READ_ONCE(*iter->sptep);
> +
> +	child_pt = spte_to_child_pt(iter->old_spte, iter->level);
> +	if (!child_pt)
> +		return false;
> +
> +	iter->level--;
> +	iter->pt_path[iter->level - 1] = child_pt;
> +	iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
> +	tdp_iter_refresh_sptep(iter);
> +
> +	return true;
> +}
> +
> +/*
> + * Steps to the next entry in the current page table, at the current page table
> + * level. The next entry could point to a page backing guest memory or another
> + * page table, or it could be non-present. Returns true if the iterator was
> + * able to step to the next entry in the page table, false if the iterator was
> + * already at the end of the current page table.
> + */
> +static bool try_step_side(struct tdp_iter *iter)
> +{
> +	/*
> +	 * Check if the iterator is already at the end of the current page
> +	 * table.
> +	 */
> +	if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
> +	      KVM_PAGES_PER_HPAGE(iter->level + 1)))
> +		return false;
> +

And maybe: 
	if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
	    (PT64_ENT_PER_PAGE - 1))
here?
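
(The two checks should be equivalent for the GFNs the iterator actually
visits: the modulo form asks whether stepping one more entry would cross into
the next parent-level region, the index form asks whether the current entry is
the last slot in its table. Both rely on iter->gfn staying aligned to
KVM_PAGES_PER_HPAGE(iter->level), which the iterator maintains. A rough
standalone sketch, with shadow_pt_index and the other helpers as invented
stand-ins for the kernel macros:)

        #include <assert.h>
        #include <stdint.h>

        #define LEVEL_BITS   9
        #define ENT_PER_PAGE (1 << LEVEL_BITS)

        static uint64_t pages_per_hpage(int level)
        {
                return 1ULL << (LEVEL_BITS * (level - 1));
        }

        /* Index of gfn's entry within its page table at the given level. */
        static int shadow_pt_index(uint64_t gfn, int level)
        {
                return (gfn >> (LEVEL_BITS * (level - 1))) & (ENT_PER_PAGE - 1);
        }

        int main(void)
        {
                /* Only level-aligned GFNs, as the iterator guarantees. */
                for (int level = 1; level <= 4; level++) {
                        for (uint64_t gfn = 0; gfn < (1ULL << 12);
                             gfn += pages_per_hpage(level)) {
                                int mod_end = !((gfn + pages_per_hpage(level)) %
                                                pages_per_hpage(level + 1));
                                int idx_end = shadow_pt_index(gfn, level) ==
                                              ENT_PER_PAGE - 1;
                                assert(mod_end == idx_end);
                        }
                }
                return 0;
        }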

> +	iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
> +	iter->goal_gfn = iter->gfn;
> +	iter->sptep++;
> +	iter->old_spte = READ_ONCE(*iter->sptep);
> +
> +	return true;
> +}
> +
> +/*
> + * Tries to traverse back up a level in the paging structure so that the walk
> + * can continue from the next entry in the parent page table. Returns true on a
> + * successful step up, false if already in the root page.
> + */
> +static bool try_step_up(struct tdp_iter *iter)
> +{
> +	if (iter->level == iter->root_level)
> +		return false;
> +
> +	iter->level++;
> +	iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
> +	tdp_iter_refresh_sptep(iter);
> +
> +	return true;
> +}
> +
> +/*
> + * Step to the next SPTE in a pre-order traversal of the paging structure.
> + * To get to the next SPTE, the iterator either steps down towards the goal
> + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
> + * higher GFN.
> + *
> + * The basic algorithm is as follows:
> + * 1. If the current SPTE is a non-last-level SPTE, step down into the page
> + *    table it points to.
> + * 2. If the iterator cannot step down, it will try to step to the next SPTE
> + *    in the current page of the paging structure.
> + * 3. If the iterator cannot step to the next entry in the current page, it will
> + *    try to step up to the parent paging structure page. In this case, that
> + *    SPTE will have already been visited, and so the iterator must also step
> + *    to the side again.
> + */
> +void tdp_iter_next(struct tdp_iter *iter)
> +{
> +	if (try_step_down(iter))
> +		return;
> +
> +	do {
> +		if (try_step_side(iter))
> +			return;
> +	} while (try_step_up(iter));
> +	iter->valid = false;
> +}
> +
> +/*
> + * Restart the walk over the paging structure from the root, starting from the
> + * highest gfn the iterator had previously reached. Assumes that the entire
> + * paging structure, except the root page, may have been completely torn down
> + * and rebuilt.
> + */
> +void tdp_iter_refresh_walk(struct tdp_iter *iter)
> +{
> +	gfn_t goal_gfn = iter->goal_gfn;
> +
> +	if (iter->gfn > goal_gfn)
> +		goal_gfn = iter->gfn;
> +
> +	tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
> +		       iter->root_level, iter->min_level, goal_gfn);
> +}
> +
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> new file mode 100644
> index 0000000000000..d629a53e1b73f
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -0,0 +1,56 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#ifndef __KVM_X86_MMU_TDP_ITER_H
> +#define __KVM_X86_MMU_TDP_ITER_H
> +
> +#include <linux/kvm_host.h>
> +
> +#include "mmu.h"
> +
> +/*
> + * A TDP iterator performs a pre-order walk over a TDP paging structure.
> + */
> +struct tdp_iter {
> +	/*
> +	 * The iterator will traverse the paging structure towards the mapping
> +	 * for this GFN.
> +	 */
> +	gfn_t goal_gfn;
> +	/* Pointers to the page tables traversed to reach the current SPTE */
> +	u64 *pt_path[PT64_ROOT_MAX_LEVEL];
> +	/* A pointer to the current SPTE */
> +	u64 *sptep;
> +	/* The lowest GFN mapped by the current SPTE */
> +	gfn_t gfn;
> +	/* The level of the root page given to the iterator */
> +	int root_level;
> +	/* The lowest level the iterator should traverse to */
> +	int min_level;
> +	/* The iterator's current level within the paging structure */
> +	int level;
> +	/* A snapshot of the value at sptep */
> +	u64 old_spte;
> +	/*
> +	 * Whether the iterator has a valid state. This will be false if the
> +	 * iterator walks off the end of the paging structure.
> +	 */
> +	bool valid;
> +};
> +
> +/*
> + * Iterates over every SPTE mapping the GFN range [start, end) in a
> + * pre-order traversal.
> + */
> +#define for_each_tdp_pte(iter, root, root_level, start, end) \
> +	for (tdp_iter_start(&iter, root, root_level, PG_LEVEL_4K, start); \
> +	     iter.valid && iter.gfn < end;		     \
> +	     tdp_iter_next(&iter))
> +
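
(For illustration, a hypothetical caller of this macro might look like the
sketch below; tdp_root, root_level, and handle_spte are invented names, not
part of the patch:)

        struct tdp_iter iter;

        for_each_tdp_pte(iter, tdp_root, root_level, start_gfn, end_gfn) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;
                /*
                 * iter.sptep points at the current SPTE; iter.gfn and
                 * iter.level identify the mapping it covers.
                 */
                handle_spte(iter.sptep, iter.gfn, iter.level);
        }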
> +u64 *spte_to_child_pt(u64 pte, int level);
> +
> +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> +		    int min_level, gfn_t goal_gfn);
> +void tdp_iter_next(struct tdp_iter *iter);
> +void tdp_iter_refresh_walk(struct tdp_iter *iter);
> +
> +#endif /* __KVM_X86_MMU_TDP_ITER_H */
> -- 
> 2.28.0.1011.ga647a8990f-goog
> 

I am just suggesting to replace the modulo operations with bit-shifts... 
Also, it's very exciting to see such patch set. Thanks!

B.R.
Yu
Ben Gardon Oct. 21, 2020, 6:08 p.m. UTC | #2
On Wed, Oct 21, 2020 at 7:59 AM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> > The TDP iterator implements a pre-order traversal of a TDP paging
> > structure. This iterator will be used in future patches to create
> > an efficient implementation of the KVM MMU for the TDP case.
> >
> > [... full patch quote trimmed; see #1 above ...]
> >
> > +static gfn_t round_gfn_for_level(gfn_t gfn, int level)
> > +{
> > +     return gfn - (gfn % KVM_PAGES_PER_HPAGE(level));
>
> Instead of the modulo operator, how about we use:
>         return gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
> here? :)
>
> > [...]
> >
> > +     if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
> > +           KVM_PAGES_PER_HPAGE(iter->level + 1)))
> > +             return false;
>
> And maybe:
>         if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
>             (PT64_ENT_PER_PAGE - 1))
> here?
>
> > [...]
>
> I am just suggesting to replace the modulo operations with bit-shifts...
> Also, it's very exciting to see such patch set. Thanks!

Those are great suggestions, thank you. I wonder if the compiler would
have been smart enough to convert those to bit shifts. (I kind of
doubt it)
I'm glad you're excited about this patch set, thank you for taking a
look and helping to review it!
It doesn't sound like I'll be sending out a v3 of this set, but I'll
be sure to amend the code with your suggestions if they don't make it
into the version Paolo queues.
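
(For what it's worth, one way to check would be to feed a standalone sketch to
a compiler explorer: since KVM_PAGES_PER_HPAGE() expands to a 1UL << shift
expression, the optimizer can sometimes prove the divisor is a power of two
and emit a mask anyway. The helper below is an invented stand-in with the same
shape, not KVM code; comparing the assembly for round_mod and round_mask at
-O2 would settle the question either way:)

        #include <stdint.h>

        /* Stand-in with the same shape as KVM_PAGES_PER_HPAGE(level). */
        static inline uint64_t pages_per_hpage(int level)
        {
                return 1ULL << (9 * (level - 1));
        }

        /* Modulo form, as in the patch. */
        uint64_t round_mod(uint64_t gfn, int level)
        {
                return gfn - (gfn % pages_per_hpage(level));
        }

        /* Mask form, as suggested in review. */
        uint64_t round_mask(uint64_t gfn, int level)
        {
                return gfn & ~(pages_per_hpage(level) - 1);
        }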

Yu Zhang Oct. 22, 2020, 2:34 a.m. UTC | #3
On Wed, Oct 21, 2020 at 11:08:52AM -0700, Ben Gardon wrote:
> On Wed, Oct 21, 2020 at 7:59 AM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> > > [... full patch quote and inline suggestions trimmed; see #1 and #2 above ...]
> >
> > I am just suggesting to replace the modulo operations with bit-shifts...
> > Also, it's very exciting to see such patch set. Thanks!
> 
> Those are great suggestions, thank you. I wonder if the compiler would
> have been smart enough to convert those to bit shifts. (I kind of
> doubt it)
> I'm glad you're excited about this patch set, thank you for taking a
> look and helping to review it!
> It doesn't sound like I'll be sending out a v3 of this set, but I'll
> be sure to amend the code with your suggestions if they don't make it
> into the version Paolo queues.

Great. And looking forward to the performance improvement series! :)

Yu


Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 7f86a14aed0e9..4525c1151bf99 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,7 +15,8 @@  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
+			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
+			   mmu/tdp_iter.o
 
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 			   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6c9db349600c8..6d82784ed5679 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -121,28 +121,6 @@  module_param(dbg, bool, 0644);
 
 #define PTE_PREFETCH_NUM		8
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
-
-/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
- */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
-		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_INDEX(address, level)\
-	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
 #define PT32_LEVEL_BITS 10
 
 #define PT32_LEVEL_SHIFT(level) \
@@ -155,19 +133,6 @@  module_param(dbg, bool, 0644);
 #define PT32_INDEX(address, level)\
 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
-
-#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
-#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
-#else
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#endif
-#define PT64_LVL_ADDR_MASK(level) \
-	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
-						* PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
-	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
-						* PT64_LEVEL_BITS))) - 1))
-
 #define PT32_BASE_ADDR_MASK PAGE_MASK
 #define PT32_DIR_BASE_ADDR_MASK \
 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
@@ -192,8 +157,6 @@  module_param(dbg, bool, 0644);
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
 /* make pte_list_desc fit well in cache line */
 #define PTE_LIST_EXT 3
 
@@ -349,11 +312,6 @@  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static bool is_mmio_spte(u64 spte)
-{
-	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
-}
-
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
 	return sp->role.ad_disabled;
@@ -626,35 +584,11 @@  static int is_nx(struct kvm_vcpu *vcpu)
 	return vcpu->arch.efer & EFER_NX;
 }
 
-static int is_shadow_present_pte(u64 pte)
-{
-	return (pte != 0) && !is_mmio_spte(pte);
-}
-
-static int is_large_pte(u64 pte)
-{
-	return pte & PT_PAGE_SIZE_MASK;
-}
-
-static int is_last_spte(u64 pte, int level)
-{
-	if (level == PG_LEVEL_4K)
-		return 1;
-	if (is_large_pte(pte))
-		return 1;
-	return 0;
-}
-
 static bool is_executable_pte(u64 spte)
 {
 	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 }
 
-static kvm_pfn_t spte_to_pfn(u64 pte)
-{
-	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
 static gfn_t pse36_gfn_delta(u32 gpte)
 {
 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 3acf3b8eb469d..74ccbf001a42e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -2,6 +2,8 @@ 
 #ifndef __KVM_X86_MMU_INTERNAL_H
 #define __KVM_X86_MMU_INTERNAL_H
 
+#include "mmu.h"
+
 #include <linux/types.h>
 
 #include <asm/kvm_host.h>
@@ -60,4 +62,68 @@  void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn);
 
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_INDEX(address, level)\
+	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
+#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs.
+ */
+#define SPTE_SPECIAL_MASK (3ULL << 52)
+#define SPTE_AD_ENABLED_MASK (0ULL << 52)
+#define SPTE_AD_DISABLED_MASK (1ULL << 52)
+#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
+#define SPTE_MMIO_MASK (3ULL << 52)
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
+#define PT64_LVL_ADDR_MASK(level) \
+	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+						* PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+						* PT64_LEVEL_BITS))) - 1))
+
+/* Functions for interpreting SPTEs */
+static inline bool is_mmio_spte(u64 spte)
+{
+	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+}
+
+static inline int is_shadow_present_pte(u64 pte)
+{
+	return (pte != 0) && !is_mmio_spte(pte);
+}
+
+static inline int is_large_pte(u64 pte)
+{
+	return pte & PT_PAGE_SIZE_MASK;
+}
+
+static inline int is_last_spte(u64 pte, int level)
+{
+	if (level == PG_LEVEL_4K)
+		return 1;
+	if (is_large_pte(pte))
+		return 1;
+	return 0;
+}
+
+static inline kvm_pfn_t spte_to_pfn(u64 pte)
+{
+	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
new file mode 100644
index 0000000000000..b07e9f0c5d4aa
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -0,0 +1,176 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include "mmu_internal.h"
+#include "tdp_iter.h"
+
+/*
+ * Recalculates the pointer to the SPTE for the current GFN and level and
+ * rereads the SPTE.
+ */
+static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
+{
+	iter->sptep = iter->pt_path[iter->level - 1] +
+		SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+	iter->old_spte = READ_ONCE(*iter->sptep);
+}
+
+static gfn_t round_gfn_for_level(gfn_t gfn, int level)
+{
+	return gfn - (gfn % KVM_PAGES_PER_HPAGE(level));
+}
+
+/*
+ * Sets a TDP iterator to walk a pre-order traversal of the paging structure
+ * rooted at root_pt, starting with the walk to translate goal_gfn.
+ */
+void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+		    int min_level, gfn_t goal_gfn)
+{
+	WARN_ON(root_level < 1);
+	WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
+
+	iter->goal_gfn = goal_gfn;
+	iter->root_level = root_level;
+	iter->min_level = min_level;
+	iter->level = root_level;
+	iter->pt_path[iter->level - 1] = root_pt;
+
+	iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+	tdp_iter_refresh_sptep(iter);
+
+	iter->valid = true;
+}
+
+/*
+ * Given an SPTE and its level, returns a pointer containing the host virtual
+ * address of the child page table referenced by the SPTE. Returns null if
+ * there is no such entry.
+ */
+u64 *spte_to_child_pt(u64 spte, int level)
+{
+	/*
+	 * There's no child entry if this entry isn't present or is a
+	 * last-level entry.
+	 */
+	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
+		return NULL;
+
+	return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+}
+
+/*
+ * Steps down one level in the paging structure towards the goal GFN. Returns
+ * true if the iterator was able to step down a level, false otherwise.
+ */
+static bool try_step_down(struct tdp_iter *iter)
+{
+	u64 *child_pt;
+
+	if (iter->level == iter->min_level)
+		return false;
+
+	/*
+	 * Reread the SPTE before stepping down to avoid traversing into page
+	 * tables that are no longer linked from this entry.
+	 */
+	iter->old_spte = READ_ONCE(*iter->sptep);
+
+	child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+	if (!child_pt)
+		return false;
+
+	iter->level--;
+	iter->pt_path[iter->level - 1] = child_pt;
+	iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+	tdp_iter_refresh_sptep(iter);
+
+	return true;
+}
+
+/*
+ * Steps to the next entry in the current page table, at the current page table
+ * level. The next entry could point to a page backing guest memory or another
+ * page table, or it could be non-present. Returns true if the iterator was
+ * able to step to the next entry in the page table, false if the iterator was
+ * already at the end of the current page table.
+ */
+static bool try_step_side(struct tdp_iter *iter)
+{
+	/*
+	 * Check if the iterator is already at the end of the current page
+	 * table.
+	 */
+	if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
+	      KVM_PAGES_PER_HPAGE(iter->level + 1)))
+		return false;
+
+	iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+	iter->goal_gfn = iter->gfn;
+	iter->sptep++;
+	iter->old_spte = READ_ONCE(*iter->sptep);
+
+	return true;
+}
+
+/*
+ * Tries to traverse back up a level in the paging structure so that the walk
+ * can continue from the next entry in the parent page table. Returns true on a
+ * successful step up, false if already in the root page.
+ */
+static bool try_step_up(struct tdp_iter *iter)
+{
+	if (iter->level == iter->root_level)
+		return false;
+
+	iter->level++;
+	iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
+	tdp_iter_refresh_sptep(iter);
+
+	return true;
+}
+
+/*
+ * Step to the next SPTE in a pre-order traversal of the paging structure.
+ * To get to the next SPTE, the iterator either steps down towards the goal
+ * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
+ * higher GFN.
+ *
+ * The basic algorithm is as follows:
+ * 1. If the current SPTE is a non-last-level SPTE, step down into the page
+ *    table it points to.
+ * 2. If the iterator cannot step down, it will try to step to the next SPTE
+ *    in the current page of the paging structure.
+ * 3. If the iterator cannot step to the next entry in the current page, it will
+ *    try to step up to the parent paging structure page. In this case, that
+ *    SPTE will have already been visited, and so the iterator must also step
+ *    to the side again.
+ */
+void tdp_iter_next(struct tdp_iter *iter)
+{
+	if (try_step_down(iter))
+		return;
+
+	do {
+		if (try_step_side(iter))
+			return;
+	} while (try_step_up(iter));
+	iter->valid = false;
+}
+
+/*
+ * Restart the walk over the paging structure from the root, starting from the
+ * highest gfn the iterator had previously reached. Assumes that the entire
+ * paging structure, except the root page, may have been completely torn down
+ * and rebuilt.
+ */
+void tdp_iter_refresh_walk(struct tdp_iter *iter)
+{
+	gfn_t goal_gfn = iter->goal_gfn;
+
+	if (iter->gfn > goal_gfn)
+		goal_gfn = iter->gfn;
+
+	tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+		       iter->root_level, iter->min_level, goal_gfn);
+}
+
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
new file mode 100644
index 0000000000000..d629a53e1b73f
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -0,0 +1,56 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __KVM_X86_MMU_TDP_ITER_H
+#define __KVM_X86_MMU_TDP_ITER_H
+
+#include <linux/kvm_host.h>
+
+#include "mmu.h"
+
+/*
+ * A TDP iterator performs a pre-order walk over a TDP paging structure.
+ */
+struct tdp_iter {
+	/*
+	 * The iterator will traverse the paging structure towards the mapping
+	 * for this GFN.
+	 */
+	gfn_t goal_gfn;
+	/* Pointers to the page tables traversed to reach the current SPTE */
+	u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+	/* A pointer to the current SPTE */
+	u64 *sptep;
+	/* The lowest GFN mapped by the current SPTE */
+	gfn_t gfn;
+	/* The level of the root page given to the iterator */
+	int root_level;
+	/* The lowest level the iterator should traverse to */
+	int min_level;
+	/* The iterator's current level within the paging structure */
+	int level;
+	/* A snapshot of the value at sptep */
+	u64 old_spte;
+	/*
+	 * Whether the iterator has a valid state. This will be false if the
+	 * iterator walks off the end of the paging structure.
+	 */
+	bool valid;
+};
+
+/*
+ * Iterates over every SPTE mapping the GFN range [start, end) in a
+ * pre-order traversal.
+ */
+#define for_each_tdp_pte(iter, root, root_level, start, end) \
+	for (tdp_iter_start(&iter, root, root_level, PG_LEVEL_4K, start); \
+	     iter.valid && iter.gfn < end;		     \
+	     tdp_iter_next(&iter))
+
+u64 *spte_to_child_pt(u64 pte, int level);
+
+void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+		    int min_level, gfn_t goal_gfn);
+void tdp_iter_next(struct tdp_iter *iter);
+void tdp_iter_refresh_walk(struct tdp_iter *iter);
+
+#endif /* __KVM_X86_MMU_TDP_ITER_H */
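
To give a sense of the intended calling convention, here is a hypothetical walker built on for_each_tdp_pte(); the helper below is illustrative and not part of this patch (the real users of the iterator arrive in later patches of the series):

/*
 * Hypothetical example: count the present leaf SPTEs mapping the GFN
 * range [start, end) under the page table rooted at root_pt.
 */
static int count_present_leaves(u64 *root_pt, int root_level,
				gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	int count = 0;

	for_each_tdp_pte(iter, root_pt, root_level, start, end) {
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_last_spte(iter.old_spte, iter.level))
			count++;
	}

	return count;
}

The macro initializes the iterator with tdp_iter_start() at PG_LEVEL_4K granularity and advances it with tdp_iter_next(), terminating once iter.valid goes false or iter.gfn reaches end; iter.old_spte is the snapshot taken when the iterator last read the entry.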