
[02/22] kvm: mmu: Introduce tdp_iter

Message ID 20200925212302.3979661-3-bgardon@google.com (mailing list archive)
State New, archived
Series Introduce the TDP MMU

Commit Message

Ben Gardon Sept. 25, 2020, 9:22 p.m. UTC
The TDP iterator implements a pre-order traversal of a TDP paging
structure. This iterator will be used in future patches to create
an efficient implementation of the KVM MMU for the TDP case.

Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
machine. This series introduced no new failures.

This series can be viewed in Gerrit at:
	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/Makefile           |   3 +-
 arch/x86/kvm/mmu/mmu.c          |  19 +---
 arch/x86/kvm/mmu/mmu_internal.h |  15 +++
 arch/x86/kvm/mmu/tdp_iter.c     | 163 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_iter.h     |  53 +++++++++++
 5 files changed, 237 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
 create mode 100644 arch/x86/kvm/mmu/tdp_iter.h

Comments

Paolo Bonzini Sept. 26, 2020, 12:04 a.m. UTC | #1
On 25/09/20 23:22, Ben Gardon wrote:
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> -static bool is_mmio_spte(u64 spte)
> +bool is_mmio_spte(u64 spte)
>  {
>  	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
>  }
> @@ -623,7 +612,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
>  	return vcpu->arch.efer & EFER_NX;
>  }
>  
> -static int is_shadow_present_pte(u64 pte)
> +int is_shadow_present_pte(u64 pte)
>  {
>  	return (pte != 0) && !is_mmio_spte(pte);
>  }
> @@ -633,7 +622,7 @@ static int is_large_pte(u64 pte)
>  	return pte & PT_PAGE_SIZE_MASK;
>  }
>  
> -static int is_last_spte(u64 pte, int level)
> +int is_last_spte(u64 pte, int level)
>  {
>  	if (level == PG_LEVEL_4K)
>  		return 1;
> @@ -647,7 +636,7 @@ static bool is_executable_pte(u64 spte)
>  	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>  }
>  
> -static kvm_pfn_t spte_to_pfn(u64 pte)
> +kvm_pfn_t spte_to_pfn(u64 pte)
>  {
>  	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>  }

Should these be inlines in mmu_internal.h instead?
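For instance, as a rough sketch (the SPTE masks these helpers use would also
have to become visible to mmu_internal.h, which this patch does not do):

	static inline bool is_mmio_spte(u64 spte)
	{
		return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
	}

	static inline int is_shadow_present_pte(u64 pte)
	{
		return (pte != 0) && !is_mmio_spte(pte);
	}

	static inline kvm_pfn_t spte_to_pfn(u64 pte)
	{
		return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
	}

and similarly for is_last_spte().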

Paolo
Paolo Bonzini Sept. 26, 2020, 12:54 a.m. UTC | #2
On 25/09/20 23:22, Ben Gardon wrote:
> +	bool done;
> +
> +	done = try_step_down(iter);
> +	if (done)
> +		return;
> +
> +	done = try_step_side(iter);
> +	while (!done) {
> +		if (!try_step_up(iter)) {
> +			iter->valid = false;
> +			break;
> +		}
> +		done = try_step_side(iter);

Seems easier to read without the "done" boolean:

	if (try_step_down(iter))
		return;

	do {
		/* Maybe try_step_right? :) */
		if (try_step_side(iter))
			return;
	} while (try_step_up(iter));
	iter->valid = false;

Also it may be worth adding an "end_level" argument to the constructor,
and checking against it in try_step_down instead of using PG_LEVEL_4K.
By passing in PG_LEVEL_2M, you can avoid adding
tdp_iter_next_no_step_down in patch 17 and generally simplify the logic
there.
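
Roughly (the extra field and parameter are hypothetical, just to sketch the
idea):

	void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
			    int end_level, gfn_t goal_gfn)
	{
		...
		iter->end_level = end_level;
		...
	}

	static bool try_step_down(struct tdp_iter *iter)
	{
		if (iter->level == iter->end_level)
			return false;
		...
	}

Callers that want the current behavior would pass PG_LEVEL_4K, while the
user added in patch 17 would pass PG_LEVEL_2M.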

Paolo
Sean Christopherson Sept. 30, 2020, 5:06 a.m. UTC | #3
On Sat, Sep 26, 2020 at 02:04:49AM +0200, Paolo Bonzini wrote:
> On 25/09/20 23:22, Ben Gardon wrote:
> >  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
> >  
> > -static bool is_mmio_spte(u64 spte)
> > +bool is_mmio_spte(u64 spte)
> >  {
> >  	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> >  }
> > @@ -623,7 +612,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
> >  	return vcpu->arch.efer & EFER_NX;
> >  }
> >  
> > -static int is_shadow_present_pte(u64 pte)
> > +int is_shadow_present_pte(u64 pte)
> >  {
> >  	return (pte != 0) && !is_mmio_spte(pte);
> >  }
> > @@ -633,7 +622,7 @@ static int is_large_pte(u64 pte)
> >  	return pte & PT_PAGE_SIZE_MASK;
> >  }
> >  
> > -static int is_last_spte(u64 pte, int level)
> > +int is_last_spte(u64 pte, int level)
> >  {
> >  	if (level == PG_LEVEL_4K)
> >  		return 1;
> > @@ -647,7 +636,7 @@ static bool is_executable_pte(u64 spte)
> >  	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> >  }
> >  
> > -static kvm_pfn_t spte_to_pfn(u64 pte)
> > +kvm_pfn_t spte_to_pfn(u64 pte)
> >  {
> >  	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> >  }
> 
> Should these be inlines in mmu_internal.h instead?

Ya, that would be my preference as well.
Sean Christopherson Sept. 30, 2020, 5:08 a.m. UTC | #4
On Fri, Sep 25, 2020 at 02:22:42PM -0700, Ben Gardon wrote:
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> new file mode 100644
> index 0000000000000..b102109778eac
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: GPL-2.0 */

Kernel style is to use C++ comments for the SPDX headers in .c and .h (and
maybe .S too?).
Sean Christopherson Sept. 30, 2020, 5:24 a.m. UTC | #5
On Fri, Sep 25, 2020 at 02:22:42PM -0700, Ben Gardon wrote:
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> new file mode 100644
> index 0000000000000..ee90d62d2a9b1
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -0,0 +1,163 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include "mmu_internal.h"
> +#include "tdp_iter.h"
> +
> +/*
> + * Recalculates the pointer to the SPTE for the current GFN and level and
> + * reread the SPTE.
> + */
> +static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
> +{
> +	iter->sptep = iter->pt_path[iter->level - 1] +
> +		SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
> +	iter->old_spte = READ_ONCE(*iter->sptep);
> +}
> +
> +/*
> + * Sets a TDP iterator to walk a pre-order traversal of the paging structure
> + * rooted at root_pt, starting with the walk to translate goal_gfn.
> + */
> +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> +		    gfn_t goal_gfn)
> +{
> +	WARN_ON(root_level < 1);
> +	WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
> +
> +	iter->goal_gfn = goal_gfn;
> +	iter->root_level = root_level;
> +	iter->level = root_level;
> +	iter->pt_path[iter->level - 1] = root_pt;
> +
> +	iter->gfn = iter->goal_gfn -
> +		(iter->goal_gfn % KVM_PAGES_PER_HPAGE(iter->level));

Maybe use the params, if only to avoid the line wrap?

	iter->gfn = goal_gfn - (goal_gfn % KVM_PAGES_PER_HPAGE(root_level));

Actually, peeking further into the file, this calculation is repeated in both
try_step_up and try_step_down, probably worth adding a helper of some form.
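
E.g. a hypothetical helper along the lines of

	static gfn_t round_gfn_for_level(gfn_t gfn, int level)
	{
		return gfn - (gfn % KVM_PAGES_PER_HPAGE(level));
	}

would turn the above into

	iter->gfn = round_gfn_for_level(goal_gfn, root_level);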

> +	tdp_iter_refresh_sptep(iter);
> +
> +	iter->valid = true;
> +}
> +
> +/*
> + * Given an SPTE and its level, returns a pointer containing the host virtual
> + * address of the child page table referenced by the SPTE. Returns null if
> + * there is no such entry.
> + */
> +u64 *spte_to_child_pt(u64 spte, int level)
> +{
> +	u64 *pt;
> +	/* There's no child entry if this entry isn't present */
> +	if (!is_shadow_present_pte(spte))
> +		return NULL;
> +
> +	/* There is no child page table if this is a leaf entry. */
> +	if (is_last_spte(spte, level))
> +		return NULL;

I'd collapse the checks and their comments.

> +
> +	pt = (u64 *)__va(spte_to_pfn(spte) << PAGE_SHIFT);
> +	return pt;

No need for the local variable or the explicit cast.

	/* There's no child if this entry is non-present or a leaf entry. */
	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
		return NULL;

	return __va(spte_to_pfn(spte) << PAGE_SHIFT);

> +}

...

> +void tdp_iter_next(struct tdp_iter *iter)
> +{
> +	bool done;
> +
> +	done = try_step_down(iter);
> +	if (done)
> +		return;
> +
> +	done = try_step_side(iter);
> +	while (!done) {
> +		if (!try_step_up(iter)) {
> +			iter->valid = false;
> +			break;
> +		}
> +		done = try_step_side(iter);
> +	}

At the risk of being too clever:

	bool done;

	if (try_step_down(iter))
		return;

	do {
		done = try_step_side(iter);
	} while (!done && try_step_up(iter));

	iter->valid = done;

> +}
Paolo Bonzini Sept. 30, 2020, 6:24 a.m. UTC | #6
On 30/09/20 07:24, Sean Christopherson wrote:
> Maybe use the params, if only to avoid the line wrap?
> 
> 	iter->gfn = goal_gfn - (goal_gfn % KVM_PAGES_PER_HPAGE(root_level));
> 
> Actually, peeking further into the file, this calculation is repeated in both
> try_step_up and try_step_down, probably worth adding a helper of some form.

Also it's written more concisely as

	iter->gfn = goal_gfn & -KVM_PAGES_PER_HPAGE(iter->level);
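
(That works because KVM_PAGES_PER_HPAGE() is always a power of two, so ANDing
with its negation clears the offset bits below that power of two, which is
exactly the subtract-the-remainder rounding above.)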

> 
> 
> 	bool done;
> 
> 	if (try_step_down(iter))
> 		return;
> 
> 	do {
> 		done = try_step_side(iter);
> 	} while (!done && try_step_up(iter));
> 
> 	iter->valid = done;

I pointed out something similar in my review, my version was

	bool done;

	if (try_step_down(iter))
		return;

	do {
		if (try_step_side(iter))
			return;
	} while (try_step_up(iter));
	iter->valid = false;

Paolo
Eric van Tassell Sept. 30, 2020, 11:20 p.m. UTC | #7
On 9/25/20 4:22 PM, Ben Gardon wrote:
> The TDP iterator implements a pre-order traversal of a TDP paging
> structure. This iterator will be used in future patches to create
> an efficient implementation of the KVM MMU for the TDP case.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
> 	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>   arch/x86/kvm/Makefile           |   3 +-
>   arch/x86/kvm/mmu/mmu.c          |  19 +---
>   arch/x86/kvm/mmu/mmu_internal.h |  15 +++
>   arch/x86/kvm/mmu/tdp_iter.c     | 163 ++++++++++++++++++++++++++++++++
>   arch/x86/kvm/mmu/tdp_iter.h     |  53 +++++++++++
>   5 files changed, 237 insertions(+), 16 deletions(-)
>   create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
>   create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
> 
> @@ -623,7 +612,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
>   	return vcpu->arch.efer & EFER_NX;
>   }
>   
> -static int is_shadow_present_pte(u64 pte)
> +int is_shadow_present_pte(u64 pte)
>   {
>   	return (pte != 0) && !is_mmio_spte(pte);
From "Figure 28-1: Formats of EPTP and EPT Paging-Structure Entries" of
the manual I don't have at my fingertips right now, I believe you should
only check the low 3 bits (mask = 0x7). Since the upper bits are ignored,
might that not mean they're not guaranteed to be 0?
>   }
Paolo Bonzini Sept. 30, 2020, 11:34 p.m. UTC | #8
On 01/10/20 01:20, Eric van Tassell wrote:
>>
>> +int is_shadow_present_pte(u64 pte)
>>   {
>>       return (pte != 0) && !is_mmio_spte(pte);
> From "Figure 28-1: Formats of EPTP and EPT Paging-Structure Entries" of
> the manual I don't have at my fingertips right now, I believe you should
> only check the low 3 bits (mask = 0x7). Since the upper bits are ignored,
> might that not mean they're not guaranteed to be 0?

No, this a property of the KVM MMU (and how it builds its PTEs) rather
than the hardware present check.

Paolo
Sean Christopherson Oct. 1, 2020, 12:07 a.m. UTC | #9
On Thu, Oct 01, 2020 at 01:34:53AM +0200, Paolo Bonzini wrote:
> On 01/10/20 01:20, Eric van Tassell wrote:
> >>
> >> +int is_shadow_present_pte(u64 pte)
> >>   {
> >>       return (pte != 0) && !is_mmio_spte(pte);
> > From "Figure 28-1: Formats of EPTP and EPT Paging-Structure Entries" of
> > the manual I don't have at my fingertips right now, I believe you should
> > only check the low 3 bits (mask = 0x7). Since the upper bits are ignored,
> > might that not mean they're not guaranteed to be 0?
> 
> No, this a property of the KVM MMU (and how it builds its PTEs) rather
> than the hardware present check.

Ya, I found out the hard way that "present" in is_shadow_present_pte() really
means "valid", or "may be present".  The most notable case is EPT without A/D
bits (I think this is the only case where a valid SPTE can be fully not-present
in hardware).  Accessed tracking will clear all RWX bits to make the EPT entry
not-present, but from KVM's perspective it's treated as valid/present because
it can be made present in hardware without taking the MMU lock.
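
Very roughly, for that case (hypothetical helper name, simplified):

	static bool is_hw_present_ept(u64 spte)
	{
		return spte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
			       VMX_EPT_EXECUTABLE_MASK);
	}

An access-tracked SPTE has all three of those bits cleared, yet
is_shadow_present_pte() still returns true for it, because the original
protections are stashed in bits of the SPTE that hardware ignores and can
be restored without taking the MMU lock.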

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4a3081e9f4b5d..cf6a9947955f7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,7 +15,8 @@  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
+			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
+			   mmu/tdp_iter.o
 
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 81240b558d67f..b48b00c8cde65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -134,15 +134,6 @@  module_param(dbg, bool, 0644);
 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
 #define SPTE_MMIO_MASK (3ULL << 52)
 
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
-		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_INDEX(address, level)\
-	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
 #define PT32_LEVEL_BITS 10
 
 #define PT32_LEVEL_SHIFT(level) \
@@ -192,8 +183,6 @@  module_param(dbg, bool, 0644);
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
 /* make pte_list_desc fit well in cache line */
 #define PTE_LIST_EXT 3
 
@@ -346,7 +335,7 @@  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static bool is_mmio_spte(u64 spte)
+bool is_mmio_spte(u64 spte)
 {
 	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
 }
@@ -623,7 +612,7 @@  static int is_nx(struct kvm_vcpu *vcpu)
 	return vcpu->arch.efer & EFER_NX;
 }
 
-static int is_shadow_present_pte(u64 pte)
+int is_shadow_present_pte(u64 pte)
 {
 	return (pte != 0) && !is_mmio_spte(pte);
 }
@@ -633,7 +622,7 @@  static int is_large_pte(u64 pte)
 	return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_last_spte(u64 pte, int level)
+int is_last_spte(u64 pte, int level)
 {
 	if (level == PG_LEVEL_4K)
 		return 1;
@@ -647,7 +636,7 @@  static bool is_executable_pte(u64 spte)
 	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 }
 
-static kvm_pfn_t spte_to_pfn(u64 pte)
+kvm_pfn_t spte_to_pfn(u64 pte)
 {
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 3acf3b8eb469d..65bb110847858 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -60,4 +60,19 @@  void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn);
 
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_INDEX(address, level)\
+	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
+/* Functions for interpreting SPTEs */
+kvm_pfn_t spte_to_pfn(u64 pte);
+bool is_mmio_spte(u64 spte);
+int is_shadow_present_pte(u64 pte);
+int is_last_spte(u64 pte, int level);
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
new file mode 100644
index 0000000000000..ee90d62d2a9b1
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -0,0 +1,163 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "mmu_internal.h"
+#include "tdp_iter.h"
+
+/*
+ * Recalculates the pointer to the SPTE for the current GFN and level and
+ * reread the SPTE.
+ */
+static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
+{
+	iter->sptep = iter->pt_path[iter->level - 1] +
+		SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+	iter->old_spte = READ_ONCE(*iter->sptep);
+}
+
+/*
+ * Sets a TDP iterator to walk a pre-order traversal of the paging structure
+ * rooted at root_pt, starting with the walk to translate goal_gfn.
+ */
+void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+		    gfn_t goal_gfn)
+{
+	WARN_ON(root_level < 1);
+	WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
+
+	iter->goal_gfn = goal_gfn;
+	iter->root_level = root_level;
+	iter->level = root_level;
+	iter->pt_path[iter->level - 1] = root_pt;
+
+	iter->gfn = iter->goal_gfn -
+		(iter->goal_gfn % KVM_PAGES_PER_HPAGE(iter->level));
+	tdp_iter_refresh_sptep(iter);
+
+	iter->valid = true;
+}
+
+/*
+ * Given an SPTE and its level, returns a pointer containing the host virtual
+ * address of the child page table referenced by the SPTE. Returns null if
+ * there is no such entry.
+ */
+u64 *spte_to_child_pt(u64 spte, int level)
+{
+	u64 *pt;
+	/* There's no child entry if this entry isn't present */
+	if (!is_shadow_present_pte(spte))
+		return NULL;
+
+	/* There is no child page table if this is a leaf entry. */
+	if (is_last_spte(spte, level))
+		return NULL;
+
+	pt = (u64 *)__va(spte_to_pfn(spte) << PAGE_SHIFT);
+	return pt;
+}
+
+/*
+ * Steps down one level in the paging structure towards the goal GFN. Returns
+ * true if the iterator was able to step down a level, false otherwise.
+ */
+static bool try_step_down(struct tdp_iter *iter)
+{
+	u64 *child_pt;
+
+	if (iter->level == PG_LEVEL_4K)
+		return false;
+
+	/*
+	 * Reread the SPTE before stepping down to avoid traversing into page
+	 * tables that are no longer linked from this entry.
+	 */
+	iter->old_spte = READ_ONCE(*iter->sptep);
+
+	child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+	if (!child_pt)
+		return false;
+
+	iter->level--;
+	iter->pt_path[iter->level - 1] = child_pt;
+	iter->gfn = iter->goal_gfn -
+		(iter->goal_gfn % KVM_PAGES_PER_HPAGE(iter->level));
+	tdp_iter_refresh_sptep(iter);
+
+	return true;
+}
+
+/*
+ * Steps to the next entry in the current page table, at the current page table
+ * level. The next entry could point to a page backing guest memory or another
+ * page table, or it could be non-present. Returns true if the iterator was
+ * able to step to the next entry in the page table, false if the iterator was
+ * already at the end of the current page table.
+ */
+static bool try_step_side(struct tdp_iter *iter)
+{
+	/*
+	 * Check if the iterator is already at the end of the current page
+	 * table.
+	 */
+	if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
+	      KVM_PAGES_PER_HPAGE(iter->level + 1)))
+		return false;
+
+	iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+	iter->goal_gfn = iter->gfn;
+	iter->sptep++;
+	iter->old_spte = READ_ONCE(*iter->sptep);
+
+	return true;
+}
+
+/*
+ * Tries to traverse back up a level in the paging structure so that the walk
+ * can continue from the next entry in the parent page table. Returns true on a
+ * successful step up, false if already in the root page.
+ */
+static bool try_step_up(struct tdp_iter *iter)
+{
+	if (iter->level == iter->root_level)
+		return false;
+
+	iter->level++;
+	iter->gfn =  iter->gfn - (iter->gfn % KVM_PAGES_PER_HPAGE(iter->level));
+	tdp_iter_refresh_sptep(iter);
+
+	return true;
+}
+
+/*
+ * Step to the next SPTE in a pre-order traversal of the paging structure.
+ * To get to the next SPTE, the iterator either steps down towards the goal
+ * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
+ * higher GFN.
+ *
+ * The basic algorithm is as follows:
+ * 1. If the current SPTE is a non-last-level SPTE, step down into the page
+ *    table it points to.
+ * 2. If the iterator cannot step down, it will try to step to the next SPTE
+ *    in the current page of the paging structure.
+ * 3. If the iterator cannot step to the next entry in the current page, it will
+ *    try to step up to the parent paging structure page. In this case, that
+ *    SPTE will have already been visited, and so the iterator must also step
+ *    to the side again.
+ */
+void tdp_iter_next(struct tdp_iter *iter)
+{
+	bool done;
+
+	done = try_step_down(iter);
+	if (done)
+		return;
+
+	done = try_step_side(iter);
+	while (!done) {
+		if (!try_step_up(iter)) {
+			iter->valid = false;
+			break;
+		}
+		done = try_step_side(iter);
+	}
+}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
new file mode 100644
index 0000000000000..b102109778eac
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -0,0 +1,53 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_X86_MMU_TDP_ITER_H
+#define __KVM_X86_MMU_TDP_ITER_H
+
+#include <linux/kvm_host.h>
+
+#include "mmu.h"
+
+/*
+ * A TDP iterator performs a pre-order walk over a TDP paging structure.
+ */
+struct tdp_iter {
+	/*
+	 * The iterator will traverse the paging structure towards the mapping
+	 * for this GFN.
+	 */
+	gfn_t goal_gfn;
+	/* Pointers to the page tables traversed to reach the current SPTE */
+	u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+	/* A pointer to the current SPTE */
+	u64 *sptep;
+	/* The lowest GFN mapped by the current SPTE */
+	gfn_t gfn;
+	/* The level of the root page given to the iterator */
+	int root_level;
+	/* The iterator's current level within the paging structure */
+	int level;
+	/* A snapshot of the value at sptep */
+	u64 old_spte;
+	/*
+	 * Whether the iterator has a valid state. This will be false if the
+	 * iterator walks off the end of the paging structure.
+	 */
+	bool valid;
+};
+
+/*
+ * Iterates over every SPTE mapping the GFN range [start, end) in a
+ * preorder traversal.
+ */
+#define for_each_tdp_pte(iter, root, root_level, start, end) \
+	for (tdp_iter_start(&iter, root, root_level, start); \
+	     iter.valid && iter.gfn < end;		     \
+	     tdp_iter_next(&iter))
+
+u64 *spte_to_child_pt(u64 pte, int level);
+
+void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+		    gfn_t goal_gfn);
+void tdp_iter_next(struct tdp_iter *iter);
+
+#endif /* __KVM_X86_MMU_TDP_ITER_H */