[v4,02/21] KVM: arm64: Add stand-alone page-table walker infrastructure

Message ID 20200907152344.12978-3-will@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: Rewrite page-table code and fault handling

Commit Message

Will Deacon Sept. 7, 2020, 3:23 p.m. UTC
The KVM page-table code is intricately tied into the kernel page-table
code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt
to reduce code duplication. Unfortunately, the reality is that there is
an awful lot of code required to make this work, and at the end of the
day you're limited to creating page-tables with the same configuration
as the host kernel. Furthermore, lifting the page-table code to run
directly at EL2 on a non-VHE system (as we plan to do in future
patches) is practically impossible due to the number of dependencies it
has on the core kernel.

Introduce a framework for walking Armv8 page-tables configured
independently from the host kernel.

Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_pgtable.h | 101 ++++++++++
 arch/arm64/kvm/hyp/Makefile          |   2 +-
 arch/arm64/kvm/hyp/pgtable.c         | 285 +++++++++++++++++++++++++++
 3 files changed, 387 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/kvm_pgtable.h
 create mode 100644 arch/arm64/kvm/hyp/pgtable.c

Comments

Gavin Shan Sept. 8, 2020, 12:03 a.m. UTC | #1
Hi Will,

On 9/8/20 1:23 AM, Will Deacon wrote:
> The KVM page-table code is intricately tied into the kernel page-table
> code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt
> to reduce code duplication. Unfortunately, the reality is that there is
> an awful lot of code required to make this work, and at the end of the
> day you're limited to creating page-tables with the same configuration
> as the host kernel. Furthermore, lifting the page-table code to run
> directly at EL2 on a non-VHE system (as we plan to do in future
> patches) is practically impossible due to the number of dependencies it
> has on the core kernel.
> 
> Introduce a framework for walking Armv8 page-tables configured
> independently from the host kernel.
> 
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Quentin Perret <qperret@google.com>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---

It looks good to me. However, if you get a chance to respin, it would
be nice to add some comments about kvm_block_mapping_supported().
More details are provided below.

Reviewed-by: Gavin Shan <gshan@redhat.com>

>   arch/arm64/include/asm/kvm_pgtable.h | 101 ++++++++++
>   arch/arm64/kvm/hyp/Makefile          |   2 +-
>   arch/arm64/kvm/hyp/pgtable.c         | 285 +++++++++++++++++++++++++++
>   3 files changed, 387 insertions(+), 1 deletion(-)
>   create mode 100644 arch/arm64/include/asm/kvm_pgtable.h
>   create mode 100644 arch/arm64/kvm/hyp/pgtable.c
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> new file mode 100644
> index 000000000000..1c5d981e15c3
> --- /dev/null
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 Google LLC
> + * Author: Will Deacon <will@kernel.org>
> + */
> +
> +#ifndef __ARM64_KVM_PGTABLE_H__
> +#define __ARM64_KVM_PGTABLE_H__
> +
> +#include <linux/bits.h>
> +#include <linux/kvm_host.h>
> +#include <linux/types.h>
> +
> +typedef u64 kvm_pte_t;
> +
> +/**
> + * struct kvm_pgtable - KVM page-table.
> + * @ia_bits:		Maximum input address size, in bits.
> + * @start_level:	Level at which the page-table walk starts.
> + * @pgd:		Pointer to the first top-level entry of the page-table.
> + * @mmu:		Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
> + */
> +struct kvm_pgtable {
> +	u32					ia_bits;
> +	u32					start_level;
> +	kvm_pte_t				*pgd;
> +
> +	/* Stage-2 only */
> +	struct kvm_s2_mmu			*mmu;
> +};
> +
> +/**
> + * enum kvm_pgtable_prot - Page-table permissions and attributes.
> + * @KVM_PGTABLE_PROT_X:		Execute permission.
> + * @KVM_PGTABLE_PROT_W:		Write permission.
> + * @KVM_PGTABLE_PROT_R:		Read permission.
> + * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
> + */
> +enum kvm_pgtable_prot {
> +	KVM_PGTABLE_PROT_X			= BIT(0),
> +	KVM_PGTABLE_PROT_W			= BIT(1),
> +	KVM_PGTABLE_PROT_R			= BIT(2),
> +
> +	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
> +};
> +
> +/**
> + * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
> + * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
> + *					entries.
> + * @KVM_PGTABLE_WALK_TABLE_PRE:		Visit table entries before their
> + *					children.
> + * @KVM_PGTABLE_WALK_TABLE_POST:	Visit table entries after their
> + *					children.
> + */
> +enum kvm_pgtable_walk_flags {
> +	KVM_PGTABLE_WALK_LEAF			= BIT(0),
> +	KVM_PGTABLE_WALK_TABLE_PRE		= BIT(1),
> +	KVM_PGTABLE_WALK_TABLE_POST		= BIT(2),
> +};
> +
> +typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
> +					kvm_pte_t *ptep,
> +					enum kvm_pgtable_walk_flags flag,
> +					void * const arg);
> +
> +/**
> + * struct kvm_pgtable_walker - Hook into a page-table walk.
> + * @cb:		Callback function to invoke during the walk.
> + * @arg:	Argument passed to the callback function.
> + * @flags:	Bitwise-OR of flags to identify the entry types on which to
> + *		invoke the callback function.
> + */
> +struct kvm_pgtable_walker {
> +	const kvm_pgtable_visitor_fn_t		cb;
> +	void * const				arg;
> +	const enum kvm_pgtable_walk_flags	flags;
> +};
> +
> +/**
> + * kvm_pgtable_walk() - Walk a page-table.
> + * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> + * @addr:	Input address for the start of the walk.
> + * @size:	Size of the range to walk.
> + * @walker:	Walker callback description.
> + *
> + * The walker will walk the page-table entries corresponding to the input
> + * address range specified, visiting entries according to the walker flags.
> + * Invalid entries are treated as leaf entries. Leaf entries are reloaded
> + * after invoking the walker callback, allowing the walker to descend into
> + * a newly installed table.
> + *
> + * Returning a negative error code from the walker callback function will
> + * terminate the walk immediately with the same error code.
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +		     struct kvm_pgtable_walker *walker);
> +
> +#endif	/* __ARM64_KVM_PGTABLE_H__ */
> diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
> index f54f0e89a71c..607b8a898826 100644
> --- a/arch/arm64/kvm/hyp/Makefile
> +++ b/arch/arm64/kvm/hyp/Makefile
> @@ -10,5 +10,5 @@ subdir-ccflags-y := -I$(incdir)				\
>   		    -DDISABLE_BRANCH_PROFILING		\
>   		    $(DISABLE_STACKLEAK_PLUGIN)
>   
> -obj-$(CONFIG_KVM) += vhe/ nvhe/
> +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
>   obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> new file mode 100644
> index 000000000000..3fb9d1949a3f
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -0,0 +1,285 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
> + * No bombay mix was harmed in the writing of this file.
> + *
> + * Copyright (C) 2020 Google LLC
> + * Author: Will Deacon <will@kernel.org>
> + */
> +
> +#include <linux/bitfield.h>
> +#include <asm/kvm_pgtable.h>
> +
> +#define KVM_PGTABLE_MAX_LEVELS		4U
> +
> +#define KVM_PTE_VALID			BIT(0)
> +
> +#define KVM_PTE_TYPE			BIT(1)
> +#define KVM_PTE_TYPE_BLOCK		0
> +#define KVM_PTE_TYPE_PAGE		1
> +#define KVM_PTE_TYPE_TABLE		1
> +
> +#define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
> +#define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
> +
> +#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
> +
> +#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
> +
> +struct kvm_pgtable_walk_data {
> +	struct kvm_pgtable		*pgt;
> +	struct kvm_pgtable_walker	*walker;
> +
> +	u64				addr;
> +	u64				end;
> +};
> +
> +static u64 kvm_granule_shift(u32 level)
> +{
> +	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
> +	return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
> +}
> +
> +static u64 kvm_granule_size(u32 level)
> +{
> +	return BIT(kvm_granule_shift(level));
> +}
> +
> +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
> +{
> +	u64 granule = kvm_granule_size(level);
> +
> +	/*
> +	 * Reject invalid block mappings and don't bother with 4TB mappings for
> +	 * 52-bit PAs.
> +	 */
> +	if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
> +		return false;
> +
> +	if (granule > (end - addr))
> +		return false;
> +
> +	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
> +}
> +

If you get a chance to respin, it would be nice to have more comments
explaining why the 4TB block mapping is rejected here. I guess it's because
hugetlb/THP don't support that kind of huge block mapping yet?

> +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
> +{
> +	u64 shift = kvm_granule_shift(level);
> +	u64 mask = BIT(PAGE_SHIFT - 3) - 1;
> +
> +	return (data->addr >> shift) & mask;
> +}
> +
> +static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
> +{
> +	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
> +	u64 mask = BIT(pgt->ia_bits) - 1;
> +
> +	return (addr & mask) >> shift;
> +}
> +
> +static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
> +{
> +	return __kvm_pgd_page_idx(data->pgt, data->addr);
> +}
> +
> +static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
> +{
> +	struct kvm_pgtable pgt = {
> +		.ia_bits	= ia_bits,
> +		.start_level	= start_level,
> +	};
> +
> +	return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
> +}
> +
> +static bool kvm_pte_valid(kvm_pte_t pte)
> +{
> +	return pte & KVM_PTE_VALID;
> +}
> +
> +static bool kvm_pte_table(kvm_pte_t pte, u32 level)
> +{
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return false;
> +
> +	if (!kvm_pte_valid(pte))
> +		return false;
> +
> +	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
> +}
> +
> +static u64 kvm_pte_to_phys(kvm_pte_t pte)
> +{
> +	u64 pa = pte & KVM_PTE_ADDR_MASK;
> +
> +	if (PAGE_SHIFT == 16)
> +		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
> +
> +	return pa;
> +}
> +
> +static kvm_pte_t kvm_phys_to_pte(u64 pa)
> +{
> +	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
> +
> +	if (PAGE_SHIFT == 16)
> +		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
> +
> +	return pte;
> +}
> +
> +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
> +{
> +	return __va(kvm_pte_to_phys(pte));
> +}
> +
> +static void kvm_set_invalid_pte(kvm_pte_t *ptep)
> +{
> +	kvm_pte_t pte = *ptep;
> +	WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
> +}
> +
> +static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
> +{
> +	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
> +
> +	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
> +	pte |= KVM_PTE_VALID;
> +
> +	WARN_ON(kvm_pte_valid(old));
> +	smp_store_release(ptep, pte);
> +}
> +
> +static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
> +				   u32 level)
> +{
> +	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
> +	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
> +							   KVM_PTE_TYPE_BLOCK;
> +
> +	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
> +	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
> +	pte |= KVM_PTE_VALID;
> +
> +	/* Tolerate KVM recreating the exact same mapping. */
> +	if (kvm_pte_valid(old))
> +		return old == pte;
> +
> +	smp_store_release(ptep, pte);
> +	return true;
> +}
> +
> +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
> +				  u32 level, kvm_pte_t *ptep,
> +				  enum kvm_pgtable_walk_flags flag)
> +{
> +	struct kvm_pgtable_walker *walker = data->walker;
> +	return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
> +}
> +
> +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
> +			      kvm_pte_t *pgtable, u32 level);
> +
> +static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
> +				      kvm_pte_t *ptep, u32 level)
> +{
> +	int ret = 0;
> +	u64 addr = data->addr;
> +	kvm_pte_t *childp, pte = *ptep;
> +	bool table = kvm_pte_table(pte, level);
> +	enum kvm_pgtable_walk_flags flags = data->walker->flags;
> +
> +	if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
> +		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
> +					     KVM_PGTABLE_WALK_TABLE_PRE);
> +	}
> +
> +	if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
> +		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
> +					     KVM_PGTABLE_WALK_LEAF);
> +		pte = *ptep;
> +		table = kvm_pte_table(pte, level);
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	if (!table) {
> +		data->addr += kvm_granule_size(level);
> +		goto out;
> +	}
> +
> +	childp = kvm_pte_follow(pte);
> +	ret = __kvm_pgtable_walk(data, childp, level + 1);
> +	if (ret)
> +		goto out;
> +
> +	if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
> +		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
> +					     KVM_PGTABLE_WALK_TABLE_POST);
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
> +			      kvm_pte_t *pgtable, u32 level)
> +{
> +	u32 idx;
> +	int ret = 0;
> +
> +	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
> +		return -EINVAL;
> +
> +	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
> +		kvm_pte_t *ptep = &pgtable[idx];
> +
> +		if (data->addr >= data->end)
> +			break;
> +
> +		ret = __kvm_pgtable_visit(data, ptep, level);
> +		if (ret)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
> +static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
> +{
> +	u32 idx;
> +	int ret = 0;
> +	struct kvm_pgtable *pgt = data->pgt;
> +	u64 limit = BIT(pgt->ia_bits);
> +
> +	if (data->addr > limit || data->end > limit)
> +		return -ERANGE;
> +
> +	if (!pgt->pgd)
> +		return -EINVAL;
> +
> +	for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
> +		kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
> +
> +		ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
> +		if (ret)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
> +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +		     struct kvm_pgtable_walker *walker)
> +{
> +	struct kvm_pgtable_walk_data walk_data = {
> +		.pgt	= pgt,
> +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> +		.end	= PAGE_ALIGN(walk_data.addr + size),
> +		.walker	= walker,
> +	};
> +
> +	return _kvm_pgtable_walk(&walk_data);
> +}
> 

Thanks,
Gavin
Alexandru Elisei Sept. 9, 2020, 3:29 p.m. UTC | #2
Hi Will,

On 9/7/20 4:23 PM, Will Deacon wrote:
> [..]
> +
> +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +		     struct kvm_pgtable_walker *walker)
> +{
> +	struct kvm_pgtable_walk_data walk_data = {
> +		.pgt	= pgt,
> +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> +		.end	= PAGE_ALIGN(walk_data.addr + size),
> +		.walker	= walker,
> +	};

If the caller wants to walk [0x500, 0x1500), for PAGE_SIZE = 0x1000 (4K), the
function walks the range [0x0, 0x1000). Is that intentional?

Thanks,
Alex
Will Deacon Sept. 10, 2020, 10:57 a.m. UTC | #3
On Tue, Sep 08, 2020 at 10:03:30AM +1000, Gavin Shan wrote:
> On 9/8/20 1:23 AM, Will Deacon wrote:
> > The KVM page-table code is intricately tied into the kernel page-table
> > code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt
> > to reduce code duplication. Unfortunately, the reality is that there is
> > an awful lot of code required to make this work, and at the end of the
> > day you're limited to creating page-tables with the same configuration
> > as the host kernel. Furthermore, lifting the page-table code to run
> > directly at EL2 on a non-VHE system (as we plan to do in future
> > patches) is practically impossible due to the number of dependencies it
> > has on the core kernel.
> > 
> > Introduce a framework for walking Armv8 page-tables configured
> > independently from the host kernel.
> > 
> > Cc: Marc Zyngier <maz@kernel.org>
> > Cc: Quentin Perret <qperret@google.com>
> > Signed-off-by: Will Deacon <will@kernel.org>
> > ---
> 
> It looks good to me. However, if you get a chance to respin, it would
> be nice to add some comments about kvm_block_mapping_supported().
> More details are provided below.
> 
> Reviewed-by: Gavin Shan <gshan@redhat.com>

[...]

> > +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
> > +{
> > +	u64 granule = kvm_granule_size(level);
> > +
> > +	/*
> > +	 * Reject invalid block mappings and don't bother with 4TB mappings for
> > +	 * 52-bit PAs.
> > +	 */
> > +	if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
> > +		return false;
> > +
> > +	if (granule > (end - addr))
> > +		return false;
> > +
> > +	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
> > +}
> > +
> 
> If you get a chance to respin, it would be nice to have more comments
> explaining why the 4TB block mapping is rejected here. I guess it's because
> hugetlb/THP don't support that kind of huge block mapping yet?

It's just because I'm lazy, really :) We don't know if we're using 52-bit
PAs here, and I couldn't be bothered to propagate that information given
that this doesn't seem like something anybody will be using at the moment.
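
To spell out the sizes involved, here's a rough standalone sketch of the
ARM64_HW_PGTABLE_LEVEL_SHIFT() arithmetic (illustration only, not the kernel
header): a level 1 block is 1GB with 4k pages, but 64GB with 16k pages and
4TB with 64k pages, which is what the level == 1 check skips for the non-4k
cases.

    #include <stdio.h>

    /* shift(level) = (PAGE_SHIFT - 3) * (4 - level) + 3 */
    static unsigned int granule_shift(unsigned int page_shift, unsigned int level)
    {
            return (page_shift - 3) * (4 - level) + 3;
    }

    int main(void)
    {
            unsigned int page_shift[] = { 12, 14, 16 };     /* 4k, 16k, 64k */
            unsigned int i, level;

            /* e.g. page_shift=16, level=1 gives 2^42 bytes, i.e. 4TB */
            for (i = 0; i < 3; i++)
                    for (level = 1; level < 4; level++)
                            printf("page_shift=%u level=%u block=2^%u bytes\n",
                                   page_shift[i], level,
                                   granule_shift(page_shift[i], level));
            return 0;
    }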

Will
Will Deacon Sept. 10, 2020, 12:37 p.m. UTC | #4
On Wed, Sep 09, 2020 at 04:29:26PM +0100, Alexandru Elisei wrote:
> On 9/7/20 4:23 PM, Will Deacon wrote:
> > [..]
> > +
> > +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +		     struct kvm_pgtable_walker *walker)
> > +{
> > +	struct kvm_pgtable_walk_data walk_data = {
> > +		.pgt	= pgt,
> > +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> > +		.end	= PAGE_ALIGN(walk_data.addr + size),
> > +		.walker	= walker,
> > +	};
> 
> If the caller wants to walk [0x500, 0x1500), for PAGE_SIZE = 0x1000 (4K), the
> function walks the range [0x0, 0x1000). Is that intentional?

Yes, although the caller specifies a base and a *size*, rather than an end
address. As a concrete example, much of the hypervisor stage-1 mapping
is created using PAGE_SIZE mappings of random ELF symbols, which correspond
to arbitrary addresses. In these cases, we really do want to round down the
address and perform a PAGE_SIZE mapping.

The alternative would be to return an error if the size is not page-aligned,
but I don't think that's really necessary, especially since we accept an
unaligned base.

Will
Andrew Scull Sept. 10, 2020, 2:21 p.m. UTC | #5
On Thu, Sep 10, 2020 at 01:37:13PM +0100, Will Deacon wrote:
> On Wed, Sep 09, 2020 at 04:29:26PM +0100, Alexandru Elisei wrote:
> > On 9/7/20 4:23 PM, Will Deacon wrote:
> > > [..]
> > > +
> > > +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > +		     struct kvm_pgtable_walker *walker)
> > > +{
> > > +	struct kvm_pgtable_walk_data walk_data = {
> > > +		.pgt	= pgt,
> > > +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> > > +		.end	= PAGE_ALIGN(walk_data.addr + size),
> > > +		.walker	= walker,
> > > +	};
> > 
> > If the caller wants to walk [0x500, 0x1500), for PAGE_SIZE = 0x1000 (4K), the
> > function walks the range [0x0, 0x1000). Is that intentional?
> 
> Yes, although the caller specifies a base and a *size*, rather than an end
> address. As a concrete example, much of the hypervisor stage-1 mapping
> is created using PAGE_SIZE mappings of random ELF symbols, which correspond
> to arbitrary addresses. In these cases, we really do want to round down the
> address and perform a PAGE_SIZE mapping.

I think Alexandru has a point here. Turning his example into something
equivalent that maps a random ELF symbol:

    struct some_hyp_state s = { ... };
    // &s == 0x500
    // sizeof(s) == PAGE_SIZE
    kvm_pgtable_walk(pgt, (u64)&s, sizeof(s), walker);

Given `s` straddles the two pages, the part in the second page won't be
mapped.

Should the end address instead be calculated as:

   .end = PAGE_ALIGN(addr + size),
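
To make the difference concrete, here's a rough standalone comparison of the
two end calculations for the example above (4k pages assumed; it mirrors
ALIGN_DOWN()/PAGE_ALIGN() rather than pulling in the kernel macros):

    #include <stdint.h>
    #include <stdio.h>

    #define PG      0x1000ULL       /* 4k */

    static uint64_t align_down(uint64_t x) { return x & ~(PG - 1); }
    static uint64_t page_align(uint64_t x) { return (x + PG - 1) & ~(PG - 1); }

    int main(void)
    {
            uint64_t addr = 0x500, size = PG;       /* &s and sizeof(s) above */
            uint64_t start = align_down(addr);

            /* current code: [0x0, 0x1000), one page, missing [0x1000, 0x1500) */
            printf("current:  [%#llx, %#llx)\n", (unsigned long long)start,
                   (unsigned long long)page_align(start + size));

            /* proposed:    [0x0, 0x2000), two pages, covering all of s */
            printf("proposed: [%#llx, %#llx)\n", (unsigned long long)start,
                   (unsigned long long)page_align(addr + size));
            return 0;
    }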

> 
> The alternative would be to return an error if the size is not page-aligned,
> but I don't think that's really necessary, especially since we accept an
> unaligned base.
> 
> Will
> 
Will Deacon Sept. 11, 2020, 10:15 a.m. UTC | #6
On Thu, Sep 10, 2020 at 03:21:59PM +0100, Andrew Scull wrote:
> On Thu, Sep 10, 2020 at 01:37:13PM +0100, Will Deacon wrote:
> > On Wed, Sep 09, 2020 at 04:29:26PM +0100, Alexandru Elisei wrote:
> > > On 9/7/20 4:23 PM, Will Deacon wrote:
> > > > [..]
> > > > +
> > > > +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > > +		     struct kvm_pgtable_walker *walker)
> > > > +{
> > > > +	struct kvm_pgtable_walk_data walk_data = {
> > > > +		.pgt	= pgt,
> > > > +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> > > > +		.end	= PAGE_ALIGN(walk_data.addr + size),
> > > > +		.walker	= walker,
> > > > +	};
> > > 
> > > If the caller wants to walk [0x500, 0x1500), for PAGE_SIZE = 0x1000 (4K), the
> > > function walks the range [0x0, 0x1000). Is that intentional?
> > 
> > Yes, although the caller specifies a base and a *size*, rather than an end
> > address. As a concrete example, much of the hypervisor stage-1 mapping
> > is created using PAGE_SIZE mappings of random ELF symbols, which correspond
> > to arbitrary addresses. In these cases, we really do want to round down the
> > address and perform a PAGE_SIZE mapping.
> 
> I think Alexandru has a point here. Turning his example into something
> equivalent that maps a random ELF symbol:
> 
>     struct some_hyp_state s = { ... };
>     // &s == 0x500
>     // sizeof(s) == PAGE_SIZE
>     kvm_pgtable_walk(pgt, (u64)&s, sizeof(s), walker);
> 
> Given `s` straddles the two pages, the part in the second page won't be
> mapped.
> 
> Should the end address instead be calculated as:
> 
>    .end = PAGE_ALIGN(addr + size),

Cheers for the example, and I see what you mean about structures that
straddle a page boundary. However, I think it's important here that the size
parameter accurately reflects the number of pages mapped: if the caller
passes PAGE_SIZE, we better not map more than a page, since the mmu cache
might not have enough pre-allocated pages for that.

So the API is just that the virtual address bits corresponding to the offset
within a page are ignored. Looking back at the code, that works out for the
hyp mappings (it's actually the _physical_ address that is unaligned there,
and we can just round that down), but I think I have a potential bug in the
ioremap code if you place the GIC (v2) somewhere funky on a machine using
64k pages. In this case, the ioctl() handler only enforces 4k alignment,
and so we could end up truncating the mapping in a similar case to what you
describe above. For example, trying to place it from 60k - 68k would result
in only the first page getting mapped.

I've fixed that in the ioremap code (diff below), and I'll update the
kerneldoc to say that the bottom bits of the VA are ignored.

Cheers,

Will

--->8

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1041be1fafe4..21b70abf65a7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -505,6 +505,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                                     KVM_PGTABLE_PROT_R |
                                     (writable ? KVM_PGTABLE_PROT_W : 0);
 
+       size += offset_in_page(guest_ipa);
+       guest_ipa &= PAGE_MASK;
+
        for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
                ret = kvm_mmu_topup_memory_cache(&cache,
                                                 kvm_mmu_cache_min_pages(kvm));
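
For the 60k - 68k example, the adjusted values work out like this (rough
standalone sketch assuming 64k pages, mirroring offset_in_page()/PAGE_MASK
rather than using the kernel definitions):

    #include <stdint.h>
    #include <stdio.h>

    #define PG_64K  0x10000ULL

    int main(void)
    {
            uint64_t guest_ipa = 0xf000;    /* 60k */
            uint64_t size = 0x2000;         /* 8k: the region spans 60k - 68k */
            uint64_t addr, pages = 0;

            size += guest_ipa & (PG_64K - 1);       /* offset_in_page(): size = 68k */
            guest_ipa &= ~(PG_64K - 1);             /* PAGE_MASK: guest_ipa = 0     */

            for (addr = guest_ipa; addr < guest_ipa + size; addr += PG_64K)
                    pages++;

            /* Two 64k pages, covering [0k, 128k), so 64k - 68k is no longer lost. */
            printf("%llu pages\n", (unsigned long long)pages);
            return 0;
    }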
Andrew Scull Sept. 11, 2020, 11:22 a.m. UTC | #7
On Fri, Sep 11, 2020 at 11:15:04AM +0100, Will Deacon wrote:
> On Thu, Sep 10, 2020 at 03:21:59PM +0100, Andrew Scull wrote:
> > On Thu, Sep 10, 2020 at 01:37:13PM +0100, Will Deacon wrote:
> > > On Wed, Sep 09, 2020 at 04:29:26PM +0100, Alexandru Elisei wrote:
> > > > On 9/7/20 4:23 PM, Will Deacon wrote:
> > > > > [..]
> > > > > +
> > > > > +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > > > +		     struct kvm_pgtable_walker *walker)
> > > > > +{
> > > > > +	struct kvm_pgtable_walk_data walk_data = {
> > > > > +		.pgt	= pgt,
> > > > > +		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
> > > > > +		.end	= PAGE_ALIGN(walk_data.addr + size),
> > > > > +		.walker	= walker,
> > > > > +	};
> > > > 
> > > > If the caller wants to walk [0x500, 0x1500), for PAGE_SIZE = 0x1000 (4K), the
> > > > function walks the range [0x0, 0x1000). Is that intentional?
> > > 
> > > Yes, although the caller specifies a base and a *size*, rather than an end
> > > address. As a concrete example, much of the hypervisor stage-1 mapping
> > > is created using PAGE_SIZE mappings of random ELF symbols, which correspond
> > > to arbitrary addresses. In these cases, we really do want to round down the
> > > address and perform a PAGE_SIZE mapping.
> > 
> > I think Alexandru has a point here. Turning his example into something
> > equivalent that maps a random ELF symbol:
> > 
> >     struct some_hyp_state s = { ... };
> >     // &s == 0x500
> >     // sizeof(s) == PAGE_SIZE
> >     kvm_pgtable_walk(pgt, (u64)&s, sizeof(s), walker);
> > 
> > Given `s` straddles the two pages, the part in the second page won't be
> > mapped.
> > 
> > Should the end address instead be calculated as:
> > 
> >    .end = PAGE_ALIGN(addr + size),
> 
> Cheers for the example, and I see what you mean about structures that
> straddle a page boundary. However, I think it's important here that the size
> parameter accurately reflects the number of pages mapped: if the caller
> passes PAGE_SIZE, we better not map more than a page, since the mmu cache
> might not have enough pre-allocated pages for that.
> 
> So the API is just that the virtual address bits corresponding to the offset
> within a page are ignored. Looking back at the code, that works out for the
> hyp mappings (it's actually the _physical_ address that is unaligned there,
> and we can just round that down), but I think I have a potential bug in the
> ioremap code if you place the GIC (v2) somewhere funky on a machine using
> 64k pages. In this case, the ioctl() handler only enforces 4k alignment,
> and so we could end up truncating the mapping in a similar case to what you
> describe above. For example, trying to place it from 60k - 68k would result
> in only the first page getting mapped.
> 
> I've fixed that in the ioremap code (diff below), and I'll update the
> kerneldoc to say that the bottom bits of the VA are ignored.

You've described kvm_pgtable_walk as taking a start page and a page count,
so an alternative to the comment could be to define the function to reflect
that, e.g.:

    int kvm_pgtable_walk(struct kvm_pgtable *pgt, gfn_t start,
    			 size_t count, struct kvm_pgtable_walker *walker);

Up to you.

> 
> Cheers,
> 
> Will
> 
> --->8
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 1041be1fafe4..21b70abf65a7 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -505,6 +505,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>                                      KVM_PGTABLE_PROT_R |
>                                      (writable ? KVM_PGTABLE_PROT_W : 0);
>  
> +       size += offset_in_page(guest_ipa);
> +       guest_ipa &= PAGE_MASK;
> +
>         for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
>                 ret = kvm_mmu_topup_memory_cache(&cache,
>                                                  kvm_mmu_cache_min_pages(kvm));
>

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 000000000000..1c5d981e15c3
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,101 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:		Maximum input address size, in bits.
+ * @start_level:	Level at which the page-table walk starts.
+ * @pgd:		Pointer to the first top-level entry of the page-table.
+ * @mmu:		Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+	u32					ia_bits;
+	u32					start_level;
+	kvm_pte_t				*pgd;
+
+	/* Stage-2 only */
+	struct kvm_s2_mmu			*mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X:		Execute permission.
+ * @KVM_PGTABLE_PROT_W:		Write permission.
+ * @KVM_PGTABLE_PROT_R:		Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
+ */
+enum kvm_pgtable_prot {
+	KVM_PGTABLE_PROT_X			= BIT(0),
+	KVM_PGTABLE_PROT_W			= BIT(1),
+	KVM_PGTABLE_PROT_R			= BIT(2),
+
+	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
+};
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
+ *					entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE:		Visit table entries before their
+ *					children.
+ * @KVM_PGTABLE_WALK_TABLE_POST:	Visit table entries after their
+ *					children.
+ */
+enum kvm_pgtable_walk_flags {
+	KVM_PGTABLE_WALK_LEAF			= BIT(0),
+	KVM_PGTABLE_WALK_TABLE_PRE		= BIT(1),
+	KVM_PGTABLE_WALK_TABLE_POST		= BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+					kvm_pte_t *ptep,
+					enum kvm_pgtable_walk_flags flag,
+					void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb:		Callback function to invoke during the walk.
+ * @arg:	Argument passed to the callback function.
+ * @flags:	Bitwise-OR of flags to identify the entry types on which to
+ *		invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+	const kvm_pgtable_visitor_fn_t		cb;
+	void * const				arg;
+	const enum kvm_pgtable_walk_flags	flags;
+};
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr:	Input address for the start of the walk.
+ * @size:	Size of the range to walk.
+ * @walker:	Walker callback description.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+		     struct kvm_pgtable_walker *walker);
+
+#endif	/* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e89a71c..607b8a898826 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,5 +10,5 @@  subdir-ccflags-y := -I$(incdir)				\
 		    -DDISABLE_BRANCH_PROFILING		\
 		    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
 obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..3fb9d1949a3f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,285 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS		4U
+
+#define KVM_PTE_VALID			BIT(0)
+
+#define KVM_PTE_TYPE			BIT(1)
+#define KVM_PTE_TYPE_BLOCK		0
+#define KVM_PTE_TYPE_PAGE		1
+#define KVM_PTE_TYPE_TABLE		1
+
+#define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
+
+struct kvm_pgtable_walk_data {
+	struct kvm_pgtable		*pgt;
+	struct kvm_pgtable_walker	*walker;
+
+	u64				addr;
+	u64				end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+	return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+	return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+	u64 granule = kvm_granule_size(level);
+
+	/*
+	 * Reject invalid block mappings and don't bother with 4TB mappings for
+	 * 52-bit PAs.
+	 */
+	if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+		return false;
+
+	if (granule > (end - addr))
+		return false;
+
+	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+	u64 shift = kvm_granule_shift(level);
+	u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+	return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+	u64 mask = BIT(pgt->ia_bits) - 1;
+
+	return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+	return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+	struct kvm_pgtable pgt = {
+		.ia_bits	= ia_bits,
+		.start_level	= start_level,
+	};
+
+	return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+	return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return false;
+
+	if (!kvm_pte_valid(pte))
+		return false;
+
+	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+	u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+	if (PAGE_SHIFT == 16)
+		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+	return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+	if (PAGE_SHIFT == 16)
+		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+	return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+	return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+	kvm_pte_t pte = *ptep;
+	WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+	pte |= KVM_PTE_VALID;
+
+	WARN_ON(kvm_pte_valid(old));
+	smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+				   u32 level)
+{
+	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+							   KVM_PTE_TYPE_BLOCK;
+
+	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+	pte |= KVM_PTE_VALID;
+
+	/* Tolerate KVM recreating the exact same mapping. */
+	if (kvm_pte_valid(old))
+		return old == pte;
+
+	smp_store_release(ptep, pte);
+	return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+				  u32 level, kvm_pte_t *ptep,
+				  enum kvm_pgtable_walk_flags flag)
+{
+	struct kvm_pgtable_walker *walker = data->walker;
+	return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+			      kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+				      kvm_pte_t *ptep, u32 level)
+{
+	int ret = 0;
+	u64 addr = data->addr;
+	kvm_pte_t *childp, pte = *ptep;
+	bool table = kvm_pte_table(pte, level);
+	enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+	if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_TABLE_PRE);
+	}
+
+	if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_LEAF);
+		pte = *ptep;
+		table = kvm_pte_table(pte, level);
+	}
+
+	if (ret)
+		goto out;
+
+	if (!table) {
+		data->addr += kvm_granule_size(level);
+		goto out;
+	}
+
+	childp = kvm_pte_follow(pte);
+	ret = __kvm_pgtable_walk(data, childp, level + 1);
+	if (ret)
+		goto out;
+
+	if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_TABLE_POST);
+	}
+
+out:
+	return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+			      kvm_pte_t *pgtable, u32 level)
+{
+	u32 idx;
+	int ret = 0;
+
+	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+		return -EINVAL;
+
+	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+		kvm_pte_t *ptep = &pgtable[idx];
+
+		if (data->addr >= data->end)
+			break;
+
+		ret = __kvm_pgtable_visit(data, ptep, level);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+	u32 idx;
+	int ret = 0;
+	struct kvm_pgtable *pgt = data->pgt;
+	u64 limit = BIT(pgt->ia_bits);
+
+	if (data->addr > limit || data->end > limit)
+		return -ERANGE;
+
+	if (!pgt->pgd)
+		return -EINVAL;
+
+	for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+		kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+		ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+		     struct kvm_pgtable_walker *walker)
+{
+	struct kvm_pgtable_walk_data walk_data = {
+		.pgt	= pgt,
+		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
+		.end	= PAGE_ALIGN(walk_data.addr + size),
+		.walker	= walker,
+	};
+
+	return _kvm_pgtable_walk(&walk_data);
+}