diff mbox series

[v9,4/5] KVM: arm64: Register ptdump with debugfs on guest creation

Message ID 20240827084549.45731-5-sebastianene@google.com (mailing list archive)
State New, archived
Headers show
Series arm64: ptdump: View the second stage page-tables | expand

Commit Message

Sebastian Ene Aug. 27, 2024, 8:45 a.m. UTC
While arch/*/mm/ptdump handles the kernel pagetable dumping code,
introduce KVM/ptdump to show the guest stage-2 pagetables. The
separation is necessary because most of the definitions from the
stage-2 pagetable reside in the KVM path and we will be invoking
functionality specific to KVM.

When a guest is created, register a new file entry under the guest
debugfs dir which allows userspace to show the contents of the guest
stage-2 pagetables when accessed.

Signed-off-by: Sebastian Ene <sebastianene@google.com>
---
 arch/arm64/include/asm/kvm_host.h |   6 +
 arch/arm64/kvm/Makefile           |   1 +
 arch/arm64/kvm/arm.c              |   1 +
 arch/arm64/kvm/ptdump.c           | 247 ++++++++++++++++++++++++++++++
 4 files changed, 255 insertions(+)
 create mode 100644 arch/arm64/kvm/ptdump.c

Comments

Vincent Donnefort Aug. 30, 2024, 10:24 a.m. UTC | #1
Hi Seb,

Thanks for the respin.

On Tue, Aug 27, 2024 at 08:45:47AM +0000, Sebastian Ene wrote:
> While arch/*/mem/ptdump handles the kernel pagetable dumping code,
> introduce KVM/ptdump to show the guest stage-2 pagetables. The
> separation is necessary because most of the definitions from the
> stage-2 pagetable reside in the KVM path and we will be invoking
> functionality specific to KVM.
> 
> When a guest is created, register a new file entry under the guest
> debugfs dir which allows userspace to show the contents of the guest
> stage-2 pagetables when accessed.
> 
> Signed-off-by: Sebastian Ene <sebastianene@google.com>

I only have some nits, otherwise:

Reviewed-by: Vincent Donnefort <vdonnefort@google.com>

> ---
>  arch/arm64/include/asm/kvm_host.h |   6 +
>  arch/arm64/kvm/Makefile           |   1 +
>  arch/arm64/kvm/arm.c              |   1 +
>  arch/arm64/kvm/ptdump.c           | 247 ++++++++++++++++++++++++++++++
>  4 files changed, 255 insertions(+)
>  create mode 100644 arch/arm64/kvm/ptdump.c
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index a33f5996ca9f..4acd589f086b 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -1473,4 +1473,10 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
>  		(pa + pi + pa3) == 1;					\
>  	})
>  
> +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
> +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
> +#else
> +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
> +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
> +
>  #endif /* __ARM64_KVM_HOST_H__ */
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 86a629aaf0a1..e4233b323a73 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
>  
>  kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
>  kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
> +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
>  
>  always-y := hyp_constants.h hyp-constants.s
>  
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 9bef7638342e..b9fd928d3477 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -228,6 +228,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
>  void kvm_arch_create_vm_debugfs(struct kvm *kvm)
>  {
>  	kvm_sys_regs_create_debugfs(kvm);
> +	kvm_s2_ptdump_create_debugfs(kvm);
>  }
>  
>  static void kvm_destroy_mpidr_data(struct kvm *kvm)
> diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> new file mode 100644
> index 000000000000..e72a928d4445
> --- /dev/null
> +++ b/arch/arm64/kvm/ptdump.c
> @@ -0,0 +1,247 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Debug helper used to dump the stage-2 pagetables of the system and their
> + * associated permissions.
> + *
> + * Copyright (C) Google, 2024
> + * Author: Sebastian Ene <sebastianene@google.com>
> + */
> +#include <linux/debugfs.h>
> +#include <linux/kvm_host.h>
> +#include <linux/seq_file.h>
> +
> +#include <asm/kvm_pgtable.h>
> +#include <asm/kvm_host.h>

nit: I believe you wanted to follow alphabetical order; if that is the case,
kvm_host.h should come before kvm_pgtable.h.

> +#include <asm/ptdump.h>
> +
> +

nit: I don't think double empty lines are the rule; I would remove one.

> +#define MARKERS_LEN		(2)

nit: The brackets are not necessary for MARKERS_LEN.

> +#define KVM_PGTABLE_MAX_LEVELS	(KVM_PGTABLE_LAST_LEVEL + 1)
> +
> +struct kvm_ptdump_guest_state {
> +	struct kvm		*kvm;
> +	struct ptdump_pg_state	parser_state;
> +	struct addr_marker	ipa_marker[MARKERS_LEN];
> +	struct ptdump_pg_level	level[KVM_PGTABLE_MAX_LEVELS];
> +	struct ptdump_range	range[MARKERS_LEN];
> +};
> +
> +static const struct ptdump_prot_bits stage2_pte_bits[] = {
> +	{
> +		.mask	= PTE_VALID,
> +		.val	= PTE_VALID,
> +		.set	= " ",
> +		.clear	= "F",

This is effectively never used because an invalid PTE is 0 and note_page() won't
print it. This probably can be removed?

> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> +		.set	= "R",
> +		.clear	= " ",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> +		.set	= "W",
> +		.clear	= " ",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> +		.val	= PTE_VALID,
> +		.set	= " ",
> +		.clear	= "X",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> +		.set	= "AF",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PTE_TABLE_BIT | PTE_VALID,
> +		.val	= PTE_VALID,
> +		.set	= "BLK",
> +		.clear	= "   ",
> +	},
> +};
> +
> +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
> +			      enum kvm_pgtable_walk_flags visit)
> +{
> +	struct ptdump_pg_state *st = ctx->arg;
> +	struct ptdump_state *pt_st = &st->ptdump;
> +
> +	note_page(pt_st, ctx->addr, ctx->level, ctx->old);
> +
> +	return 0;
> +}
> +
> +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
> +{
> +	u32 i;
> +	u64 mask;
> +
> +	if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
> +		return -EINVAL;
> +
> +	mask = 0;
> +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> +		mask |= stage2_pte_bits[i].mask;
> +
> +	for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
> +		snprintf(level[i].name, sizeof(level[i].name), "%d", i);

%u, i being unsigned.

> +
> +		level[i].num	= ARRAY_SIZE(stage2_pte_bits);
> +		level[i].bits	= stage2_pte_bits;
> +		level[i].mask	= mask;
> +	}
> +
> +	return 0;
> +}
> +
> +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
> +{
> +	struct kvm_ptdump_guest_state *st;
> +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> +	struct kvm_pgtable *pgtable = mmu->pgt;
> +	int ret;
> +
> +	st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
> +	if (!st)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
> +	if (ret) {
> +		kfree(st);
> +		return ERR_PTR(ret);
> +	}
> +
> +	st->ipa_marker[0].name		= "Guest IPA";
> +	st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
> +	st->range[0].end		= BIT(pgtable->ia_bits);
> +
> +	st->kvm				= kvm;
> +	st->parser_state = (struct ptdump_pg_state) {
> +		.marker		= &st->ipa_marker[0],
> +		.level		= -1,
> +		.pg_level	= &st->level[0],
> +		.ptdump.range	= &st->range[0],
> +		.start_address	= 0,
> +	};
> +
> +	return st;
> +}
> +
> +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
> +{
> +	int ret;
> +	struct kvm_ptdump_guest_state *st = m->private;
> +	struct kvm *kvm = st->kvm;
> +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> +	struct ptdump_pg_state *parser_state = &st->parser_state;
> +	struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
> +		.cb	= kvm_ptdump_visitor,
> +		.arg	= parser_state,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +	};
> +
> +	parser_state->seq = m;
> +
> +	write_lock(&kvm->mmu_lock);
> +	ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
> +	write_unlock(&kvm->mmu_lock);
> +
> +	return ret;
> +}
> +
> +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> +{
> +	struct kvm *kvm = m->i_private;
> +	struct kvm_ptdump_guest_state *st;
> +	int ret;
> +
> +	if (!kvm_get_kvm_safe(kvm))
> +		return -ENOENT;
> +
> +	st = kvm_ptdump_parser_create(kvm);
> +	if (IS_ERR(st)) {
> +		ret = PTR_ERR(st);
> +		goto free_with_kvm_ref;
> +	}
> +
> +	ret = single_open(file, kvm_ptdump_guest_show, st);
> +	if (!ret)
> +		return 0;
> +
> +	kfree(st);
> +free_with_kvm_ref:

nit: I believe kfree understands IS_ERR() so you could have a single "err:"
label covering all the error paths.

> +	kvm_put_kvm(kvm);
> +	return ret;
> +}
> +
> +static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
> +{
> +	struct kvm *kvm = m->i_private;
> +	void *st = ((struct seq_file *)file->private_data)->private;
> +
> +	kfree(st);
> +	kvm_put_kvm(kvm);
> +
> +	return single_release(m, file);
> +}
> +
> +static const struct file_operations kvm_ptdump_guest_fops = {
> +	.open		= kvm_ptdump_guest_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= kvm_ptdump_guest_close,
> +};
> +
> +static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
> +{
> +	const struct file *file = m->file;
> +	struct kvm_pgtable *pgtable = m->private;
> +
> +	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
> +		seq_printf(m, "%2u\n", pgtable->ia_bits);
> +	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
> +		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);

nit: KVM_PGTABLE_MAX_LEVELS - pgtable->start_level ?

> +	return 0;
> +}
> +
> +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
> +{
> +	struct kvm *kvm = m->i_private;
> +	struct kvm_pgtable *pgtable;
> +	int ret;
> +
> +	if (!kvm_get_kvm_safe(kvm))
> +		return -ENOENT;
> +
> +	pgtable = kvm->arch.mmu.pgt;
> +
> +	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
> +	if (ret < 0)
> +		kvm_put_kvm(kvm);
> +	return ret;
> +}
> +
> +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
> +{
> +	struct kvm *kvm = m->i_private;
> +
> +	kvm_put_kvm(kvm);
> +	return single_release(m, file);
> +}
> +
> +static const struct file_operations kvm_pgtable_debugfs_fops = {
> +	.open		= kvm_pgtable_debugfs_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= kvm_pgtable_debugfs_close,
> +};
> +
> +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
> +{
> +	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
> +			    kvm, &kvm_ptdump_guest_fops);
> +	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
> +			    &kvm_pgtable_debugfs_fops);
> +	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
> +			    kvm, &kvm_pgtable_debugfs_fops);
> +}
> -- 
> 2.46.0.295.g3b9ea8a38a-goog
>
Marc Zyngier Aug. 30, 2024, 2:11 p.m. UTC | #2
On Fri, 30 Aug 2024 11:24:53 +0100,
Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> Hi Seb,
> 
> Thanks for the respin.
> 
> On Tue, Aug 27, 2024 at 08:45:47AM +0000, Sebastian Ene wrote:
> > While arch/*/mem/ptdump handles the kernel pagetable dumping code,
> > introduce KVM/ptdump to show the guest stage-2 pagetables. The
> > separation is necessary because most of the definitions from the
> > stage-2 pagetable reside in the KVM path and we will be invoking
> > functionality specific to KVM.
> > 
> > When a guest is created, register a new file entry under the guest
> > debugfs dir which allows userspace to show the contents of the guest
> > stage-2 pagetables when accessed.
> > 
> > Signed-off-by: Sebastian Ene <sebastianene@google.com>
> 
> I only have some nits, otherwise:
> 
> Reviewed-by: Vincent Donnefort <vdonnefort@google.com>
> 
> > ---
> >  arch/arm64/include/asm/kvm_host.h |   6 +
> >  arch/arm64/kvm/Makefile           |   1 +
> >  arch/arm64/kvm/arm.c              |   1 +
> >  arch/arm64/kvm/ptdump.c           | 247 ++++++++++++++++++++++++++++++
> >  4 files changed, 255 insertions(+)
> >  create mode 100644 arch/arm64/kvm/ptdump.c
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index a33f5996ca9f..4acd589f086b 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -1473,4 +1473,10 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
> >  		(pa + pi + pa3) == 1;					\
> >  	})
> >  
> > +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
> > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
> > +#else
> > +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
> > +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
> > +
> >  #endif /* __ARM64_KVM_HOST_H__ */
> > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > index 86a629aaf0a1..e4233b323a73 100644
> > --- a/arch/arm64/kvm/Makefile
> > +++ b/arch/arm64/kvm/Makefile
> > @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
> >  
> >  kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
> >  kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
> > +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
> >  
> >  always-y := hyp_constants.h hyp-constants.s
> >  
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 9bef7638342e..b9fd928d3477 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -228,6 +228,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> >  void kvm_arch_create_vm_debugfs(struct kvm *kvm)
> >  {
> >  	kvm_sys_regs_create_debugfs(kvm);
> > +	kvm_s2_ptdump_create_debugfs(kvm);
> >  }
> >  
> >  static void kvm_destroy_mpidr_data(struct kvm *kvm)
> > diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> > new file mode 100644
> > index 000000000000..e72a928d4445
> > --- /dev/null
> > +++ b/arch/arm64/kvm/ptdump.c
> > @@ -0,0 +1,247 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Debug helper used to dump the stage-2 pagetables of the system and their
> > + * associated permissions.
> > + *
> > + * Copyright (C) Google, 2024
> > + * Author: Sebastian Ene <sebastianene@google.com>
> > + */
> > +#include <linux/debugfs.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/seq_file.h>
> > +
> > +#include <asm/kvm_pgtable.h>
> > +#include <asm/kvm_host.h>
> 
> nit: I believe you wanted to follow the alphabetical order, if that is the case,
> kvm_host.h then kvm_pgtable.h
> 
> > +#include <asm/ptdump.h>
> > +
> > +
> 
> nit: don't think double empty are a rule, I would remove it.
> 
> > +#define MARKERS_LEN		(2)
> 
> nit: The brackets are not necessary for MARKERS_LEN.
> 
> > +#define KVM_PGTABLE_MAX_LEVELS	(KVM_PGTABLE_LAST_LEVEL + 1)
> > +
> > +struct kvm_ptdump_guest_state {
> > +	struct kvm		*kvm;
> > +	struct ptdump_pg_state	parser_state;
> > +	struct addr_marker	ipa_marker[MARKERS_LEN];
> > +	struct ptdump_pg_level	level[KVM_PGTABLE_MAX_LEVELS];
> > +	struct ptdump_range	range[MARKERS_LEN];
> > +};
> > +
> > +static const struct ptdump_prot_bits stage2_pte_bits[] = {
> > +	{
> > +		.mask	= PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= " ",
> > +		.clear	= "F",
> 
> This is effectively never used because an invalid PTE is 0 and note_page() won't
> print it. This probably can be removed?

Yeah, I can't see how we are going to trigger that one given that
PTE_VALID must be set, and that we only print something if the bit is
clear.

Seb?

>
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.set	= "R",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.set	= "W",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= " ",
> > +		.clear	= "X",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.set	= "AF",
> > +		.clear	= "  ",
> > +	}, {
> > +		.mask	= PTE_TABLE_BIT | PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= "BLK",
> > +		.clear	= "   ",
> > +	},
> > +};
> > +
> > +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
> > +			      enum kvm_pgtable_walk_flags visit)
> > +{
> > +	struct ptdump_pg_state *st = ctx->arg;
> > +	struct ptdump_state *pt_st = &st->ptdump;
> > +
> > +	note_page(pt_st, ctx->addr, ctx->level, ctx->old);
> > +
> > +	return 0;
> > +}
> > +
> > +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
> > +{
> > +	u32 i;
> > +	u64 mask;
> > +
> > +	if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
> > +		return -EINVAL;
> > +
> > +	mask = 0;
> > +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> > +		mask |= stage2_pte_bits[i].mask;
> > +
> > +	for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
> > +		snprintf(level[i].name, sizeof(level[i].name), "%d", i);
> 
> %u, i being unsigned.
> 
> > +
> > +		level[i].num	= ARRAY_SIZE(stage2_pte_bits);
> > +		level[i].bits	= stage2_pte_bits;
> > +		level[i].mask	= mask;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
> > +{
> > +	struct kvm_ptdump_guest_state *st;
> > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > +	struct kvm_pgtable *pgtable = mmu->pgt;
> > +	int ret;
> > +
> > +	st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
> > +	if (!st)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
> > +	if (ret) {
> > +		kfree(st);
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	st->ipa_marker[0].name		= "Guest IPA";
> > +	st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
> > +	st->range[0].end		= BIT(pgtable->ia_bits);
> > +
> > +	st->kvm				= kvm;
> > +	st->parser_state = (struct ptdump_pg_state) {
> > +		.marker		= &st->ipa_marker[0],
> > +		.level		= -1,
> > +		.pg_level	= &st->level[0],
> > +		.ptdump.range	= &st->range[0],
> > +		.start_address	= 0,
> > +	};
> > +
> > +	return st;
> > +}
> > +
> > +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
> > +{
> > +	int ret;
> > +	struct kvm_ptdump_guest_state *st = m->private;
> > +	struct kvm *kvm = st->kvm;
> > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > +	struct ptdump_pg_state *parser_state = &st->parser_state;
> > +	struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
> > +		.cb	= kvm_ptdump_visitor,
> > +		.arg	= parser_state,
> > +		.flags	= KVM_PGTABLE_WALK_LEAF,
> > +	};
> > +
> > +	parser_state->seq = m;
> > +
> > +	write_lock(&kvm->mmu_lock);
> > +	ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
> > +	write_unlock(&kvm->mmu_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	struct kvm_ptdump_guest_state *st;
> > +	int ret;
> > +
> > +	if (!kvm_get_kvm_safe(kvm))
> > +		return -ENOENT;
> > +
> > +	st = kvm_ptdump_parser_create(kvm);
> > +	if (IS_ERR(st)) {
> > +		ret = PTR_ERR(st);
> > +		goto free_with_kvm_ref;
> > +	}
> > +
> > +	ret = single_open(file, kvm_ptdump_guest_show, st);
> > +	if (!ret)
> > +		return 0;
> > +
> > +	kfree(st);
> > +free_with_kvm_ref:
> 
> nit: I believe kfree understands IS_ERR() so you could have a simple "err:"
> label covering all the error path.

I couldn't find such handling in kfree(). Could you point me to it?

> 
> > +	kvm_put_kvm(kvm);
> > +	return ret;
> > +}
> > +
> > +static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	void *st = ((struct seq_file *)file->private_data)->private;
> > +
> > +	kfree(st);
> > +	kvm_put_kvm(kvm);
> > +
> > +	return single_release(m, file);
> > +}
> > +
> > +static const struct file_operations kvm_ptdump_guest_fops = {
> > +	.open		= kvm_ptdump_guest_open,
> > +	.read		= seq_read,
> > +	.llseek		= seq_lseek,
> > +	.release	= kvm_ptdump_guest_close,
> > +};
> > +
> > +static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
> > +{
> > +	const struct file *file = m->file;
> > +	struct kvm_pgtable *pgtable = m->private;
> > +
> > +	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))

I really dislike this sort of construct, and I'd rather we pick the
correct callback by construction rather than relying on a string
comparison. See below for a suggestion.

> > +		seq_printf(m, "%2u\n", pgtable->ia_bits);
> > +	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
> > +		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
> 
> nit: KVM_PGTABLE_MAX_LEVELS - pgtable->start_level ?
> 
> > +	return 0;
> > +}
> > +
> > +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	struct kvm_pgtable *pgtable;
> > +	int ret;
> > +
> > +	if (!kvm_get_kvm_safe(kvm))
> > +		return -ENOENT;
> > +
> > +	pgtable = kvm->arch.mmu.pgt;
> > +
> > +	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
> > +	if (ret < 0)
> > +		kvm_put_kvm(kvm);
> > +	return ret;
> > +}
> > +
> > +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +
> > +	kvm_put_kvm(kvm);
> > +	return single_release(m, file);
> > +}
> > +
> > +static const struct file_operations kvm_pgtable_debugfs_fops = {
> > +	.open		= kvm_pgtable_debugfs_open,
> > +	.read		= seq_read,
> > +	.llseek		= seq_lseek,
> > +	.release	= kvm_pgtable_debugfs_close,
> > +};
> > +
> > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
> > +{
> > +	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
> > +			    kvm, &kvm_ptdump_guest_fops);
> > +	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
> > +			    &kvm_pgtable_debugfs_fops);
> > +	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
> > +			    kvm, &kvm_pgtable_debugfs_fops);
> > +}

I'd expect something like this instead:

diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index e72a928d4445..c11ea355aa51 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -192,19 +191,24 @@ static const struct file_operations kvm_ptdump_guest_fops = {
 	.release	= kvm_ptdump_guest_close,
 };
 
-static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
+static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
+{
+	struct kvm_pgtable *pgtable = m->private;
+
+	seq_printf(m, "%2u\n", pgtable->ia_bits);
+	return 0;
+}
+
+static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
 {
-	const struct file *file = m->file;
 	struct kvm_pgtable *pgtable = m->private;
 
-	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
-		seq_printf(m, "%2u\n", pgtable->ia_bits);
-	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
-		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
+	seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
 	return 0;
 }
 
-static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
+static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
+				    int (*show)(struct seq_file *, void *))
 {
 	struct kvm *kvm = m->i_private;
 	struct kvm_pgtable *pgtable;
@@ -215,12 +219,22 @@ static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
 
 	pgtable = kvm->arch.mmu.pgt;
 
-	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
+	ret = single_open(file, show, pgtable);
 	if (ret < 0)
 		kvm_put_kvm(kvm);
 	return ret;
 }
 
+static int kvm_pgtable_range_open(struct inode *m, struct file *file)
+{
+	return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
+}
+
+static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
+{
+	return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
+}
+
 static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
 {
 	struct kvm *kvm = m->i_private;
@@ -229,8 +243,15 @@ static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
 	return single_release(m, file);
 }
 
-static const struct file_operations kvm_pgtable_debugfs_fops = {
-	.open		= kvm_pgtable_debugfs_open,
+static const struct file_operations kvm_pgtable_range_fops = {
+	.open		= kvm_pgtable_range_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= kvm_pgtable_debugfs_close,
+};
+
+static const struct file_operations kvm_pgtable_levels_fops = {
+	.open		= kvm_pgtable_levels_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= kvm_pgtable_debugfs_close,
@@ -241,7 +262,7 @@ void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
 	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
 			    kvm, &kvm_ptdump_guest_fops);
 	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
-			    &kvm_pgtable_debugfs_fops);
+			    &kvm_pgtable_range_fops);
 	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
-			    kvm, &kvm_pgtable_debugfs_fops);
+			    kvm, &kvm_pgtable_levels_fops);
 }


Thanks,

	M.
Sebastian Ene Sept. 2, 2024, 5:27 a.m. UTC | #3
On Fri, Aug 30, 2024 at 03:11:34PM +0100, Marc Zyngier wrote:
> On Fri, 30 Aug 2024 11:24:53 +0100,
> Vincent Donnefort <vdonnefort@google.com> wrote:
> > 
> > Hi Seb,
> > 
> > Thanks for the respin.
> > 
> > On Tue, Aug 27, 2024 at 08:45:47AM +0000, Sebastian Ene wrote:
> > > While arch/*/mem/ptdump handles the kernel pagetable dumping code,
> > > introduce KVM/ptdump to show the guest stage-2 pagetables. The
> > > separation is necessary because most of the definitions from the
> > > stage-2 pagetable reside in the KVM path and we will be invoking
> > > functionality specific to KVM.
> > > 
> > > When a guest is created, register a new file entry under the guest
> > > debugfs dir which allows userspace to show the contents of the guest
> > > stage-2 pagetables when accessed.
> > > 
> > > Signed-off-by: Sebastian Ene <sebastianene@google.com>
> > 
> > I only have some nits, otherwise:
> > 
> > Reviewed-by: Vincent Donnefort <vdonnefort@google.com>
> > 
> > > ---
> > >  arch/arm64/include/asm/kvm_host.h |   6 +
> > >  arch/arm64/kvm/Makefile           |   1 +
> > >  arch/arm64/kvm/arm.c              |   1 +
> > >  arch/arm64/kvm/ptdump.c           | 247 ++++++++++++++++++++++++++++++
> > >  4 files changed, 255 insertions(+)
> > >  create mode 100644 arch/arm64/kvm/ptdump.c
> > > 
> > > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > > index a33f5996ca9f..4acd589f086b 100644
> > > --- a/arch/arm64/include/asm/kvm_host.h
> > > +++ b/arch/arm64/include/asm/kvm_host.h
> > > @@ -1473,4 +1473,10 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
> > >  		(pa + pi + pa3) == 1;					\
> > >  	})
> > >  
> > > +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
> > > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
> > > +#else
> > > +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
> > > +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
> > > +
> > >  #endif /* __ARM64_KVM_HOST_H__ */
> > > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > > index 86a629aaf0a1..e4233b323a73 100644
> > > --- a/arch/arm64/kvm/Makefile
> > > +++ b/arch/arm64/kvm/Makefile
> > > @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
> > >  
> > >  kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
> > >  kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
> > > +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
> > >  
> > >  always-y := hyp_constants.h hyp-constants.s
> > >  
> > > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > > index 9bef7638342e..b9fd928d3477 100644
> > > --- a/arch/arm64/kvm/arm.c
> > > +++ b/arch/arm64/kvm/arm.c
> > > @@ -228,6 +228,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> > >  void kvm_arch_create_vm_debugfs(struct kvm *kvm)
> > >  {
> > >  	kvm_sys_regs_create_debugfs(kvm);
> > > +	kvm_s2_ptdump_create_debugfs(kvm);
> > >  }
> > >  
> > >  static void kvm_destroy_mpidr_data(struct kvm *kvm)
> > > diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> > > new file mode 100644
> > > index 000000000000..e72a928d4445
> > > --- /dev/null
> > > +++ b/arch/arm64/kvm/ptdump.c
> > > @@ -0,0 +1,247 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * Debug helper used to dump the stage-2 pagetables of the system and their
> > > + * associated permissions.
> > > + *
> > > + * Copyright (C) Google, 2024
> > > + * Author: Sebastian Ene <sebastianene@google.com>
> > > + */
> > > +#include <linux/debugfs.h>
> > > +#include <linux/kvm_host.h>
> > > +#include <linux/seq_file.h>
> > > +
> > > +#include <asm/kvm_pgtable.h>
> > > +#include <asm/kvm_host.h>
> > 
> > nit: I believe you wanted to follow the alphabetical order, if that is the case,
> > kvm_host.h then kvm_pgtable.h
> > 
> > > +#include <asm/ptdump.h>
> > > +
> > > +
> > 
> > > nit: I don't think double empty lines are a rule, I would remove the extra one.
> > 
> > > +#define MARKERS_LEN		(2)
> > 
> > nit: The brackets are not necessary for MARKERS_LEN.
> > 
> > > +#define KVM_PGTABLE_MAX_LEVELS	(KVM_PGTABLE_LAST_LEVEL + 1)
> > > +
> > > +struct kvm_ptdump_guest_state {
> > > +	struct kvm		*kvm;
> > > +	struct ptdump_pg_state	parser_state;
> > > +	struct addr_marker	ipa_marker[MARKERS_LEN];
> > > +	struct ptdump_pg_level	level[KVM_PGTABLE_MAX_LEVELS];
> > > +	struct ptdump_range	range[MARKERS_LEN];
> > > +};
> > > +
> > > +static const struct ptdump_prot_bits stage2_pte_bits[] = {
> > > +	{
> > > +		.mask	= PTE_VALID,
> > > +		.val	= PTE_VALID,
> > > +		.set	= " ",
> > > +		.clear	= "F",
> > 
> > This is effectively never used because an invalid PTE is 0 and note_page() won't
> > print it. This probably can be removed?
> 
> Yeah, I can't see how we are going to trigger that one given that
> PTE_VALID must be set, and that we only print something if the bit is
> clear.
> 
> Seb?

When ((pte_prot & mask) == val) we print .set (which is a single space here),
otherwise we print .clear (this is from dump_prot). Invalid non-zero PTEs
should be printed in this case.

> 
> >
> > > +	}, {
> > > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > > +		.set	= "R",
> > > +		.clear	= " ",
> > > +	}, {
> > > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > > +		.set	= "W",
> > > +		.clear	= " ",
> > > +	}, {
> > > +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > > +		.val	= PTE_VALID,
> > > +		.set	= " ",
> > > +		.clear	= "X",
> > > +	}, {
> > > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > > +		.set	= "AF",
> > > +		.clear	= "  ",
> > > +	}, {
> > > +		.mask	= PTE_TABLE_BIT | PTE_VALID,
> > > +		.val	= PTE_VALID,
> > > +		.set	= "BLK",
> > > +		.clear	= "   ",
> > > +	},
> > > +};
> > > +
> > > +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
> > > +			      enum kvm_pgtable_walk_flags visit)
> > > +{
> > > +	struct ptdump_pg_state *st = ctx->arg;
> > > +	struct ptdump_state *pt_st = &st->ptdump;
> > > +
> > > +	note_page(pt_st, ctx->addr, ctx->level, ctx->old);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
> > > +{
> > > +	u32 i;
> > > +	u64 mask;
> > > +
> > > +	if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
> > > +		return -EINVAL;
> > > +
> > > +	mask = 0;
> > > +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> > > +		mask |= stage2_pte_bits[i].mask;
> > > +
> > > +	for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
> > > +		snprintf(level[i].name, sizeof(level[i].name), "%d", i);
> > 
> > %u, i being unsigned.
> > 
> > > +
> > > +		level[i].num	= ARRAY_SIZE(stage2_pte_bits);
> > > +		level[i].bits	= stage2_pte_bits;
> > > +		level[i].mask	= mask;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
> > > +{
> > > +	struct kvm_ptdump_guest_state *st;
> > > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > > +	struct kvm_pgtable *pgtable = mmu->pgt;
> > > +	int ret;
> > > +
> > > +	st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
> > > +	if (!st)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
> > > +	if (ret) {
> > > +		kfree(st);
> > > +		return ERR_PTR(ret);
> > > +	}
> > > +
> > > +	st->ipa_marker[0].name		= "Guest IPA";
> > > +	st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
> > > +	st->range[0].end		= BIT(pgtable->ia_bits);
> > > +
> > > +	st->kvm				= kvm;
> > > +	st->parser_state = (struct ptdump_pg_state) {
> > > +		.marker		= &st->ipa_marker[0],
> > > +		.level		= -1,
> > > +		.pg_level	= &st->level[0],
> > > +		.ptdump.range	= &st->range[0],
> > > +		.start_address	= 0,
> > > +	};
> > > +
> > > +	return st;
> > > +}
> > > +
> > > +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
> > > +{
> > > +	int ret;
> > > +	struct kvm_ptdump_guest_state *st = m->private;
> > > +	struct kvm *kvm = st->kvm;
> > > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > > +	struct ptdump_pg_state *parser_state = &st->parser_state;
> > > +	struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
> > > +		.cb	= kvm_ptdump_visitor,
> > > +		.arg	= parser_state,
> > > +		.flags	= KVM_PGTABLE_WALK_LEAF,
> > > +	};
> > > +
> > > +	parser_state->seq = m;
> > > +
> > > +	write_lock(&kvm->mmu_lock);
> > > +	ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
> > > +	write_unlock(&kvm->mmu_lock);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> > > +{
> > > +	struct kvm *kvm = m->i_private;
> > > +	struct kvm_ptdump_guest_state *st;
> > > +	int ret;
> > > +
> > > +	if (!kvm_get_kvm_safe(kvm))
> > > +		return -ENOENT;
> > > +
> > > +	st = kvm_ptdump_parser_create(kvm);
> > > +	if (IS_ERR(st)) {
> > > +		ret = PTR_ERR(st);
> > > +		goto free_with_kvm_ref;
> > > +	}
> > > +
> > > +	ret = single_open(file, kvm_ptdump_guest_show, st);
> > > +	if (!ret)
> > > +		return 0;
> > > +
> > > +	kfree(st);
> > > +free_with_kvm_ref:
> > 
> > nit: I believe kfree understands IS_ERR() so you could have a simple "err:"
> > label covering all the error path.
> 
> > I couldn't find such handling in kfree(). Could you point me to it?
> 
> > 
> > > +	kvm_put_kvm(kvm);
> > > +	return ret;
> > > +}
> > > +
> > > +static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
> > > +{
> > > +	struct kvm *kvm = m->i_private;
> > > +	void *st = ((struct seq_file *)file->private_data)->private;
> > > +
> > > +	kfree(st);
> > > +	kvm_put_kvm(kvm);
> > > +
> > > +	return single_release(m, file);
> > > +}
> > > +
> > > +static const struct file_operations kvm_ptdump_guest_fops = {
> > > +	.open		= kvm_ptdump_guest_open,
> > > +	.read		= seq_read,
> > > +	.llseek		= seq_lseek,
> > > +	.release	= kvm_ptdump_guest_close,
> > > +};
> > > +
> > > +static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
> > > +{
> > > +	const struct file *file = m->file;
> > > +	struct kvm_pgtable *pgtable = m->private;
> > > +
> > > +	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
> 
> I really dislike this sort of construct, and I'd rather we pick the
> correct callback by construction rather than relying on a string
> comparison. See below for a suggestion.
> 

Thanks for the feedback and for the suggestion. Let me try to re-write
this as indicated.

> > > +		seq_printf(m, "%2u\n", pgtable->ia_bits);
> > > +	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
> > > +		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
> > 
> > nit: KVM_PGTABLE_MAX_LEVELS - pgtable->start_level ?
> > 
> > > +	return 0;
> > > +}
> > > +
> > > +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
> > > +{
> > > +	struct kvm *kvm = m->i_private;
> > > +	struct kvm_pgtable *pgtable;
> > > +	int ret;
> > > +
> > > +	if (!kvm_get_kvm_safe(kvm))
> > > +		return -ENOENT;
> > > +
> > > +	pgtable = kvm->arch.mmu.pgt;
> > > +
> > > +	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
> > > +	if (ret < 0)
> > > +		kvm_put_kvm(kvm);
> > > +	return ret;
> > > +}
> > > +
> > > +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
> > > +{
> > > +	struct kvm *kvm = m->i_private;
> > > +
> > > +	kvm_put_kvm(kvm);
> > > +	return single_release(m, file);
> > > +}
> > > +
> > > +static const struct file_operations kvm_pgtable_debugfs_fops = {
> > > +	.open		= kvm_pgtable_debugfs_open,
> > > +	.read		= seq_read,
> > > +	.llseek		= seq_lseek,
> > > +	.release	= kvm_pgtable_debugfs_close,
> > > +};
> > > +
> > > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
> > > +{
> > > +	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
> > > +			    kvm, &kvm_ptdump_guest_fops);
> > > +	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
> > > +			    &kvm_pgtable_debugfs_fops);
> > > +	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
> > > +			    kvm, &kvm_pgtable_debugfs_fops);
> > > +}
> 
> I'd expect something like this instead:
> 
> diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> index e72a928d4445..c11ea355aa51 100644
> --- a/arch/arm64/kvm/ptdump.c
> +++ b/arch/arm64/kvm/ptdump.c
> @@ -192,19 +191,24 @@ static const struct file_operations kvm_ptdump_guest_fops = {
>  	.release	= kvm_ptdump_guest_close,
>  };
>  
> -static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
> +static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
> +{
> +	struct kvm_pgtable *pgtable = m->private;
> +
> +	seq_printf(m, "%2u\n", pgtable->ia_bits);
> +	return 0;
> +}
> +
> +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
>  {
> -	const struct file *file = m->file;
>  	struct kvm_pgtable *pgtable = m->private;
>  
> -	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
> -		seq_printf(m, "%2u\n", pgtable->ia_bits);
> -	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
> -		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
> +	seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
>  	return 0;
>  }
>  
> -static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
> +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
> +				    int (*show)(struct seq_file *, void *))
>  {
>  	struct kvm *kvm = m->i_private;
>  	struct kvm_pgtable *pgtable;
> @@ -215,12 +219,22 @@ static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
>  
>  	pgtable = kvm->arch.mmu.pgt;
>  
> -	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
> +	ret = single_open(file, show, pgtable);
>  	if (ret < 0)
>  		kvm_put_kvm(kvm);
>  	return ret;
>  }
>  
> +static int kvm_pgtable_range_open(struct inode *m, struct file *file)
> +{
> +	return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
> +}
> +
> +static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
> +{
> +	return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
> +}
> +
>  static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
>  {
>  	struct kvm *kvm = m->i_private;
> @@ -229,8 +243,15 @@ static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
>  	return single_release(m, file);
>  }
>  
> -static const struct file_operations kvm_pgtable_debugfs_fops = {
> -	.open		= kvm_pgtable_debugfs_open,
> +static const struct file_operations kvm_pgtable_range_fops = {
> +	.open		= kvm_pgtable_range_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= kvm_pgtable_debugfs_close,
> +};
> +
> +static const struct file_operations kvm_pgtable_levels_fops = {
> +	.open		= kvm_pgtable_levels_open,
>  	.read		= seq_read,
>  	.llseek		= seq_lseek,
>  	.release	= kvm_pgtable_debugfs_close,
> @@ -241,7 +262,7 @@ void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
>  	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
>  			    kvm, &kvm_ptdump_guest_fops);
>  	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
> -			    &kvm_pgtable_debugfs_fops);
> +			    &kvm_pgtable_range_fops);
>  	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
> -			    kvm, &kvm_pgtable_debugfs_fops);
> +			    kvm, &kvm_pgtable_levels_fops);
>  }
> 
> 
> Thanks,
> 
> 	M.
> 

Thanks,
Seb


> -- 
> Without deviation from the norm, progress is not possible.
Sebastian Ene Sept. 2, 2024, 5:31 a.m. UTC | #4
On Fri, Aug 30, 2024 at 11:24:53AM +0100, Vincent Donnefort wrote:
> Hi Seb,
> 
> Thanks for the respin.
> 
> On Tue, Aug 27, 2024 at 08:45:47AM +0000, Sebastian Ene wrote:
> > While arch/*/mem/ptdump handles the kernel pagetable dumping code,
> > introduce KVM/ptdump to show the guest stage-2 pagetables. The
> > separation is necessary because most of the definitions from the
> > stage-2 pagetable reside in the KVM path and we will be invoking
> > functionality specific to KVM.
> > 
> > When a guest is created, register a new file entry under the guest
> > debugfs dir which allows userspace to show the contents of the guest
> > stage-2 pagetables when accessed.
> > 
> > Signed-off-by: Sebastian Ene <sebastianene@google.com>
> 
> I only have some nits, otherwise:

Hello Vincent,

> 
> Reviewed-by: Vincent Donnefort <vdonnefort@google.com>
> 

Thanks for giving me consistent feedback on the series. I will
incorporate your latest suggestions in my patch series and add the tag.

> > ---
> >  arch/arm64/include/asm/kvm_host.h |   6 +
> >  arch/arm64/kvm/Makefile           |   1 +
> >  arch/arm64/kvm/arm.c              |   1 +
> >  arch/arm64/kvm/ptdump.c           | 247 ++++++++++++++++++++++++++++++
> >  4 files changed, 255 insertions(+)
> >  create mode 100644 arch/arm64/kvm/ptdump.c
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index a33f5996ca9f..4acd589f086b 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -1473,4 +1473,10 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
> >  		(pa + pi + pa3) == 1;					\
> >  	})
> >  
> > +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
> > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
> > +#else
> > +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
> > +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
> > +
> >  #endif /* __ARM64_KVM_HOST_H__ */
> > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > index 86a629aaf0a1..e4233b323a73 100644
> > --- a/arch/arm64/kvm/Makefile
> > +++ b/arch/arm64/kvm/Makefile
> > @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
> >  
> >  kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
> >  kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
> > +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
> >  
> >  always-y := hyp_constants.h hyp-constants.s
> >  
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 9bef7638342e..b9fd928d3477 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -228,6 +228,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> >  void kvm_arch_create_vm_debugfs(struct kvm *kvm)
> >  {
> >  	kvm_sys_regs_create_debugfs(kvm);
> > +	kvm_s2_ptdump_create_debugfs(kvm);
> >  }
> >  
> >  static void kvm_destroy_mpidr_data(struct kvm *kvm)
> > diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> > new file mode 100644
> > index 000000000000..e72a928d4445
> > --- /dev/null
> > +++ b/arch/arm64/kvm/ptdump.c
> > @@ -0,0 +1,247 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Debug helper used to dump the stage-2 pagetables of the system and their
> > + * associated permissions.
> > + *
> > + * Copyright (C) Google, 2024
> > + * Author: Sebastian Ene <sebastianene@google.com>
> > + */
> > +#include <linux/debugfs.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/seq_file.h>
> > +
> > +#include <asm/kvm_pgtable.h>
> > +#include <asm/kvm_host.h>
> 
> nit: I believe you wanted to follow the alphabetical order, if that is the case,
> kvm_host.h then kvm_pgtable.h
> 
> > +#include <asm/ptdump.h>
> > +
> > +
> 
> nit: I don't think double empty lines are a rule, I would remove the extra one.
> 

Ack.

> > +#define MARKERS_LEN		(2)
> 
> nit: The brackets are not necessary for MARKERS_LEN.
> 
> > +#define KVM_PGTABLE_MAX_LEVELS	(KVM_PGTABLE_LAST_LEVEL + 1)
> > +
> > +struct kvm_ptdump_guest_state {
> > +	struct kvm		*kvm;
> > +	struct ptdump_pg_state	parser_state;
> > +	struct addr_marker	ipa_marker[MARKERS_LEN];
> > +	struct ptdump_pg_level	level[KVM_PGTABLE_MAX_LEVELS];
> > +	struct ptdump_range	range[MARKERS_LEN];
> > +};
> > +
> > +static const struct ptdump_prot_bits stage2_pte_bits[] = {
> > +	{
> > +		.mask	= PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= " ",
> > +		.clear	= "F",
> 
> This is effectively never used because an invalid PTE is 0 and note_page() won't
> print it. This probably can be removed?
> 

Please see my previous reply to this. I would keep it around as it
should print out non-zero invalid PTEs.

> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.set	= "R",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.set	= "W",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= " ",
> > +		.clear	= "X",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.set	= "AF",
> > +		.clear	= "  ",
> > +	}, {
> > +		.mask	= PTE_TABLE_BIT | PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= "BLK",
> > +		.clear	= "   ",
> > +	},
> > +};
> > +
> > +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
> > +			      enum kvm_pgtable_walk_flags visit)
> > +{
> > +	struct ptdump_pg_state *st = ctx->arg;
> > +	struct ptdump_state *pt_st = &st->ptdump;
> > +
> > +	note_page(pt_st, ctx->addr, ctx->level, ctx->old);
> > +
> > +	return 0;
> > +}
> > +
> > +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
> > +{
> > +	u32 i;
> > +	u64 mask;
> > +
> > +	if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
> > +		return -EINVAL;
> > +
> > +	mask = 0;
> > +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> > +		mask |= stage2_pte_bits[i].mask;
> > +
> > +	for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
> > +		snprintf(level[i].name, sizeof(level[i].name), "%d", i);
> 
> %u, i being unsigned.

Ack.

> 
> > +
> > +		level[i].num	= ARRAY_SIZE(stage2_pte_bits);
> > +		level[i].bits	= stage2_pte_bits;
> > +		level[i].mask	= mask;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
> > +{
> > +	struct kvm_ptdump_guest_state *st;
> > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > +	struct kvm_pgtable *pgtable = mmu->pgt;
> > +	int ret;
> > +
> > +	st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
> > +	if (!st)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
> > +	if (ret) {
> > +		kfree(st);
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	st->ipa_marker[0].name		= "Guest IPA";
> > +	st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
> > +	st->range[0].end		= BIT(pgtable->ia_bits);
> > +
> > +	st->kvm				= kvm;
> > +	st->parser_state = (struct ptdump_pg_state) {
> > +		.marker		= &st->ipa_marker[0],
> > +		.level		= -1,
> > +		.pg_level	= &st->level[0],
> > +		.ptdump.range	= &st->range[0],
> > +		.start_address	= 0,
> > +	};
> > +
> > +	return st;
> > +}
> > +
> > +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
> > +{
> > +	int ret;
> > +	struct kvm_ptdump_guest_state *st = m->private;
> > +	struct kvm *kvm = st->kvm;
> > +	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
> > +	struct ptdump_pg_state *parser_state = &st->parser_state;
> > +	struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
> > +		.cb	= kvm_ptdump_visitor,
> > +		.arg	= parser_state,
> > +		.flags	= KVM_PGTABLE_WALK_LEAF,
> > +	};
> > +
> > +	parser_state->seq = m;
> > +
> > +	write_lock(&kvm->mmu_lock);
> > +	ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
> > +	write_unlock(&kvm->mmu_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	struct kvm_ptdump_guest_state *st;
> > +	int ret;
> > +
> > +	if (!kvm_get_kvm_safe(kvm))
> > +		return -ENOENT;
> > +
> > +	st = kvm_ptdump_parser_create(kvm);
> > +	if (IS_ERR(st)) {
> > +		ret = PTR_ERR(st);
> > +		goto free_with_kvm_ref;
> > +	}
> > +
> > +	ret = single_open(file, kvm_ptdump_guest_show, st);
> > +	if (!ret)
> > +		return 0;
> > +
> > +	kfree(st);
> > +free_with_kvm_ref:
> 
> nit: I believe kfree understands IS_ERR() so you could have a simple "err:"
> label covering all the error path.
> 
> > +	kvm_put_kvm(kvm);
> > +	return ret;
> > +}
> > +
> > +static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	void *st = ((struct seq_file *)file->private_data)->private;
> > +
> > +	kfree(st);
> > +	kvm_put_kvm(kvm);
> > +
> > +	return single_release(m, file);
> > +}
> > +
> > +static const struct file_operations kvm_ptdump_guest_fops = {
> > +	.open		= kvm_ptdump_guest_open,
> > +	.read		= seq_read,
> > +	.llseek		= seq_lseek,
> > +	.release	= kvm_ptdump_guest_close,
> > +};
> > +
> > +static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
> > +{
> > +	const struct file *file = m->file;
> > +	struct kvm_pgtable *pgtable = m->private;
> > +
> > +	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
> > +		seq_printf(m, "%2u\n", pgtable->ia_bits);
> > +	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
> > +		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);
> 
> nit: KVM_PGTABLE_MAX_LEVELS - pgtable->start_level ?

Yes, we can use this one.

> 
> > +	return 0;
> > +}
> > +
> > +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +	struct kvm_pgtable *pgtable;
> > +	int ret;
> > +
> > +	if (!kvm_get_kvm_safe(kvm))
> > +		return -ENOENT;
> > +
> > +	pgtable = kvm->arch.mmu.pgt;
> > +
> > +	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
> > +	if (ret < 0)
> > +		kvm_put_kvm(kvm);
> > +	return ret;
> > +}
> > +
> > +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
> > +{
> > +	struct kvm *kvm = m->i_private;
> > +
> > +	kvm_put_kvm(kvm);
> > +	return single_release(m, file);
> > +}
> > +
> > +static const struct file_operations kvm_pgtable_debugfs_fops = {
> > +	.open		= kvm_pgtable_debugfs_open,
> > +	.read		= seq_read,
> > +	.llseek		= seq_lseek,
> > +	.release	= kvm_pgtable_debugfs_close,
> > +};
> > +
> > +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
> > +{
> > +	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
> > +			    kvm, &kvm_ptdump_guest_fops);
> > +	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
> > +			    &kvm_pgtable_debugfs_fops);
> > +	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
> > +			    kvm, &kvm_pgtable_debugfs_fops);
> > +}
> > -- 
> > 2.46.0.295.g3b9ea8a38a-goog
> >
Vincent Donnefort Sept. 2, 2024, 11:13 a.m. UTC | #5
[...]

> > > +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> > > +{
> > > +	struct kvm *kvm = m->i_private;
> > > +	struct kvm_ptdump_guest_state *st;
> > > +	int ret;
> > > +
> > > +	if (!kvm_get_kvm_safe(kvm))
> > > +		return -ENOENT;
> > > +
> > > +	st = kvm_ptdump_parser_create(kvm);
> > > +	if (IS_ERR(st)) {
> > > +		ret = PTR_ERR(st);
> > > +		goto free_with_kvm_ref;
> > > +	}
> > > +
> > > +	ret = single_open(file, kvm_ptdump_guest_show, st);
> > > +	if (!ret)
> > > +		return 0;
> > > +
> > > +	kfree(st);
> > > +free_with_kvm_ref:
> > 
> > nit: I believe kfree understands IS_ERR() so you could have a simple "err:"
> > label covering all the error path.
> 
> I couldn't find such handling in kfree(). Could you point me to it?

My apologies, I was confused by the DEFINE_FREE(kfree ...) for __free(). kfree()
only checks for null ptr.

Although, I wonder if the naming "free_with_kvm_ref" isn't an artifact from
previous code? Nothing is freed here. So perhaps err_with_kvm_ref? which could
be shortened as this is the only label?

[...]
Sebastian Ene Sept. 2, 2024, 1:45 p.m. UTC | #6
On Mon, Sep 02, 2024 at 12:13:12PM +0100, Vincent Donnefort wrote:
> [...]
> 
> > > > +static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
> > > > +{
> > > > +	struct kvm *kvm = m->i_private;
> > > > +	struct kvm_ptdump_guest_state *st;
> > > > +	int ret;
> > > > +
> > > > +	if (!kvm_get_kvm_safe(kvm))
> > > > +		return -ENOENT;
> > > > +
> > > > +	st = kvm_ptdump_parser_create(kvm);
> > > > +	if (IS_ERR(st)) {
> > > > +		ret = PTR_ERR(st);
> > > > +		goto free_with_kvm_ref;
> > > > +	}
> > > > +
> > > > +	ret = single_open(file, kvm_ptdump_guest_show, st);
> > > > +	if (!ret)
> > > > +		return 0;
> > > > +
> > > > +	kfree(st);
> > > > +free_with_kvm_ref:
> > > 
> > > nit: I believe kfree understands IS_ERR() so you could have a simple "err:"
> > > label covering all the error path.
> > 
> > I couldn't find such handling in kfree(). Could you point me to it?
> 
> My apologies, I was confused by the DEFINE_FREE(kfree ...) for __free(). kfree()
> only checks for null ptr.
> 
> Although, I wonder if the naming "free_with_kvm_ref" isn't an artifact from
> previous code? Nothing is freed here. So perhaps err_with_kvm_ref? which could
> be shortened as this is the only label?
> 
> [...]

Yes, I guess that works better. Thanks for checking,

Seb
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a33f5996ca9f..4acd589f086b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1473,4 +1473,10 @@  void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
 		(pa + pi + pa3) == 1;					\
 	})
 
+#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
+#else
+static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
+#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 86a629aaf0a1..e4233b323a73 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -27,6 +27,7 @@  kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
 
 kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
 kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
+kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
 
 always-y := hyp_constants.h hyp-constants.s
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9bef7638342e..b9fd928d3477 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -228,6 +228,7 @@  vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 void kvm_arch_create_vm_debugfs(struct kvm *kvm)
 {
 	kvm_sys_regs_create_debugfs(kvm);
+	kvm_s2_ptdump_create_debugfs(kvm);
 }
 
 static void kvm_destroy_mpidr_data(struct kvm *kvm)
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
new file mode 100644
index 000000000000..e72a928d4445
--- /dev/null
+++ b/arch/arm64/kvm/ptdump.c
@@ -0,0 +1,247 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Debug helper used to dump the stage-2 pagetables of the system and their
+ * associated permissions.
+ *
+ * Copyright (C) Google, 2024
+ * Author: Sebastian Ene <sebastianene@google.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+
+#include <asm/kvm_pgtable.h>
+#include <asm/kvm_host.h>
+#include <asm/ptdump.h>
+
+
+#define MARKERS_LEN		(2)	/* number of entries in ipa_marker[] and range[] */
+#define KVM_PGTABLE_MAX_LEVELS	(KVM_PGTABLE_LAST_LEVEL + 1)	/* number of entries in level[] */
+
+struct kvm_ptdump_guest_state {	/* per-open state of the "stage2_page_tables" file */
+	struct kvm		*kvm;	/* VM being dumped; a reference is held from open until release */
+	struct ptdump_pg_state	parser_state;	/* passed to the page-table walker as its callback argument */
+	struct addr_marker	ipa_marker[MARKERS_LEN];	/* [0] is named "Guest IPA"; [1].start_address is BIT(ia_bits) */
+	struct ptdump_pg_level	level[KVM_PGTABLE_MAX_LEVELS];	/* filled in by kvm_ptdump_build_levels() */
+	struct ptdump_range	range[MARKERS_LEN];	/* only range[0].end is set; everything else stays zero from kzalloc() */
+};
+
+static const struct ptdump_prot_bits stage2_pte_bits[] = {	/* stage-2 attribute descriptors; presumably .set is printed when (pte & .mask) == .val and .clear otherwise - TODO confirm against the ptdump core */
+	{
+		.mask	= PTE_VALID,
+		.val	= PTE_VALID,
+		.set	= " ",
+		.clear	= "F",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+		.set	= "R",
+		.clear	= " ",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+		.set	= "W",
+		.clear	= " ",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
+		.val	= PTE_VALID,	/* matches only a valid entry whose XN bit is clear */
+		.set	= " ",
+		.clear	= "X",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+		.set	= "AF",
+		.clear	= "  ",
+	}, {
+		.mask	= PTE_TABLE_BIT | PTE_VALID,
+		.val	= PTE_VALID,	/* matches only a valid entry whose table bit is clear */
+		.set	= "BLK",
+		.clear	= "   ",
+	},
+};
+
+static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+			      enum kvm_pgtable_walk_flags visit)	/* 'visit' is not used */
+{	/* Walker callback: hand each visited entry to note_page() */
+	struct ptdump_pg_state *st = ctx->arg;	/* the walker's .arg, i.e. the parser state */
+	struct ptdump_state *pt_st = &st->ptdump;
+
+	note_page(pt_st, ctx->addr, ctx->level, ctx->old);	/* forwards the address, level and ctx->old of this entry */
+
+	return 0;	/* always 0 */
+}
+
+static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
+{	/* Describe level[start_lvl] .. level[KVM_PGTABLE_LAST_LEVEL]; returns 0, or -EINVAL for a bad start_lvl */
+	u32 i;
+	u64 mask;	/* OR of every stage2_pte_bits[].mask */
+
+	if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
+		return -EINVAL;
+
+	mask = 0;
+	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
+		mask |= stage2_pte_bits[i].mask;
+
+	for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {	/* entries below start_lvl are left untouched */
+		snprintf(level[i].name, sizeof(level[i].name), "%u", i);	/* name is the level number; i is u32, hence %u */
+
+		level[i].num	= ARRAY_SIZE(stage2_pte_bits);
+		level[i].bits	= stage2_pte_bits;
+		level[i].mask	= mask;
+	}
+
+	return 0;
+}
+
+static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
+{	/* Allocate and initialise the dump state for @kvm; returns ERR_PTR() on failure, callers kfree() the result */
+	struct kvm_ptdump_guest_state *st;
+	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+	struct kvm_pgtable *pgtable = mmu->pgt;
+	int ret;
+
+	st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);	/* zeroed: fields not set below stay 0 */
+	if (!st)
+		return ERR_PTR(-ENOMEM);
+
+	ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
+	if (ret) {
+		kfree(st);
+		return ERR_PTR(ret);
+	}
+
+	st->ipa_marker[0].name		= "Guest IPA";
+	st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
+	st->range[0].end		= BIT(pgtable->ia_bits);	/* range[0].start stays 0 */
+
+	st->kvm				= kvm;
+	st->parser_state = (struct ptdump_pg_state) {
+		.marker		= &st->ipa_marker[0],
+		.level		= -1,	/* presumably "no level seen yet" - confirm against note_page() */
+		.pg_level	= &st->level[0],
+		.ptdump.range	= &st->range[0],
+		.start_address	= 0,
+	};
+
+	return st;
+}
+
+static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
+{	/* seq_file show callback: walk IPA 0 .. BIT(ia_bits) with kvm_ptdump_visitor() as the callback */
+	int ret;
+	struct kvm_ptdump_guest_state *st = m->private;	/* state created in kvm_ptdump_guest_open() */
+	struct kvm *kvm = st->kvm;
+	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+	struct ptdump_pg_state *parser_state = &st->parser_state;
+	struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
+		.cb	= kvm_ptdump_visitor,
+		.arg	= parser_state,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+	};
+
+	parser_state->seq = m;	/* record the seq_file in the parser state before walking */
+
+	write_lock(&kvm->mmu_lock);	/* the walk runs with mmu_lock held for write */
+	ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
+	write_unlock(&kvm->mmu_lock);
+
+	return ret;	/* result of kvm_pgtable_walk() */
+}
+
+static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
+{	/* open(): take a kvm reference, build the dump state and attach it to the seq_file; 0 or negative errno */
+	struct kvm *kvm = m->i_private;	/* set when the debugfs file was created */
+	struct kvm_ptdump_guest_state *st;
+	int ret;
+
+	if (!kvm_get_kvm_safe(kvm))	/* no reference could be taken */
+		return -ENOENT;
+
+	st = kvm_ptdump_parser_create(kvm);
+	if (IS_ERR(st)) {
+		ret = PTR_ERR(st);
+		goto err_with_kvm_ref;	/* nothing was allocated, only the reference needs dropping */
+	}
+
+	ret = single_open(file, kvm_ptdump_guest_show, st);
+	if (!ret)
+		return 0;	/* success: st and the reference are released in kvm_ptdump_guest_close() */
+
+	kfree(st);
+err_with_kvm_ref:
+	kvm_put_kvm(kvm);
+	return ret;
+}
+
+static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
+{	/* release(): undo kvm_ptdump_guest_open() */
+	struct kvm *kvm = m->i_private;
+	void *st = ((struct seq_file *)file->private_data)->private;	/* the state that open handed to single_open() */
+
+	kfree(st);
+	kvm_put_kvm(kvm);	/* drops the reference taken in open */
+
+	return single_release(m, file);
+}
+
+static const struct file_operations kvm_ptdump_guest_fops = {	/* file operations of "stage2_page_tables" */
+	.open		= kvm_ptdump_guest_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= kvm_ptdump_guest_close,
+};
+
+static int kvm_pgtable_debugfs_show(struct seq_file *m, void *unused)
+{	/* Shared show callback: the name of the opened file selects which value is printed */
+	const struct file *file = m->file;
+	struct kvm_pgtable *pgtable = m->private;	/* the pgtable pointer that open handed to single_open() */
+
+	if (!strcmp(file_dentry(file)->d_iname, "ipa_range"))
+		seq_printf(m, "%2u\n", pgtable->ia_bits);
+	else if (!strcmp(file_dentry(file)->d_iname, "stage2_levels"))
+		seq_printf(m, "%1d\n", KVM_PGTABLE_LAST_LEVEL - pgtable->start_level + 1);	/* count of levels start_level..LAST_LEVEL inclusive */
+	return 0;	/* any other file name prints nothing */
+}
+
+static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file)
+{	/* open(): take a kvm reference and bind the VM's stage-2 pgtable to the seq_file; 0 or negative errno */
+	struct kvm *kvm = m->i_private;
+	struct kvm_pgtable *pgtable;
+	int ret;
+
+	if (!kvm_get_kvm_safe(kvm))	/* no reference could be taken */
+		return -ENOENT;
+
+	pgtable = kvm->arch.mmu.pgt;
+
+	ret = single_open(file, kvm_pgtable_debugfs_show, pgtable);
+	if (ret < 0)
+		kvm_put_kvm(kvm);	/* open failed, so release will not run: drop the reference here */
+	return ret;
+}
+
+static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
+{	/* release(): undo kvm_pgtable_debugfs_open() */
+	struct kvm *kvm = m->i_private;
+
+	kvm_put_kvm(kvm);	/* drops the reference taken in open */
+	return single_release(m, file);
+}
+
+static const struct file_operations kvm_pgtable_debugfs_fops = {	/* shared by "ipa_range" and "stage2_levels" */
+	.open		= kvm_pgtable_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= kvm_pgtable_debugfs_close,
+};
+
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
+{	/* Create three mode-0400 files under kvm->debugfs_dentry, each carrying @kvm as its private data */
+	debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
+			    kvm, &kvm_ptdump_guest_fops);
+	debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
+			    &kvm_pgtable_debugfs_fops);
+	debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
+			    kvm, &kvm_pgtable_debugfs_fops);	/* same fops as "ipa_range"; the show callback tells them apart by name */
+}