
[v2,18/33] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization

Message ID 1538127963-15645-19-git-send-email-paulus@ozlabs.org (mailing list archive)
State New, archived
Series KVM: PPC: Book3S HV: Nested HV virtualization

Commit Message

Paul Mackerras Sept. 28, 2018, 9:45 a.m. UTC
This starts the process of adding the code to support nested HV-style
virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
a nested hypervisor can use to set the base address and size of a
partition table in its memory (analogous to the PTCR register).
On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
hypercall from the guest is handled by code that saves the virtual
PTCR value for the guest.

This also adds code for creating and destroying nested guests and for
reading the partition table entry for a nested guest from L1 memory.
Each nested guest has its own shadow LPID value, different in general
from the LPID value used by the nested hypervisor to refer to it.  The
shadow LPID value is allocated at nested guest creation time.

Nested hypervisor functionality is only available for a radix guest,
which therefore means a radix host on a POWER9 (or later) processor.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/hvcall.h         |   5 +
 arch/powerpc/include/asm/kvm_book3s.h     |  10 +-
 arch/powerpc/include/asm/kvm_book3s_64.h  |  33 ++++
 arch/powerpc/include/asm/kvm_book3s_asm.h |   3 +
 arch/powerpc/include/asm/kvm_host.h       |   5 +
 arch/powerpc/kvm/Makefile                 |   3 +-
 arch/powerpc/kvm/book3s_hv.c              |  26 ++-
 arch/powerpc/kvm/book3s_hv_nested.c       | 283 ++++++++++++++++++++++++++++++
 8 files changed, 361 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_nested.c

Comments

David Gibson Oct. 2, 2018, 6:01 a.m. UTC | #1
On Fri, Sep 28, 2018 at 07:45:48PM +1000, Paul Mackerras wrote:
> This starts the process of adding the code to support nested HV-style
> virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
> a nested hypervisor can use to set the base address and size of a
> partition table in its memory (analogous to the PTCR register).
> On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
> hypercall from the guest is handled by code that saves the virtual
> PTCR value for the guest.
> 
> This also adds code for creating and destroying nested guests and for
> reading the partition table entry for a nested guest from L1 memory.
> Each nested guest has its own shadow LPID value, different in general
> from the LPID value used by the nested hypervisor to refer to it.  The
> shadow LPID value is allocated at nested guest creation time.
> 
> Nested hypervisor functionality is only available for a radix guest,
> which therefore means a radix host on a POWER9 (or later) processor.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

I've made a number of comments below, but they're all pretty minor
things.  They might be worth including if we have to respin for
whatever reason, or as follow-up improvements, but I don't think we
need to hold this up for them.


[snip]
> @@ -287,6 +288,7 @@ struct kvm_arch {
>  	u8 radix;
>  	u8 fwnmi_enabled;
>  	bool threads_indep;
> +	bool nested_enable;
>  	pgd_t *pgtable;
>  	u64 process_table;
>  	struct dentry *debugfs_dir;
> @@ -312,6 +314,9 @@ struct kvm_arch {
>  #endif
>  	struct kvmppc_ops *kvm_ops;
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	u64 l1_ptcr;
> +	int max_nested_lpid;
> +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];

This array could be quite large.  As a followup, would it be worth
dynamically allocating it, so it can be skipped for L1s with no
nesting allowed, and/or dynamically resized as the L1 adds/removes L2s?
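
As a rough sketch of the sort of thing I mean (untested, and the names
here are made up): replace the fixed array with an idr keyed on the L1
lpid, so nothing is allocated until an L2 is actually created:

/* in struct kvm_arch, replacing nested_guests[] and max_nested_lpid */
struct idr kvm_nested_guest_idr;

/* kvmhv_vm_nested_init() would do idr_init(&kvm->arch.kvm_nested_guest_idr) */

/* lookup; caller holds kvm->mmu_lock */
static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid)
{
	return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid);
}

/* reserve the slot for lpid before building the new guest state */
static bool __prealloc_nested(struct kvm *kvm, int lpid)
{
	return idr_alloc(&kvm->arch.kvm_nested_guest_idr,
			 NULL, lpid, lpid + 1, GFP_KERNEL) == lpid;
}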

>  	/* This array can grow quite large, keep it at the end */
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
>  #endif

[snip]
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> new file mode 100644
> index 0000000..5341052
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -0,0 +1,283 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright IBM Corporation, 2018
> + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> + *	   Paul Mackerras <paulus@ozlabs.org>
> + *
> + * Description: KVM functions specific to running nested KVM-HV guests
> + * on Book3S processors (specifically POWER9 and later).
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/kvm_ppc.h>
> +#include <asm/mmu.h>
> +#include <asm/pgtable.h>
> +#include <asm/pgalloc.h>
> +
> +static struct patb_entry *pseries_partition_tb;
> +
> +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> +
> +/* Only called when we're not in hypervisor mode */

This comment isn't strictly accurate; the function is still called in
hypervisor mode, it just exits trivially.

> +bool kvmhv_nested_init(void)
> +{
> +	long int ptb_order;
> +	unsigned long ptcr;
> +	long rc;
> +
> +	if (!kvmhv_on_pseries())
> +		return true;
> +	if (!radix_enabled())
> +		return false;
> +
> +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> +	if (ptb_order < 8)
> +		ptb_order = 8;
> +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> +				       GFP_KERNEL);
> +	if (!pseries_partition_tb) {
> +		pr_err("kvm-hv: failed to allocate nested partition table\n");
> +		return false;

Since this can fail in several different ways, it seems like returning
an errno rather than a bool would make sense.
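
Something like this, say (untested sketch; kvmppc_book3s_init_hv()
would then propagate a non-zero return instead of checking for false):

int kvmhv_nested_init(void)
{
	long int ptb_order;
	unsigned long ptcr;
	long rc;

	if (!kvmhv_on_pseries())
		return 0;
	if (!radix_enabled())
		return -ENODEV;

	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
	if (ptb_order < 8)
		ptb_order = 8;
	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
				       GFP_KERNEL);
	if (!pseries_partition_tb) {
		pr_err("kvm-hv: failed to allocate nested partition table\n");
		return -ENOMEM;
	}

	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
	if (rc != H_SUCCESS) {
		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
		       rc);
		kfree(pseries_partition_tb);
		pseries_partition_tb = NULL;
		return -ENODEV;
	}

	return 0;
}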

> +	}
> +
> +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> +	if (rc != H_SUCCESS) {
> +		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
> +		       rc);
> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +void kvmhv_nested_exit(void)
> +{
> +	if (kvmhv_on_pseries() && pseries_partition_tb) {

First clause is redundant there, isn't it, since pseries_partition_tb
can only be set if we're on pseries?

> +		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +	}
> +}
> +
> +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE)) {
> +		mmu_partition_table_set_entry(lpid, dw0, dw1);
> +	} else {
> +		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
> +		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
> +		/* this will be emulated, L0 will do the necessary barriers */
> +		asm volatile(PPC_TLBIE_5(%0, %1, 2, 0, 1) : :
> +			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));

I think in this version you were using a paravirt TLB flush, instead
of emulation?

> +	}
> +}
> +
> +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
> +{
> +	unsigned long dw0;
> +
> +	dw0 = PATB_HR | radix__get_tree_size() |
> +		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
> +}
> +
> +void kvmhv_vm_nested_init(struct kvm *kvm)
> +{
> +	kvm->arch.max_nested_lpid = -1;
> +}
> +
> +/*
> + * Handle the H_SET_PARTITION_TABLE hcall.
> + * r4 = guest real address of partition table + log_2(size) - 12
> + * (formatted as for the PTCR).
> + */
> +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
> +
> +	kvm->arch.l1_ptcr = ptcr;

I don't think it's actually dangerous, since we validate the L1
addresses when we read from the table, but for debugging the guest it
would probably be better to fail the hcall if the PTCR doesn't make
sense (out-of-bounds order, or not within the L1 memory size).
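
Something like this, perhaps (untested; PRTS is log_2(size) - 12, so
anything above 12 - 8 means more than 4096 entries, which is all the
hardware supports, and kvm_is_visible_gfn() checks that the base
address falls within L1 memory):

long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
	int srcu_idx;
	long ret = H_SUCCESS;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	/* limit the table to 4096 entries and check the base address */
	if ((ptcr & PRTS_MASK) > 12 - 8 ||
	    !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
		ret = H_PARAMETER;
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	if (ret == H_SUCCESS)
		kvm->arch.l1_ptcr = ptcr;
	return ret;
}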

> +	return H_SUCCESS;
> +}

[snip]
> +/*
> + * Free up any resources allocated for a nested guest.
> + */
> +static void kvmhv_release_nested(struct kvm_nested_guest *gp)
> +{
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
> +	kvmppc_free_lpid(gp->shadow_lpid);
> +	if (gp->shadow_pgtable)
> +		pgd_free(gp->l1_host->mm, gp->shadow_pgtable);
> +	kfree(gp);
> +}
> +
> +static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->l1_host;
> +	int lpid = gp->l1_lpid;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	if (gp == kvm->arch.nested_guests[lpid]) {

This is to protect against a race with another remove, yes?  Since kvm
and lpid are read before you take the lock.  Is that right?

> +		kvm->arch.nested_guests[lpid] = NULL;
> +		if (lpid == kvm->arch.max_nested_lpid) {
> +			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
> +				;
> +			kvm->arch.max_nested_lpid = lpid;
> +		}
> +		--gp->refcnt;
> +	}
> +	ref = gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}

[snip]
> +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
> +					  bool create)
> +{
> +	struct kvm_nested_guest *gp, *newgp;
> +
> +	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
> +	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
> +		return NULL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	gp = kvm->arch.nested_guests[l1_lpid];
> +	if (gp)
> +		++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (gp || !create)
> +		return gp;
> +
> +	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
> +	if (!newgp)
> +		return NULL;
> +	spin_lock(&kvm->mmu_lock);
> +	if (kvm->arch.nested_guests[l1_lpid]) {
> +		/* someone else beat us to it */

Should we print a message in this case?  It's no skin off the host's
nose, but wouldn't this mean the guest is concurrently trying to start
two guests with the same lpid, which seems like a dubious thing for it
to be doing?

> +		gp = kvm->arch.nested_guests[l1_lpid];
> +	} else {
> +		kvm->arch.nested_guests[l1_lpid] = newgp;
> +		++newgp->refcnt;
> +		gp = newgp;
> +		newgp = NULL;
> +		if (l1_lpid > kvm->arch.max_nested_lpid)
> +			kvm->arch.max_nested_lpid = l1_lpid;
> +	}
> +	++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (newgp)
> +		kvmhv_release_nested(newgp);
> +
> +	return gp;
> +}
> +
> +void kvmhv_put_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->l1_host;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	ref = --gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}
Paul Mackerras Oct. 2, 2018, 7:48 a.m. UTC | #2
On Tue, Oct 02, 2018 at 04:01:52PM +1000, David Gibson wrote:
> On Fri, Sep 28, 2018 at 07:45:48PM +1000, Paul Mackerras wrote:
> > This starts the process of adding the code to support nested HV-style
> > virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
> > a nested hypervisor can use to set the base address and size of a
> > partition table in its memory (analogous to the PTCR register).
> > On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
> > hypercall from the guest is handled by code that saves the virtual
> > PTCR value for the guest.
> > 
> > This also adds code for creating and destroying nested guests and for
> > reading the partition table entry for a nested guest from L1 memory.
> > Each nested guest has its own shadow LPID value, different in general
> > from the LPID value used by the nested hypervisor to refer to it.  The
> > shadow LPID value is allocated at nested guest creation time.
> > 
> > Nested hypervisor functionality is only available for a radix guest,
> > which therefore means a radix host on a POWER9 (or later) processor.
> > 
> > Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> 
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> 
> I've made a number of comments below, but they're all pretty minor
> things.  They might be worth including if we have to respin for
> whatever reason, or as follow-up improvements, but I don't think we
> need to hold this up for them.

I have some other changes that will mean I'll be sending a v3.

> 
> [snip]
> > @@ -287,6 +288,7 @@ struct kvm_arch {
> >  	u8 radix;
> >  	u8 fwnmi_enabled;
> >  	bool threads_indep;
> > +	bool nested_enable;
> >  	pgd_t *pgtable;
> >  	u64 process_table;
> >  	struct dentry *debugfs_dir;
> > @@ -312,6 +314,9 @@ struct kvm_arch {
> >  #endif
> >  	struct kvmppc_ops *kvm_ops;
> >  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> > +	u64 l1_ptcr;
> > +	int max_nested_lpid;
> > +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
> 
> This array could be quite large.  As a followup, would it be worth
> dynamically allocating it, so it can be skipped for L1s with no
> nesting allowed, and/or dynamically resized as the L1 adds/removes L2s?

True.

> >  	/* This array can grow quite large, keep it at the end */
> >  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> >  #endif
> 
> [snip]
> > diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> > new file mode 100644
> > index 0000000..5341052
> > --- /dev/null
> > +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> > @@ -0,0 +1,283 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright IBM Corporation, 2018
> > + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > + *	   Paul Mackerras <paulus@ozlabs.org>
> > + *
> > + * Description: KVM functions specific to running nested KVM-HV guests
> > + * on Book3S processors (specifically POWER9 and later).
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/kvm_host.h>
> > +
> > +#include <asm/kvm_ppc.h>
> > +#include <asm/mmu.h>
> > +#include <asm/pgtable.h>
> > +#include <asm/pgalloc.h>
> > +
> > +static struct patb_entry *pseries_partition_tb;
> > +
> > +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> > +
> > +/* Only called when we're not in hypervisor mode */
> 
> This comment isn't strictly accurate; the function is still called in
> hypervisor mode, it just exits trivially.

Right, I'll change the comment.

> > +bool kvmhv_nested_init(void)
> > +{
> > +	long int ptb_order;
> > +	unsigned long ptcr;
> > +	long rc;
> > +
> > +	if (!kvmhv_on_pseries())
> > +		return true;
> > +	if (!radix_enabled())
> > +		return false;
> > +
> > +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> > +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> > +	if (ptb_order < 8)
> > +		ptb_order = 8;
> > +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> > +				       GFP_KERNEL);
> > +	if (!pseries_partition_tb) {
> > +		pr_err("kvm-hv: failed to allocate nested partition table\n");
> > +		return false;
> 
> Since this can fail in several different ways, it seems like returning
> an errno rather than a bool would make sense.

OK.

> > +	}
> > +
> > +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> > +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> > +	if (rc != H_SUCCESS) {
> > +		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
> > +		       rc);
> > +		kfree(pseries_partition_tb);
> > +		pseries_partition_tb = NULL;
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +void kvmhv_nested_exit(void)
> > +{
> > +	if (kvmhv_on_pseries() && pseries_partition_tb) {
> 
> First clause is redundant there, isn't it, since pseries_partition_tb
> can only be set if we're on pseries?

It is, but the subtlety here is that we're relying on the compiler
removing the call to plpar_hcall_norets() in configs with
CONFIG_PPC_PSERIES=n (because there is no definition of
plpar_hcall_norets in such configs).  The compiler can tell that
kvmhv_on_pseries() is always false for those configs, but it can't
tell that pseries_partition_tb is always NULL.

> > +		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
> > +		kfree(pseries_partition_tb);
> > +		pseries_partition_tb = NULL;
> > +	}
> > +}
> > +
> > +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
> > +{
> > +	if (cpu_has_feature(CPU_FTR_HVMODE)) {
> > +		mmu_partition_table_set_entry(lpid, dw0, dw1);
> > +	} else {
> > +		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
> > +		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
> > +		/* this will be emulated, L0 will do the necessary barriers */
> > +		asm volatile(PPC_TLBIE_5(%0, %1, 2, 0, 1) : :
> > +			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
> 
> I think in this version you were using a paravirt TLB flush, instead
> of emulation?

This gets converted to a paravirt flush in the last patch, yes.  I
suppose I could introduce that earlier.
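
For reference, the converted helper ends up looking roughly like this
(assuming the H_TLBIE_P1_ENC() helper for encoding the tlbie arguments
that comes with that patch):

static void kvmhv_flush_lpid(unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid(lpid);
		return;
	}

	/* RIC=2 (flush all), PRS=0 (partition scope), R=1 (radix) */
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}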

> > +	}
> > +}
> > +
> > +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
> > +{
> > +	unsigned long dw0;
> > +
> > +	dw0 = PATB_HR | radix__get_tree_size() |
> > +		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
> > +	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
> > +}
> > +
> > +void kvmhv_vm_nested_init(struct kvm *kvm)
> > +{
> > +	kvm->arch.max_nested_lpid = -1;
> > +}
> > +
> > +/*
> > + * Handle the H_SET_PARTITION_TABLE hcall.
> > + * r4 = guest real address of partition table + log_2(size) - 12
> > + * (formatted as for the PTCR).
> > + */
> > +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm *kvm = vcpu->kvm;
> > +	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
> > +
> > +	kvm->arch.l1_ptcr = ptcr;
> 
> I don't think it's actually dangerous, since we validate the L1
> addresses when we read from the table, but for debugging the guest it
> would probably be better to fail the hcall if the PTCR doesn't make
> sense (out-of-bounds order, or not within the L1 memory size).

OK.

> > +	return H_SUCCESS;
> > +}
> 
> [snip]
> > +/*
> > + * Free up any resources allocated for a nested guest.
> > + */
> > +static void kvmhv_release_nested(struct kvm_nested_guest *gp)
> > +{
> > +	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
> > +	kvmppc_free_lpid(gp->shadow_lpid);
> > +	if (gp->shadow_pgtable)
> > +		pgd_free(gp->l1_host->mm, gp->shadow_pgtable);
> > +	kfree(gp);
> > +}
> > +
> > +static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
> > +{
> > +	struct kvm *kvm = gp->l1_host;
> > +	int lpid = gp->l1_lpid;
> > +	long ref;
> > +
> > +	spin_lock(&kvm->mmu_lock);
> > +	if (gp == kvm->arch.nested_guests[lpid]) {
> 
> This is to protect against a race with another remove, yes?  Since kvm
> and lpid are read before you take the lock.  Is that right?

Basically, yes.  The lock is taken and dropped in kvmhv_get_nested()
and another CPU could have done kvmhv_remove_nested() in the
meanwhile.
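
Spelled out, with both CPUs holding their own reference to gp, the
interleaving the check guards against is:

	CPU A				CPU B
	kvmhv_remove_nested(gp)
	  lock mmu_lock
	  gp == nested_guests[lpid],
	  so clear the slot and drop
	  the array's ref (--refcnt)
	  unlock			kvmhv_remove_nested(gp)
					  lock mmu_lock
					  slot is now NULL (or holds a
					  new guest), so gp doesn't match
					  and the array's ref isn't
					  dropped a second time
					  unlock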

> > +		kvm->arch.nested_guests[lpid] = NULL;
> > +		if (lpid == kvm->arch.max_nested_lpid) {
> > +			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
> > +				;
> > +			kvm->arch.max_nested_lpid = lpid;
> > +		}
> > +		--gp->refcnt;
> > +	}
> > +	ref = gp->refcnt;
> > +	spin_unlock(&kvm->mmu_lock);
> > +	if (ref == 0)
> > +		kvmhv_release_nested(gp);
> > +}
> 
> [snip]
> > +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
> > +					  bool create)
> > +{
> > +	struct kvm_nested_guest *gp, *newgp;
> > +
> > +	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
> > +	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
> > +		return NULL;
> > +
> > +	spin_lock(&kvm->mmu_lock);
> > +	gp = kvm->arch.nested_guests[l1_lpid];
> > +	if (gp)
> > +		++gp->refcnt;
> > +	spin_unlock(&kvm->mmu_lock);
> > +
> > +	if (gp || !create)
> > +		return gp;
> > +
> > +	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
> > +	if (!newgp)
> > +		return NULL;
> > +	spin_lock(&kvm->mmu_lock);
> > +	if (kvm->arch.nested_guests[l1_lpid]) {
> > +		/* someone else beat us to it */
> 
> Should we print a message in this case?  It's no skin off the host's
> nose, but wouldn't this mean the guest is concurrently trying to start
> two guests with the same lpid, which seems like a dubious thing for it
> to be doing?

No, it could just be starting two vcpus from the same guest.  This
could happen when the L1 guest has just migrated in and now all of its
vcpus are getting started concurrently.

Paul.

Patch

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index a0b17f9..c95c651 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -322,6 +322,11 @@ 
 #define H_GET_24X7_DATA		0xF07C
 #define H_GET_PERF_COUNTER_INFO	0xF080
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE	0xF800
+#define H_ENTER_NESTED		0xF804
+#define H_TLB_INVALIDATE	0xF808
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
 #define H_SET_MODE_RESOURCE_SET_DAWR		2
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 91c9779..7719ca5 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -274,6 +274,13 @@  static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+bool kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 extern int kvm_irq_bypass;
@@ -387,9 +394,6 @@  extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 #define INS_TW				0x7fe00008
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK			0xff000000
 #define SPLIT_HACK_OFFS			0xfb000000
 
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 5c0e2d9..6d67b6a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -23,6 +23,39 @@ 
 #include <linux/string.h>
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/cpu_has_feature.h>
+
+#ifdef CONFIG_PPC_PSERIES
+static inline bool kvmhv_on_pseries(void)
+{
+	return !cpu_has_feature(CPU_FTR_HVMODE);
+}
+#else
+static inline bool kvmhv_on_pseries(void)
+{
+	return false;
+}
+#endif
+
+/*
+ * Structure for a nested guest, that is, for a guest that is managed by
+ * one of our guests.
+ */
+struct kvm_nested_guest {
+	struct kvm *l1_host;		/* L1 VM that owns this nested guest */
+	int l1_lpid;			/* lpid L1 guest thinks this guest is */
+	int shadow_lpid;		/* real lpid of this nested guest */
+	pgd_t *shadow_pgtable;		/* our page table for this guest */
+	u64 l1_gr_to_hr;		/* L1's addr of part'n-scoped table */
+	u64 process_table;		/* process table entry for this guest */
+	long refcnt;			/* number of pointers to this struct */
+	struct mutex tlb_lock;		/* serialize page faults and tlbies */
+	struct kvm_nested_guest *next;
+};
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+					  bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER	18
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d978fdf..eb3ba63 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,9 @@ 
 #define XICS_MFRR		0xc
 #define XICS_IPI		2	/* interrupt source # for IPIs */
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS		8
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c9cc42f..c35d4f2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,6 +46,7 @@ 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS	KVMPPC_NR_LPIDS
 
 #else
 #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
@@ -287,6 +288,7 @@  struct kvm_arch {
 	u8 radix;
 	u8 fwnmi_enabled;
 	bool threads_indep;
+	bool nested_enable;
 	pgd_t *pgtable;
 	u64 process_table;
 	struct dentry *debugfs_dir;
@@ -312,6 +314,9 @@  struct kvm_arch {
 #endif
 	struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	u64 l1_ptcr;
+	int max_nested_lpid;
+	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
 	/* This array can grow quite large, keep it at the end */
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f872c04..e814f40 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -75,7 +75,8 @@  kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o \
-	book3s_64_mmu_radix.o
+	book3s_64_mmu_radix.o \
+	book3s_hv_nested.o
 
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
 	book3s_hv_tm.o
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b8703be..43c607b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -934,6 +934,19 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		if (ret == H_TOO_HARD)
 			return RESUME_HOST;
 		break;
+
+	case H_SET_PARTITION_TABLE:
+		ret = H_FUNCTION;
+		if (vcpu->kvm->arch.nested_enable)
+			ret = kvmhv_set_partition_table(vcpu);
+		break;
+	case H_ENTER_NESTED:
+		ret = H_FUNCTION;
+		break;
+	case H_TLB_INVALIDATE:
+		ret = H_FUNCTION;
+		break;
+
 	default:
 		return RESUME_HOST;
 	}
@@ -4149,8 +4162,7 @@  void kvmppc_setup_partition_table(struct kvm *kvm)
 			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
 		dw1 = PATB_GR | kvm->arch.process_table;
 	}
-
-	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 /*
@@ -4366,6 +4378,8 @@  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
 	kvmppc_alloc_host_rm_ops();
 
+	kvmhv_vm_nested_init(kvm);
+
 	/*
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
@@ -4509,8 +4523,10 @@  static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
 	/* Perform global invalidation and return lpid to the pool */
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (kvm->arch.nested_enable)
+			kvmhv_release_all_nested(kvm);
 		kvm->arch.process_table = 0;
-		kvmppc_setup_partition_table(kvm);
+		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
 	}
 	kvmppc_free_lpid(kvm->arch.lpid);
 
@@ -4981,6 +4997,9 @@  static int kvmppc_book3s_init_hv(void)
 	if (r < 0)
 		return -ENODEV;
 
+	if (!kvmhv_nested_init())
+		return -ENODEV;
+
 	r = kvm_init_subcore_bitmap();
 	if (r)
 		return r;
@@ -5039,6 +5058,7 @@  static void kvmppc_book3s_exit_hv(void)
 	if (kvmppc_radix_possible())
 		kvmppc_radix_exit();
 	kvmppc_hv_ops = NULL;
+	kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 0000000..5341052
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,283 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *	   Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+
+/* Only called when we're not in hypervisor mode */
+bool kvmhv_nested_init(void)
+{
+	long int ptb_order;
+	unsigned long ptcr;
+	long rc;
+
+	if (!kvmhv_on_pseries())
+		return true;
+	if (!radix_enabled())
+		return false;
+
+	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+	if (ptb_order < 8)
+		ptb_order = 8;
+	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+				       GFP_KERNEL);
+	if (!pseries_partition_tb) {
+		pr_err("kvm-hv: failed to allocate nested partition table\n");
+		return false;
+	}
+
+	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+	if (rc != H_SUCCESS) {
+		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+		       rc);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+		return false;
+	}
+
+	return true;
+}
+
+void kvmhv_nested_exit(void)
+{
+	if (kvmhv_on_pseries() && pseries_partition_tb) {
+		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+	}
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		mmu_partition_table_set_entry(lpid, dw0, dw1);
+	} else {
+		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+		/* this will be emulated, L0 will do the necessary barriers */
+		asm volatile(PPC_TLBIE_5(%0, %1, 2, 0, 1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	}
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+	unsigned long dw0;
+
+	dw0 = PATB_HR | radix__get_tree_size() |
+		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+	kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+
+	kvm->arch.l1_ptcr = ptcr;
+	return H_SUCCESS;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+	int ret;
+	struct patb_entry ptbl_entry;
+	unsigned long ptbl_addr;
+	struct kvm *kvm = gp->l1_host;
+
+	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+	ret = kvm_read_guest(kvm, ptbl_addr,
+			     &ptbl_entry, sizeof(ptbl_entry));
+	if (ret) {
+		gp->l1_gr_to_hr = 0;
+		gp->process_table = 0;
+	} else {
+		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+	}
+	kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+	struct kvm_nested_guest *gp;
+	long shadow_lpid;
+
+	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+	if (!gp)
+		return NULL;
+	gp->l1_host = kvm;
+	gp->l1_lpid = lpid;
+	mutex_init(&gp->tlb_lock);
+	gp->shadow_pgtable = pgd_alloc(kvm->mm);
+	if (!gp->shadow_pgtable)
+		goto out_free;
+	shadow_lpid = kvmppc_alloc_lpid();
+	if (shadow_lpid < 0)
+		goto out_free2;
+	gp->shadow_lpid = shadow_lpid;
+
+	return gp;
+
+ out_free2:
+	pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+	kfree(gp);
+	return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+	kvmppc_free_lpid(gp->shadow_lpid);
+	if (gp->shadow_pgtable)
+		pgd_free(gp->l1_host->mm, gp->shadow_pgtable);
+	kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	int lpid = gp->l1_lpid;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	if (gp == kvm->arch.nested_guests[lpid]) {
+		kvm->arch.nested_guests[lpid] = NULL;
+		if (lpid == kvm->arch.max_nested_lpid) {
+			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+				;
+			kvm->arch.max_nested_lpid = lpid;
+		}
+		--gp->refcnt;
+	}
+	ref = gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+	int i;
+	struct kvm_nested_guest *gp;
+	struct kvm_nested_guest *freelist = NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+		gp = kvm->arch.nested_guests[i];
+		if (!gp)
+			continue;
+		kvm->arch.nested_guests[i] = NULL;
+		if (--gp->refcnt == 0) {
+			gp->next = freelist;
+			freelist = gp;
+		}
+	}
+	kvm->arch.max_nested_lpid = -1;
+	spin_unlock(&kvm->mmu_lock);
+	while ((gp = freelist) != NULL) {
+		freelist = gp->next;
+		kvmhv_release_nested(gp);
+	}
+}
+
+/* caller must hold gp->tlb_lock */
+void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+	kvmhv_update_ptbl_cache(gp);
+	if (gp->l1_gr_to_hr == 0)
+		kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+					  bool create)
+{
+	struct kvm_nested_guest *gp, *newgp;
+
+	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+		return NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	gp = kvm->arch.nested_guests[l1_lpid];
+	if (gp)
+		++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (gp || !create)
+		return gp;
+
+	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+	if (!newgp)
+		return NULL;
+	spin_lock(&kvm->mmu_lock);
+	if (kvm->arch.nested_guests[l1_lpid]) {
+		/* someone else beat us to it */
+		gp = kvm->arch.nested_guests[l1_lpid];
+	} else {
+		kvm->arch.nested_guests[l1_lpid] = newgp;
+		++newgp->refcnt;
+		gp = newgp;
+		newgp = NULL;
+		if (l1_lpid > kvm->arch.max_nested_lpid)
+			kvm->arch.max_nested_lpid = l1_lpid;
+	}
+	++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (newgp)
+		kvmhv_release_nested(newgp);
+
+	return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	ref = --gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}