diff mbox series

[v4,25/32] KVM: PPC: Book3S HV: Invalidate TLB when nested vcpu moves physical cpu

Message ID 1538654169-15602-26-git-send-email-paulus@ozlabs.org (mailing list archive)
State New, archived
Headers show
Series KVM: PPC: Book3S HV: Nested HV virtualization | expand

Commit Message

Paul Mackerras Oct. 4, 2018, 11:56 a.m. UTC
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>

This is only done at level 0, since only level 0 knows which physical
CPU a vcpu is running on.  This does for nested guests what L0 already
did for its own guests, which is to flush the TLB on a pCPU when it
goes to run a vCPU there, and there is another vCPU in the same VM
which previously ran on this pCPU and has now started to run on another
pCPU.  This is to handle the situation where the other vCPU touched
a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
on that new pCPU and thus left behind a stale TLB entry on this pCPU.

This introduces a limit on the vcpu_token values used in the
H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.

[paulus@ozlabs.org - made prev_cpu array be unsigned short[] to reduce
 memory consumption.]

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   3 +
 arch/powerpc/kvm/book3s_hv.c             | 101 +++++++++++++++++++------------
 arch/powerpc/kvm/book3s_hv_nested.c      |   5 ++
 3 files changed, 71 insertions(+), 38 deletions(-)

Comments

David Gibson Oct. 5, 2018, 4:09 a.m. UTC | #1
On Thu, Oct 04, 2018 at 09:56:02PM +1000, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> 
> This is only done at level 0, since only level 0 knows which physical
> CPU a vcpu is running on.  This does for nested guests what L0 already
> did for its own guests, which is to flush the TLB on a pCPU when it
> goes to run a vCPU there, and there is another vCPU in the same VM
> which previously ran on this pCPU and has now started to run on another
> pCPU.  This is to handle the situation where the other vCPU touched
> a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
> on that new pCPU and thus left behind a stale TLB entry on this pCPU.
> 
> This introduces a limit on the the vcpu_token values used in the
> H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.

This does make the vcpu tokens no longer entirely opaque to the L0.
It works for now, because the only L1 is Linux and we know basically
how it allocates those tokens.  Eventually we probably want some way
to either remove this restriction or to advertise the limit to the L1.

> [paulus@ozlabs.org - made prev_cpu array be unsigned short[] to reduce
>  memory consumption.]
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h |   3 +
>  arch/powerpc/kvm/book3s_hv.c             | 101 +++++++++++++++++++------------
>  arch/powerpc/kvm/book3s_hv_nested.c      |   5 ++
>  3 files changed, 71 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index aa5bf85..1e96027 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -52,6 +52,9 @@ struct kvm_nested_guest {
>  	long refcnt;			/* number of pointers to this struct */
>  	struct mutex tlb_lock;		/* serialize page faults and tlbies */
>  	struct kvm_nested_guest *next;
> +	cpumask_t need_tlb_flush;
> +	cpumask_t cpu_in_guest;
> +	unsigned short prev_cpu[NR_CPUS];
>  };
>  
>  /*
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index ba58883..53a967ea 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2397,10 +2397,18 @@ static void kvmppc_release_hwthread(int cpu)
>  
>  static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_nested_guest *nested = vcpu->arch.nested;
> +	cpumask_t *cpu_in_guest;
>  	int i;
>  
>  	cpu = cpu_first_thread_sibling(cpu);
> -	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
> +	if (nested) {
> +		cpumask_set_cpu(cpu, &nested->need_tlb_flush);
> +		cpu_in_guest = &nested->cpu_in_guest;
> +	} else {
> +		cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
> +		cpu_in_guest = &kvm->arch.cpu_in_guest;
> +	}
>  	/*
>  	 * Make sure setting of bit in need_tlb_flush precedes
>  	 * testing of cpu_in_guest bits.  The matching barrier on
> @@ -2408,13 +2416,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
>  	 */
>  	smp_mb();
>  	for (i = 0; i < threads_per_core; ++i)
> -		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
> +		if (cpumask_test_cpu(cpu + i, cpu_in_guest))
>  			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
>  }
>  
>  static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
>  {
> +	struct kvm_nested_guest *nested = vcpu->arch.nested;
>  	struct kvm *kvm = vcpu->kvm;
> +	int prev_cpu;
> +
> +	if (!cpu_has_feature(CPU_FTR_HVMODE))
> +		return;
> +
> +	if (nested)
> +		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
> +	else
> +		prev_cpu = vcpu->arch.prev_cpu;
>  
>  	/*
>  	 * With radix, the guest can do TLB invalidations itself,
> @@ -2428,12 +2446,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
>  	 * ran to flush the TLB.  The TLB is shared between threads,
>  	 * so we use a single bit in .need_tlb_flush for all 4 threads.
>  	 */
> -	if (vcpu->arch.prev_cpu != pcpu) {
> -		if (vcpu->arch.prev_cpu >= 0 &&
> -		    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
> +	if (prev_cpu != pcpu) {
> +		if (prev_cpu >= 0 &&
> +		    cpu_first_thread_sibling(prev_cpu) !=
>  		    cpu_first_thread_sibling(pcpu))
> -			radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
> -		vcpu->arch.prev_cpu = pcpu;
> +			radix_flush_cpu(kvm, prev_cpu, vcpu);
> +		if (nested)
> +			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
> +		else
> +			vcpu->arch.prev_cpu = pcpu;
> +	}
> +}
> +
> +static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
> +					      struct kvm_nested_guest *nested)
> +{
> +	cpumask_t *need_tlb_flush;
> +	int lpid;
> +
> +	if (!cpu_has_feature(CPU_FTR_HVMODE))
> +		return;
> +
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		pcpu &= ~0x3UL;
> +
> +	if (nested) {
> +		lpid = nested->shadow_lpid;
> +		need_tlb_flush = &nested->need_tlb_flush;
> +	} else {
> +		lpid = kvm->arch.lpid;
> +		need_tlb_flush = &kvm->arch.need_tlb_flush;
> +	}
> +
> +	mtspr(SPRN_LPID, lpid);
> +	isync();
> +	smp_mb();
> +
> +	if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
> +		radix__local_flush_tlb_lpid_guest(lpid);
> +		/* Clear the bit after the TLB flush */
> +		cpumask_clear_cpu(pcpu, need_tlb_flush);
>  	}
>  }
>  
> @@ -3127,8 +3179,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  		spin_unlock(&core_info.vc[sub]->lock);
>  
>  	if (kvm_is_radix(vc->kvm)) {
> -		int tmp = pcpu;
> -
>  		/*
>  		 * Do we need to flush the process scoped TLB for the LPAR?
>  		 *
> @@ -3139,17 +3189,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  		 *
>  		 * Hash must be flushed in realmode in order to use tlbiel.
>  		 */
> -		mtspr(SPRN_LPID, vc->kvm->arch.lpid);
> -		isync();
> -
> -		if (cpu_has_feature(CPU_FTR_ARCH_300))
> -			tmp &= ~0x3UL;
> -
> -		if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
> -			radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
> -			/* Clear the bit after the TLB flush */
> -			cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
> -		}
> +		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
>  	}
>  
>  	/*
> @@ -3868,11 +3908,10 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>  			  struct kvm_vcpu *vcpu, u64 time_limit,
>  			  unsigned long lpcr)
>  {
> -	int trap, r, pcpu, pcpu0;
> +	int trap, r, pcpu;
>  	int srcu_idx;
>  	struct kvmppc_vcore *vc;
>  	struct kvm_nested_guest *nested = vcpu->arch.nested;
> -	unsigned long lpid;
>  
>  	trace_kvmppc_run_vcpu_enter(vcpu);
>  
> @@ -3945,22 +3984,8 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>  	vc->vcore_state = VCORE_RUNNING;
>  	trace_kvmppc_run_core(vc, 0);
>  
> -	lpid = vc->kvm->arch.lpid;
> -	if (nested)
> -		lpid = nested->shadow_lpid;
> -	mtspr(SPRN_LPID, lpid);
> -	isync();
> -
> -	/* See comment above in kvmppc_run_core() about this */
> -	pcpu0 = pcpu;
> -	if (cpu_has_feature(CPU_FTR_ARCH_300))
> -		pcpu0 &= ~0x3UL;
> -
> -	if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) {
> -		radix__local_flush_tlb_lpid_guest(lpid);
> -		/* Clear the bit after the TLB flush */
> -		cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush);
> -	}
> +	if (cpu_has_feature(CPU_FTR_HVMODE))
> +		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, nested);
>  
>  	trace_hardirqs_on();
>  	guest_enter_irqoff();
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 35f8111..1a8c40d 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -167,6 +167,9 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>  	if (err)
>  		return H_PARAMETER;
>  
> +	if (l2_hv.vcpu_token >= NR_CPUS)
> +		return H_PARAMETER;
> +
>  	/* translate lpid */
>  	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
>  	if (!l2)
> @@ -411,6 +414,8 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
>  		goto out_free2;
>  	gp->shadow_lpid = shadow_lpid;
>  
> +	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
> +
>  	return gp;
>  
>   out_free2:
Paul Mackerras Oct. 5, 2018, 4:23 a.m. UTC | #2
On Fri, Oct 05, 2018 at 02:09:08PM +1000, David Gibson wrote:
> On Thu, Oct 04, 2018 at 09:56:02PM +1000, Paul Mackerras wrote:
> > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > 
> > This is only done at level 0, since only level 0 knows which physical
> > CPU a vcpu is running on.  This does for nested guests what L0 already
> > did for its own guests, which is to flush the TLB on a pCPU when it
> > goes to run a vCPU there, and there is another vCPU in the same VM
> > which previously ran on this pCPU and has now started to run on another
> > pCPU.  This is to handle the situation where the other vCPU touched
> > a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
> > on that new pCPU and thus left behind a stale TLB entry on this pCPU.
> > 
> > This introduces a limit on the the vcpu_token values used in the
> > H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.
> 
> This does make the vcpu tokens no longer entirely opaque to the L0.
> It works for now, because the only L1 is Linux and we know basically
> how it allocates those tokens.  Eventually we probably want some way
> to either remove this restriction or to advertise the limit to the L1.

Right, we could use something like a hash table and have it be
basically just as efficient as the array when the set of IDs is dense
while also handling arbitrary ID values.  (We'd have to make sure that
L1 couldn't trigger unbounded memory consumption in L0, though.)

Paul.
David Gibson Oct. 5, 2018, 4:54 a.m. UTC | #3
On Fri, Oct 05, 2018 at 02:23:50PM +1000, Paul Mackerras wrote:
> On Fri, Oct 05, 2018 at 02:09:08PM +1000, David Gibson wrote:
> > On Thu, Oct 04, 2018 at 09:56:02PM +1000, Paul Mackerras wrote:
> > > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > > 
> > > This is only done at level 0, since only level 0 knows which physical
> > > CPU a vcpu is running on.  This does for nested guests what L0 already
> > > did for its own guests, which is to flush the TLB on a pCPU when it
> > > goes to run a vCPU there, and there is another vCPU in the same VM
> > > which previously ran on this pCPU and has now started to run on another
> > > pCPU.  This is to handle the situation where the other vCPU touched
> > > a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
> > > on that new pCPU and thus left behind a stale TLB entry on this pCPU.
> > > 
> > > This introduces a limit on the the vcpu_token values used in the
> > > H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.
> > 
> > This does make the vcpu tokens no longer entirely opaque to the L0.
> > It works for now, because the only L1 is Linux and we know basically
> > how it allocates those tokens.  Eventually we probably want some way
> > to either remove this restriction or to advertise the limit to the L1.
> 
> Right, we could use something like a hash table and have it be
> basically just as efficient as the array when the set of IDs is dense
> while also handling arbitrary ID values.  (We'd have to make sure that
> L1 couldn't trigger unbounded memory consumption in L0, though.)

Another approach would be to sacrifice some performance for L0
simplicity:  when an L1 vCPU changes pCPU, flush all the nested LPIDs
associated with that L1.  When an L2 vCPU changes L1 vCPU (and
therefore, indirectly pCPU), the L1 would be responsible for flushing
it.
Paul Mackerras Oct. 5, 2018, 5:32 a.m. UTC | #4
On Fri, Oct 05, 2018 at 02:54:28PM +1000, David Gibson wrote:
> On Fri, Oct 05, 2018 at 02:23:50PM +1000, Paul Mackerras wrote:
> > On Fri, Oct 05, 2018 at 02:09:08PM +1000, David Gibson wrote:
> > > On Thu, Oct 04, 2018 at 09:56:02PM +1000, Paul Mackerras wrote:
> > > > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > > > 
> > > > This is only done at level 0, since only level 0 knows which physical
> > > > CPU a vcpu is running on.  This does for nested guests what L0 already
> > > > did for its own guests, which is to flush the TLB on a pCPU when it
> > > > goes to run a vCPU there, and there is another vCPU in the same VM
> > > > which previously ran on this pCPU and has now started to run on another
> > > > pCPU.  This is to handle the situation where the other vCPU touched
> > > > a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
> > > > on that new pCPU and thus left behind a stale TLB entry on this pCPU.
> > > > 
> > > > This introduces a limit on the the vcpu_token values used in the
> > > > H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.
> > > 
> > > This does make the vcpu tokens no longer entirely opaque to the L0.
> > > It works for now, because the only L1 is Linux and we know basically
> > > how it allocates those tokens.  Eventually we probably want some way
> > > to either remove this restriction or to advertise the limit to the L1.
> > 
> > Right, we could use something like a hash table and have it be
> > basically just as efficient as the array when the set of IDs is dense
> > while also handling arbitrary ID values.  (We'd have to make sure that
> > L1 couldn't trigger unbounded memory consumption in L0, though.)
> 
> Another approach would be to sacifice some performance for L0
> simplicity:  when an L1 vCPU changes pCPU, flush all the nested LPIDs
> associated with that L1.  When an L2 vCPU changes L1 vCPU (and
> therefore, indirectly pCPU), the L1 would be responsible for flushing
> it.

That was one of the approaches I considered initially, but it has
complexities that aren't apparent, and it could be quite inefficient
for a guest with a lot of nested guests.  For a start you have to
provide a way for L1 to flush the TLB for another LPID, which guests
can't do themselves (it's a hypervisor privileged operation).  Then
there's the fact that it's not the pCPU where the moving vCPU has
moved to that needs the flush, it's the pCPU that it moved from (where
presumably something else is now running).  All in all, the simplest
solution was to have L0 do it, because L0 knows unambiguously the real
physical CPU where any given vCPU last ran.

Paul.
David Gibson Oct. 8, 2018, 2:02 a.m. UTC | #5
On Fri, Oct 05, 2018 at 03:32:26PM +1000, Paul Mackerras wrote:
> On Fri, Oct 05, 2018 at 02:54:28PM +1000, David Gibson wrote:
> > On Fri, Oct 05, 2018 at 02:23:50PM +1000, Paul Mackerras wrote:
> > > On Fri, Oct 05, 2018 at 02:09:08PM +1000, David Gibson wrote:
> > > > On Thu, Oct 04, 2018 at 09:56:02PM +1000, Paul Mackerras wrote:
> > > > > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > > > > 
> > > > > This is only done at level 0, since only level 0 knows which physical
> > > > > CPU a vcpu is running on.  This does for nested guests what L0 already
> > > > > did for its own guests, which is to flush the TLB on a pCPU when it
> > > > > goes to run a vCPU there, and there is another vCPU in the same VM
> > > > > which previously ran on this pCPU and has now started to run on another
> > > > > pCPU.  This is to handle the situation where the other vCPU touched
> > > > > a mapping, moved to another pCPU and did a tlbiel (local-only tlbie)
> > > > > on that new pCPU and thus left behind a stale TLB entry on this pCPU.
> > > > > 
> > > > > This introduces a limit on the the vcpu_token values used in the
> > > > > H_ENTER_NESTED hcall -- they must now be less than NR_CPUS.
> > > > 
> > > > This does make the vcpu tokens no longer entirely opaque to the L0.
> > > > It works for now, because the only L1 is Linux and we know basically
> > > > how it allocates those tokens.  Eventually we probably want some way
> > > > to either remove this restriction or to advertise the limit to the L1.
> > > 
> > > Right, we could use something like a hash table and have it be
> > > basically just as efficient as the array when the set of IDs is dense
> > > while also handling arbitrary ID values.  (We'd have to make sure that
> > > L1 couldn't trigger unbounded memory consumption in L0, though.)
> > 
> > Another approach would be to sacifice some performance for L0
> > simplicity:  when an L1 vCPU changes pCPU, flush all the nested LPIDs
> > associated with that L1.  When an L2 vCPU changes L1 vCPU (and
> > therefore, indirectly pCPU), the L1 would be responsible for flushing
> > it.
> 
> That was one of the approaches I considered initially, but it has
> complexities that aren't apparent, and it could be quite inefficient
> for a guest with a lot of nested guests.  For a start you have to
> provide a way for L1 to flush the TLB for another LPID, which guests
> can't do themselves (it's a hypervisor privileged operation).  Then
> there's the fact that it's not the pCPU where the moving vCPU has
> moved to that needs the flush, it's the pCPU that it moved from (where
> presumably something else is now running).  All in all, the simplest
> solution was to have L0 do it, because L0 knows unambiguously the real
> physical CPU where any given vCPU last ran.

Ah, I see.
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index aa5bf85..1e96027 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -52,6 +52,9 @@  struct kvm_nested_guest {
 	long refcnt;			/* number of pointers to this struct */
 	struct mutex tlb_lock;		/* serialize page faults and tlbies */
 	struct kvm_nested_guest *next;
+	cpumask_t need_tlb_flush;
+	cpumask_t cpu_in_guest;
+	unsigned short prev_cpu[NR_CPUS];
 };
 
 /*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ba58883..53a967ea 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2397,10 +2397,18 @@  static void kvmppc_release_hwthread(int cpu)
 
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	cpumask_t *cpu_in_guest;
 	int i;
 
 	cpu = cpu_first_thread_sibling(cpu);
-	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+	if (nested) {
+		cpumask_set_cpu(cpu, &nested->need_tlb_flush);
+		cpu_in_guest = &nested->cpu_in_guest;
+	} else {
+		cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+		cpu_in_guest = &kvm->arch.cpu_in_guest;
+	}
 	/*
 	 * Make sure setting of bit in need_tlb_flush precedes
 	 * testing of cpu_in_guest bits.  The matching barrier on
@@ -2408,13 +2416,23 @@  static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 	 */
 	smp_mb();
 	for (i = 0; i < threads_per_core; ++i)
-		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+		if (cpumask_test_cpu(cpu + i, cpu_in_guest))
 			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
 }
 
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
 	struct kvm *kvm = vcpu->kvm;
+	int prev_cpu;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	if (nested)
+		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+	else
+		prev_cpu = vcpu->arch.prev_cpu;
 
 	/*
 	 * With radix, the guest can do TLB invalidations itself,
@@ -2428,12 +2446,46 @@  static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 	 * ran to flush the TLB.  The TLB is shared between threads,
 	 * so we use a single bit in .need_tlb_flush for all 4 threads.
 	 */
-	if (vcpu->arch.prev_cpu != pcpu) {
-		if (vcpu->arch.prev_cpu >= 0 &&
-		    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+	if (prev_cpu != pcpu) {
+		if (prev_cpu >= 0 &&
+		    cpu_first_thread_sibling(prev_cpu) !=
 		    cpu_first_thread_sibling(pcpu))
-			radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
-		vcpu->arch.prev_cpu = pcpu;
+			radix_flush_cpu(kvm, prev_cpu, vcpu);
+		if (nested)
+			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+		else
+			vcpu->arch.prev_cpu = pcpu;
+	}
+}
+
+static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
+					      struct kvm_nested_guest *nested)
+{
+	cpumask_t *need_tlb_flush;
+	int lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pcpu &= ~0x3UL;
+
+	if (nested) {
+		lpid = nested->shadow_lpid;
+		need_tlb_flush = &nested->need_tlb_flush;
+	} else {
+		lpid = kvm->arch.lpid;
+		need_tlb_flush = &kvm->arch.need_tlb_flush;
+	}
+
+	mtspr(SPRN_LPID, lpid);
+	isync();
+	smp_mb();
+
+	if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
+		radix__local_flush_tlb_lpid_guest(lpid);
+		/* Clear the bit after the TLB flush */
+		cpumask_clear_cpu(pcpu, need_tlb_flush);
 	}
 }
 
@@ -3127,8 +3179,6 @@  static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		spin_unlock(&core_info.vc[sub]->lock);
 
 	if (kvm_is_radix(vc->kvm)) {
-		int tmp = pcpu;
-
 		/*
 		 * Do we need to flush the process scoped TLB for the LPAR?
 		 *
@@ -3139,17 +3189,7 @@  static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		 *
 		 * Hash must be flushed in realmode in order to use tlbiel.
 		 */
-		mtspr(SPRN_LPID, vc->kvm->arch.lpid);
-		isync();
-
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			tmp &= ~0x3UL;
-
-		if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
-			radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
-			/* Clear the bit after the TLB flush */
-			cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
-		}
+		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
 	}
 
 	/*
@@ -3868,11 +3908,10 @@  int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
 			  struct kvm_vcpu *vcpu, u64 time_limit,
 			  unsigned long lpcr)
 {
-	int trap, r, pcpu, pcpu0;
+	int trap, r, pcpu;
 	int srcu_idx;
 	struct kvmppc_vcore *vc;
 	struct kvm_nested_guest *nested = vcpu->arch.nested;
-	unsigned long lpid;
 
 	trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -3945,22 +3984,8 @@  int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
 	vc->vcore_state = VCORE_RUNNING;
 	trace_kvmppc_run_core(vc, 0);
 
-	lpid = vc->kvm->arch.lpid;
-	if (nested)
-		lpid = nested->shadow_lpid;
-	mtspr(SPRN_LPID, lpid);
-	isync();
-
-	/* See comment above in kvmppc_run_core() about this */
-	pcpu0 = pcpu;
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		pcpu0 &= ~0x3UL;
-
-	if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) {
-		radix__local_flush_tlb_lpid_guest(lpid);
-		/* Clear the bit after the TLB flush */
-		cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush);
-	}
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, nested);
 
 	trace_hardirqs_on();
 	guest_enter_irqoff();
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 35f8111..1a8c40d 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -167,6 +167,9 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	if (err)
 		return H_PARAMETER;
 
+	if (l2_hv.vcpu_token >= NR_CPUS)
+		return H_PARAMETER;
+
 	/* translate lpid */
 	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
 	if (!l2)
@@ -411,6 +414,8 @@  struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
 		goto out_free2;
 	gp->shadow_lpid = shadow_lpid;
 
+	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
 	return gp;
 
  out_free2: