
[RFC,25/32] KVM: PPC: Book3S HV: Introduce rmap to track nested guest mappings

Message ID 1537524123-9578-26-git-send-email-paulus@ozlabs.org (mailing list archive)
State New, archived
Series KVM: PPC: Book3S HV: Nested HV virtualization

Commit Message

Paul Mackerras Sept. 21, 2018, 10:01 a.m. UTC
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>

When a host (L0) page which is mapped into a (L1) guest is in turn
mapped through to a nested (L2) guest, we keep a reverse mapping (rmap)
so that these mappings can be retrieved later.

Whenever we create an entry in a shadow_pgtable for a nested guest, we
create a corresponding rmap entry and add it to the list for the
L1 guest memslot at the index of the L1 guest page it maps. This means
that at the L1 guest memslot we end up with lists of rmaps as follows:

  memslot (L1)
-----------------
| gfn = 0	| -> rmap -> rmap -> rmap -> NULL
-----------------
| gfn = 1	|
-----------------
| gfn = 2	| -> rmap -> NULL
-----------------
| gfn = 3	| -> rmap -> rmap -> NULL
-----------------
| gfn = 4	|
-----------------
| gfn = n	|
-----------------

When we are notified of the invalidation of a host page which has been
mapped through to a (L1) guest, we can then walk the rmap list for that
guest page and find and invalidate all of the corresponding
shadow_pgtable entries.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s.h    |  3 +
 arch/powerpc/include/asm/kvm_book3s_64.h | 17 +++++-
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 45 ++++++++++-----
 arch/powerpc/kvm/book3s_hv.c             |  1 +
 arch/powerpc/kvm/book3s_hv_nested.c      | 98 +++++++++++++++++++++++++++++++-
 5 files changed, 147 insertions(+), 17 deletions(-)
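
As a rough mental model of the list structure described above (a minimal
userspace sketch, not the kernel code: a plain singly-linked list and a
fixed array stand in for the kernel's llist machinery and
memslot->arch.rmap[]), each L1 gfn heads a list of rmap_nested entries
recording which nested mappings may need tearing down:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct rmap_nested; a plain 'next' pointer replaces the
 * kernel's llist_node. */
struct rmap_nested {
	struct rmap_nested *next;
	unsigned int l1_lpid;	/* L1's lpid for this nested guest */
	unsigned long n_gpa;	/* nested (L2) guest physical address */
};

#define NR_GFNS 8

/* Stand-in for memslot->arch.rmap[]: one list head per L1 gfn. */
static struct rmap_nested *rmap[NR_GFNS];

/* Analogue of kvmhv_insert_nest_rmap(): push an entry onto the gfn's list. */
static void insert_nest_rmap(unsigned long l1_gfn, unsigned int l1_lpid,
			     unsigned long n_gpa)
{
	struct rmap_nested *n = calloc(1, sizeof(*n));

	if (!n)
		return;
	n->l1_lpid = l1_lpid;
	n->n_gpa = n_gpa;
	n->next = rmap[l1_gfn];
	rmap[l1_gfn] = n;
}

/* Analogue of kvmhv_remove_nest_rmap_list(): walk and empty one gfn's list,
 * invalidating the shadow PTE that each entry points at. */
static void remove_nest_rmap_list(unsigned long l1_gfn)
{
	struct rmap_nested *n = rmap[l1_gfn], *next;

	rmap[l1_gfn] = NULL;
	for (; n; n = next) {
		next = n->next;
		printf("invalidate shadow PTE: l1_lpid=%u n_gpa=0x%lx\n",
		       n->l1_lpid, n->n_gpa);
		free(n);
	}
}

int main(void)
{
	/* L1 gfn 0 is mapped into two different nested guests (lpids 10 and
	 * 11); gfn 2 is mapped into one. The lpids and addresses are made up
	 * for illustration. */
	insert_nest_rmap(0, 10, 0x1000);
	insert_nest_rmap(0, 11, 0x2000);
	insert_nest_rmap(2, 10, 0x5000);

	/* Host invalidation of the page backing L1 gfn 0 drains its list. */
	remove_nest_rmap_list(0);
	return 0;
}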

Comments

David Gibson Sept. 27, 2018, 4:07 a.m. UTC | #1
On Fri, Sep 21, 2018 at 08:01:56PM +1000, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> 
> When a host (L0) page which is mapped into a (L1) guest is in turn
> mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> so that these mappings can be retrieved later.
> 
> Whenever we create an entry in a shadow_pgtable for a nested guest we
> create a corresponding rmap entry and add it to the list for the
> L1 guest memslot at the index of the L1 guest page it maps. This means
> at the L1 guest memslot we end up with lists of rmaps as follows;
> 
>   memslot (L1)
> -----------------
> | gfn = 0	| -> rmap -> rmap -> rmap -> NULL
> -----------------
> | gfn = 1	|
> -----------------
> | gfn = 2	| -> rmap -> NULL
> -----------------
> | gfn = 3	| -> rmap -> rmap -> NULL
> -----------------
> | gfn = 4	|
> -----------------
> | gfn = n	|
> -----------------
> 
> When we are notified of a host page being invalidated which has been
> mapped through to a (L1) guest, we can then walk the rmap list for that
> guest page, and find and invalidate all of the corresponding
> shadow_pgtable entries.
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

Am I right in thinking that the reason we need this in addition to the
existing rmap for the L1 guest is that we need to know the LPID(s) to
invalidate in the case of a host invalidation?

> ---
>  arch/powerpc/include/asm/kvm_book3s.h    |  3 +
>  arch/powerpc/include/asm/kvm_book3s_64.h | 17 +++++-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c   | 45 ++++++++++-----
>  arch/powerpc/kvm/book3s_hv.c             |  1 +
>  arch/powerpc/kvm/book3s_hv_nested.c      | 98 +++++++++++++++++++++++++++++++-
>  5 files changed, 147 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 664e1fb..d849518c 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -196,6 +196,9 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			int table_index, u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +			unsigned int shift, struct kvm_memory_slot *memslot,
> +			unsigned int lpid);
>  extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
>  				    bool writing, unsigned long gpa,
>  				    unsigned int lpid);
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 16c3a97..dadd40e 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -37,6 +37,13 @@ struct kvm_nested_guest {
>  	struct kvm_nested_guest *next;
>  };
>  
> +/* Structure for a nested guest rmap entry */
> +struct rmap_nested {
> +	struct llist_node list;
> +	unsigned int l1_lpid;		/* L1's lpid of this nested guest */
> +	unsigned long n_gpa;		/* nested guest physical address */
> +};
> +
>  struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
>  					  bool create);
>  void kvmhv_put_nested(struct kvm_nested_guest *gp);
> @@ -535,7 +542,15 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
>  
>  extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			     unsigned long gpa, unsigned int level,
> -			     unsigned long mmu_seq, unsigned int lpid);
> +			     unsigned long mmu_seq, unsigned int lpid,
> +			     unsigned long *rmap, struct rmap_nested *n_rmap);
> +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmap,
> +				   struct rmap_nested *n_rmap);
> +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +				struct kvm_memory_slot *memslot,
> +				unsigned long gpa, unsigned long hpa,
> +				unsigned long nbytes);
> +extern void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
>  
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
>  
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 778ae87..52b2a57 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -256,28 +256,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
>  	kmem_cache_free(kvm_pmd_cache, pmdp);
>  }
>  
> -void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> -		      unsigned long gpa, unsigned int shift,
> -		      struct kvm_memory_slot *memslot,
> +/* Called with kvm->mmu_lock held */
> +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +		      unsigned int shift, struct kvm_memory_slot *memslot,
>  		      unsigned int lpid)
>  
>  {
>  	unsigned long old;
> +	unsigned long gfn = gpa >> PAGE_SHIFT;
> +	unsigned long page_size = PAGE_SIZE;
> +	unsigned long hpa;
>  
>  	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
>  	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
> -	if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
> -		unsigned long gfn = gpa >> PAGE_SHIFT;
> -		unsigned long page_size = PAGE_SIZE;
>  
> -		if (shift)
> -			page_size = 1ul << shift;
> +	/* The following only applies to L1 entries */
> +	if (lpid != kvm->arch.lpid)
> +		return;
> +
> +	if (!memslot) {
> +		memslot = gfn_to_memslot(kvm, gfn);
>  		if (!memslot)
> -			memslot = gfn_to_memslot(kvm, gfn);
> -		if (memslot && memslot->dirty_bitmap) {
> -			kvmppc_update_dirty_map(memslot, gfn, page_size);
> -		}
> +			return;
>  	}
> +	if (shift)
> +		page_size = 1ul << shift;
> +
> +	gpa &= ~(page_size - 1);
> +	hpa = old & PTE_RPN_MASK;
> +	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
> +
> +	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
> +		kvmppc_update_dirty_map(memslot, gfn, page_size);
>  }
>  
>  /*
> @@ -433,7 +443,8 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>  
>  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  		      unsigned long gpa, unsigned int level,
> -		      unsigned long mmu_seq, unsigned int lpid)
> +		      unsigned long mmu_seq, unsigned int lpid,
> +		      unsigned long *rmap, struct rmap_nested *n_rmap)
>  {
>  	pgd_t *pgd;
>  	pud_t *pud, *new_pud = NULL;
> @@ -512,6 +523,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
> +		if (rmap && n_rmap)
> +			kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
>  		ret = 0;
>  		goto out_unlock;
>  	}
> @@ -562,6 +575,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
> +		if (rmap && n_rmap)
> +			kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
>  		ret = 0;
>  		goto out_unlock;
>  	}
> @@ -586,6 +601,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  		goto out_unlock;
>  	}
>  	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
> +	if (rmap && n_rmap)
> +		kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
>  	ret = 0;
>  
>   out_unlock:
> @@ -713,7 +730,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>  
>  	/* Allocate space in the tree and write the PTE */
>  	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
> -				mmu_seq, kvm->arch.lpid);
> +				mmu_seq, kvm->arch.lpid, NULL, NULL);
>  	if (inserted_pte)
>  		*inserted_pte = pte;
>  	if (levelp)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3b78d97..80da231 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4277,6 +4277,7 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
>  					struct kvm_memory_slot *dont)
>  {
>  	if (!dont || free->arch.rmap != dont->arch.rmap) {
> +		kvmhv_free_memslot_nest_rmap(free);
>  		vfree(free->arch.rmap);
>  		free->arch.rmap = NULL;
>  	}
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index af8066b..9a50feb 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -10,6 +10,7 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/kvm_host.h>
> +#include <linux/llist.h>
>  
>  #include <asm/kvm_ppc.h>
>  #include <asm/kvm_book3s.h>
> @@ -523,13 +524,96 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp)
>  		kvmhv_release_nested(gp);
>  }
>  
> -struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
>  {
>  	if (lpid > kvm->arch.max_nested_lpid)
>  		return NULL;
>  	return kvm->arch.nested_guests[lpid];
>  }
>  
> +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmap,
> +			   struct rmap_nested *n_rmap)
> +{
> +	llist_add(&n_rmap->list, (struct llist_head *) rmap);
> +}
> +
> +static void kvmhv_remove_nest_rmap(struct kvm *kvm, struct rmap_nested *n_rmap,
> +				   unsigned long hpa, unsigned long mask)
> +{
> +	struct kvm_nested_guest *gp;
> +	unsigned int shift;
> +	pte_t *ptep;
> +
> +	gp = kvmhv_find_nested(kvm, n_rmap->l1_lpid);
> +	if (!gp)
> +		return;
> +
> +	/* Find and invalidate the pte */
> +	ptep = __find_linux_pte(gp->shadow_pgtable, n_rmap->n_gpa, NULL,
> +				&shift);
> +	/* Don't spuriously invalidate ptes if the pfn has changed */
> +	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
> +		kvmppc_unmap_pte(kvm, ptep, n_rmap->n_gpa, shift, NULL,
> +				 gp->shadow_lpid);
> +}
> +
> +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmap,
> +					unsigned long hpa, unsigned long mask)
> +{
> +	struct rmap_nested *n_rmap, *next;
> +	struct llist_node *entry = llist_del_all((struct llist_head *) rmap);
> +
> +	if (!entry) /* List was empty */
> +		return;
> +
> +	llist_for_each_entry_safe(n_rmap, next, entry, list) {
> +		kvmhv_remove_nest_rmap(kvm, n_rmap, hpa, mask);
> +		kfree(n_rmap);
> +	}
> +}
> +
> +/* called with kvm->mmu_lock held */
> +void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +				  struct kvm_memory_slot *memslot,
> +				  unsigned long gpa, unsigned long hpa,
> +				  unsigned long nbytes)
> +{
> +	unsigned long gfn, end_gfn;
> +	unsigned long addr_mask;
> +
> +	if (!memslot)
> +		return;
> +	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
> +	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
> +
> +	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
> +	hpa &= addr_mask;
> +
> +	for (; gfn < end_gfn; gfn++) {
> +		unsigned long *rmap = &memslot->arch.rmap[gfn];
> +		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
> +	}
> +}
> +
> +void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
> +{
> +	struct rmap_nested *n_rmap, *next;
> +	struct llist_node *entry;
> +	unsigned long *rmap;
> +	unsigned long page;
> +
> +	for (page = 0; page < free->npages; page++) {
> +		rmap = &free->arch.rmap[page];
> +		entry = llist_del_all((struct llist_head *) rmap);
> +
> +		if (!entry)
> +			continue;
> +
> +		llist_for_each_entry_safe(n_rmap, next, entry, list)
> +			kfree(n_rmap);
> +	}
> +}
> +
>  static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
>  					struct kvm_nested_guest *gp,
>  					long gpa, int *shift_ret)
> @@ -736,11 +820,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	struct kvm_memory_slot *memslot;
> +	struct rmap_nested *n_rmap;
>  	struct kvmppc_pte gpte;
>  	pte_t pte, *pte_p;
>  	unsigned long mmu_seq;
>  	unsigned long dsisr = vcpu->arch.fault_dsisr;
>  	unsigned long ea = vcpu->arch.fault_dar;
> +	unsigned long *rmap;
>  	unsigned long n_gpa, gpa, gfn, perm = 0UL;
>  	unsigned int shift, l1_shift, level;
>  	bool writing = !!(dsisr & DSISR_ISSTORE);
> @@ -867,8 +953,16 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  
>  	/* 4. Insert the pte into our shadow_pgtable */
>  
> +	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
> +	if (!n_rmap)
> +		return RESUME_GUEST; /* Let the guest try again */
> +	n_rmap->l1_lpid = gp->l1_lpid;
> +	n_rmap->n_gpa = n_gpa;
> +	rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
>  	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
> -				mmu_seq, gp->shadow_lpid);
> +				mmu_seq, gp->shadow_lpid, rmap, n_rmap);
> +	if (ret)
> +		kfree(n_rmap);
>  	if (ret == -EAGAIN)
>  		ret = RESUME_GUEST;	/* Let the guest try again */
>
Paul Mackerras Sept. 27, 2018, 5:54 a.m. UTC | #2
On Thu, Sep 27, 2018 at 02:07:58PM +1000, David Gibson wrote:
> On Fri, Sep 21, 2018 at 08:01:56PM +1000, Paul Mackerras wrote:
> > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > 
> > When a host (L0) page which is mapped into a (L1) guest is in turn
> > mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> > so that these mappings can be retrieved later.
> > 
> > Whenever we create an entry in a shadow_pgtable for a nested guest we
> > create a corresponding rmap entry and add it to the list for the
> > L1 guest memslot at the index of the L1 guest page it maps. This means
> > at the L1 guest memslot we end up with lists of rmaps as follows;
> > 
> >   memslot (L1)
> > -----------------
> > | gfn = 0	| -> rmap -> rmap -> rmap -> NULL
> > -----------------
> > | gfn = 1	|
> > -----------------
> > | gfn = 2	| -> rmap -> NULL
> > -----------------
> > | gfn = 3	| -> rmap -> rmap -> NULL
> > -----------------
> > | gfn = 4	|
> > -----------------
> > | gfn = n	|
> > -----------------
> > 
> > When we are notified of a host page being invalidated which has been
> > mapped through to a (L1) guest, we can then walk the rmap list for that
> > guest page, and find and invalidate all of the corresponding
> > shadow_pgtable entries.
> > 
> > Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> 
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> 
> Am I right in thinking that the reason we need this in addition to the
> existing rmap for the L1 guest is that we need to know the LPID(s) to
> invalidate in the case of a host invalidation?

The existing rmap is not really used for radix guests because we get a
guest real address and we can just index into the partition-scoped
radix tree and find the one and only PTE that maps that guest real
address.

With a nested guest, a given L1 guest real page could be mapped in the
shadow partition-scoped trees for multiple nested guests at arbitrary
L2 guest real addresses.  The rmap that we implement here gives us a
list of (l1 lpid, l2 guest real address) combinations that (may) map
the target L1 guest real page.

Paul.
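
To make that fan-out concrete (again a minimal userspace sketch rather than
the kernel code; the structure, lpids and addresses here are made up for
illustration), a single host-page invalidation can hit entries in several
nested guests' shadow tables, and each candidate entry is only unmapped if
it still points at the host page going away, mirroring the hpa/mask check
in kvmhv_remove_nest_rmap() in the patch:

#include <stdio.h>

/* Toy stand-in for a shadow-table entry; the real code looks the PTE up
 * via kvmhv_find_nested() and __find_linux_pte() instead. */
struct shadow_pte {
	unsigned int l1_lpid;	/* which nested guest's shadow table */
	unsigned long n_gpa;	/* L2 guest physical address */
	unsigned long hpa;	/* host physical address currently mapped */
	int present;
};

/* Mirrors the check in kvmhv_remove_nest_rmap(): don't spuriously
 * invalidate an entry that no longer points at the page being removed. */
static void remove_nest_rmap(struct shadow_pte *pte, unsigned long hpa,
			     unsigned long mask)
{
	if (pte->present && (pte->hpa & mask) == hpa) {
		printf("unmap l1_lpid=%u n_gpa=0x%lx\n",
		       pte->l1_lpid, pte->n_gpa);
		pte->present = 0;
	}
}

int main(void)
{
	unsigned long page_size = 0x10000;		/* 64k L1 page */
	unsigned long mask = ~(page_size - 1);		/* cf. addr_mask */
	unsigned long hpa = 0x12340000UL & mask;	/* page being invalidated */

	/* Two nested guests map the same L1 page; the third entry is stale. */
	struct shadow_pte ptes[] = {
		{ .l1_lpid = 10, .n_gpa = 0x1000,  .hpa = 0x12340000, .present = 1 },
		{ .l1_lpid = 11, .n_gpa = 0x20000, .hpa = 0x12340000, .present = 1 },
		{ .l1_lpid = 10, .n_gpa = 0x30000, .hpa = 0x99990000, .present = 1 },
	};
	int i;

	for (i = 0; i < 3; i++)
		remove_nest_rmap(&ptes[i], hpa, mask);
	return 0;
}
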
David Gibson Sept. 28, 2018, 1:04 a.m. UTC | #3
On Thu, Sep 27, 2018 at 03:54:40PM +1000, Paul Mackerras wrote:
> On Thu, Sep 27, 2018 at 02:07:58PM +1000, David Gibson wrote:
> > On Fri, Sep 21, 2018 at 08:01:56PM +1000, Paul Mackerras wrote:
> > > From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > > 
> > > When a host (L0) page which is mapped into a (L1) guest is in turn
> > > mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> > > so that these mappings can be retrieved later.
> > > 
> > > Whenever we create an entry in a shadow_pgtable for a nested guest we
> > > create a corresponding rmap entry and add it to the list for the
> > > L1 guest memslot at the index of the L1 guest page it maps. This means
> > > at the L1 guest memslot we end up with lists of rmaps as follows;
> > > 
> > >   memslot (L1)
> > > -----------------
> > > | gfn = 0	| -> rmap -> rmap -> rmap -> NULL
> > > -----------------
> > > | gfn = 1	|
> > > -----------------
> > > | gfn = 2	| -> rmap -> NULL
> > > -----------------
> > > | gfn = 3	| -> rmap -> rmap -> NULL
> > > -----------------
> > > | gfn = 4	|
> > > -----------------
> > > | gfn = n	|
> > > -----------------
> > > 
> > > When we are notified of a host page being invalidated which has been
> > > mapped through to a (L1) guest, we can then walk the rmap list for that
> > > guest page, and find and invalidate all of the corresponding
> > > shadow_pgtable entries.
> > > 
> > > Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> > > Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> > 
> > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> > 
> > Am I right in thinking that the reason we need this in addition to the
> > existing rmap for the L1 guest is that we need to know the LPID(s) to
> > invalidate in the case of a host invalidation?
> 
> The existing rmap is not really used for radix guests because we get a
> guest real address and we can just index into the partition-scoped
> radix tree and find the one and only PTE that maps that guest real
> address.
> 
> With a nested guest, a given L1 guest real page could be mapped in the
> shadow partition-scoped trees for multiple nested guests at arbitrary
> L2 guest real addresses.  The rmap that we implement here gives us a
> list of (l1 lpid, l2 guest real address) combinations that (may) map
> the target L1 guest real page.

Ah, right.

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 664e1fb..d849518c 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -196,6 +196,9 @@  extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 			int table_index, u64 *pte_ret_p);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+			unsigned int shift, struct kvm_memory_slot *memslot,
+			unsigned int lpid);
 extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
 				    bool writing, unsigned long gpa,
 				    unsigned int lpid);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 16c3a97..dadd40e 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,6 +37,13 @@  struct kvm_nested_guest {
 	struct kvm_nested_guest *next;
 };
 
+/* Structure for a nested guest rmap entry */
+struct rmap_nested {
+	struct llist_node list;
+	unsigned int l1_lpid;		/* L1's lpid of this nested guest */
+	unsigned long n_gpa;		/* nested guest physical address */
+};
+
 struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
 					  bool create);
 void kvmhv_put_nested(struct kvm_nested_guest *gp);
@@ -535,7 +542,15 @@  static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 
 extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 			     unsigned long gpa, unsigned int level,
-			     unsigned long mmu_seq, unsigned int lpid);
+			     unsigned long mmu_seq, unsigned int lpid,
+			     unsigned long *rmap, struct rmap_nested *n_rmap);
+extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmap,
+				   struct rmap_nested *n_rmap);
+extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
+				unsigned long gpa, unsigned long hpa,
+				unsigned long nbytes);
+extern void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
 
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 778ae87..52b2a57 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -256,28 +256,38 @@  static void kvmppc_pmd_free(pmd_t *pmdp)
 	kmem_cache_free(kvm_pmd_cache, pmdp);
 }
 
-void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
-		      unsigned long gpa, unsigned int shift,
-		      struct kvm_memory_slot *memslot,
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+		      unsigned int shift, struct kvm_memory_slot *memslot,
 		      unsigned int lpid)
 
 {
 	unsigned long old;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	unsigned long page_size = PAGE_SIZE;
+	unsigned long hpa;
 
 	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
-	if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
-		unsigned long gfn = gpa >> PAGE_SHIFT;
-		unsigned long page_size = PAGE_SIZE;
 
-		if (shift)
-			page_size = 1ul << shift;
+	/* The following only applies to L1 entries */
+	if (lpid != kvm->arch.lpid)
+		return;
+
+	if (!memslot) {
+		memslot = gfn_to_memslot(kvm, gfn);
 		if (!memslot)
-			memslot = gfn_to_memslot(kvm, gfn);
-		if (memslot && memslot->dirty_bitmap) {
-			kvmppc_update_dirty_map(memslot, gfn, page_size);
-		}
+			return;
 	}
+	if (shift)
+		page_size = 1ul << shift;
+
+	gpa &= ~(page_size - 1);
+	hpa = old & PTE_RPN_MASK;
+	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, page_size);
 }
 
 /*
@@ -433,7 +443,8 @@  static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 
 int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		      unsigned long gpa, unsigned int level,
-		      unsigned long mmu_seq, unsigned int lpid)
+		      unsigned long mmu_seq, unsigned int lpid,
+		      unsigned long *rmap, struct rmap_nested *n_rmap)
 {
 	pgd_t *pgd;
 	pud_t *pud, *new_pud = NULL;
@@ -512,6 +523,8 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+		if (rmap && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
 		ret = 0;
 		goto out_unlock;
 	}
@@ -562,6 +575,8 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+		if (rmap && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
 		ret = 0;
 		goto out_unlock;
 	}
@@ -586,6 +601,8 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		goto out_unlock;
 	}
 	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+	if (rmap && n_rmap)
+		kvmhv_insert_nest_rmap(kvm, rmap, n_rmap);
 	ret = 0;
 
  out_unlock:
@@ -713,7 +730,7 @@  int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
 	/* Allocate space in the tree and write the PTE */
 	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
-				mmu_seq, kvm->arch.lpid);
+				mmu_seq, kvm->arch.lpid, NULL, NULL);
 	if (inserted_pte)
 		*inserted_pte = pte;
 	if (levelp)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3b78d97..80da231 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4277,6 +4277,7 @@  static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
 					struct kvm_memory_slot *dont)
 {
 	if (!dont || free->arch.rmap != dont->arch.rmap) {
+		kvmhv_free_memslot_nest_rmap(free);
 		vfree(free->arch.rmap);
 		free->arch.rmap = NULL;
 	}
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index af8066b..9a50feb 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -10,6 +10,7 @@ 
 
 #include <linux/kernel.h>
 #include <linux/kvm_host.h>
+#include <linux/llist.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -523,13 +524,96 @@  void kvmhv_put_nested(struct kvm_nested_guest *gp)
 		kvmhv_release_nested(gp);
 }
 
-struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
 {
 	if (lpid > kvm->arch.max_nested_lpid)
 		return NULL;
 	return kvm->arch.nested_guests[lpid];
 }
 
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmap,
+			   struct rmap_nested *n_rmap)
+{
+	llist_add(&n_rmap->list, (struct llist_head *) rmap);
+}
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, struct rmap_nested *n_rmap,
+				   unsigned long hpa, unsigned long mask)
+{
+	struct kvm_nested_guest *gp;
+	unsigned int shift;
+	pte_t *ptep;
+
+	gp = kvmhv_find_nested(kvm, n_rmap->l1_lpid);
+	if (!gp)
+		return;
+
+	/* Find and invalidate the pte */
+	ptep = __find_linux_pte(gp->shadow_pgtable, n_rmap->n_gpa, NULL,
+				&shift);
+	/* Don't spuriously invalidate ptes if the pfn has changed */
+	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+		kvmppc_unmap_pte(kvm, ptep, n_rmap->n_gpa, shift, NULL,
+				 gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmap,
+					unsigned long hpa, unsigned long mask)
+{
+	struct rmap_nested *n_rmap, *next;
+	struct llist_node *entry = llist_del_all((struct llist_head *) rmap);
+
+	if (!entry) /* List was empty */
+		return;
+
+	llist_for_each_entry_safe(n_rmap, next, entry, list) {
+		kvmhv_remove_nest_rmap(kvm, n_rmap, hpa, mask);
+		kfree(n_rmap);
+	}
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot,
+				  unsigned long gpa, unsigned long hpa,
+				  unsigned long nbytes)
+{
+	unsigned long gfn, end_gfn;
+	unsigned long addr_mask;
+
+	if (!memslot)
+		return;
+	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+	hpa &= addr_mask;
+
+	for (; gfn < end_gfn; gfn++) {
+		unsigned long *rmap = &memslot->arch.rmap[gfn];
+		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+	}
+}
+
+void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+	struct rmap_nested *n_rmap, *next;
+	struct llist_node *entry;
+	unsigned long *rmap;
+	unsigned long page;
+
+	for (page = 0; page < free->npages; page++) {
+		rmap = &free->arch.rmap[page];
+		entry = llist_del_all((struct llist_head *) rmap);
+
+		if (!entry)
+			continue;
+
+		llist_for_each_entry_safe(n_rmap, next, entry, list)
+			kfree(n_rmap);
+	}
+}
+
 static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
 					struct kvm_nested_guest *gp,
 					long gpa, int *shift_ret)
@@ -736,11 +820,13 @@  static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_memory_slot *memslot;
+	struct rmap_nested *n_rmap;
 	struct kvmppc_pte gpte;
 	pte_t pte, *pte_p;
 	unsigned long mmu_seq;
 	unsigned long dsisr = vcpu->arch.fault_dsisr;
 	unsigned long ea = vcpu->arch.fault_dar;
+	unsigned long *rmap;
 	unsigned long n_gpa, gpa, gfn, perm = 0UL;
 	unsigned int shift, l1_shift, level;
 	bool writing = !!(dsisr & DSISR_ISSTORE);
@@ -867,8 +953,16 @@  static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
 
 	/* 4. Insert the pte into our shadow_pgtable */
 
+	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+	if (!n_rmap)
+		return RESUME_GUEST; /* Let the guest try again */
+	n_rmap->l1_lpid = gp->l1_lpid;
+	n_rmap->n_gpa = n_gpa;
+	rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
-				mmu_seq, gp->shadow_lpid);
+				mmu_seq, gp->shadow_lpid, rmap, n_rmap);
+	if (ret)
+		kfree(n_rmap);
 	if (ret == -EAGAIN)
 		ret = RESUME_GUEST;	/* Let the guest try again */