diff mbox series

[v5,05/14] KVM: s390: pv: leak the topmost page table when destroy fails

Message ID 20210920132502.36111-6-imbrenda@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series KVM: s390: pv: implement lazy destroy for reboot | expand

Commit Message

Claudio Imbrenda Sept. 20, 2021, 1:24 p.m. UTC
Each secure guest must have a unique address space control element (ASCE),
and we must prevent new guests from using the same ASCE, to avoid errors.
Since the ASCE mostly consists of the topmost page table address (and
flags), we must not return that memory to the pool unless the ASCE is
no longer in use.

Only a successful Destroy Secure Configuration UVC will make the ASCE
reusable again. If the Destroy Configuration UVC fails, the page holding
the topmost page table cannot be reused for a secure guest (either as an
ASCE or for other memory areas). To avoid a collision, it must not be used
again.

This is a permanent error and the page becomes unusable in practice, so
we set it aside and leak it. On failure we already leak other memory
that belongs to the ultravisor (i.e. the variable and base storage for
a guest) and not leaking the topmost page table was an oversight.

This error should not happen unless the hardware is broken or KVM has
some unknown serious bug.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 arch/s390/include/asm/gmap.h |  2 ++
 arch/s390/kvm/pv.c           |  4 ++-
 arch/s390/mm/gmap.c          | 55 ++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+), 1 deletion(-)

Comments

Janosch Frank Oct. 12, 2021, 7:58 a.m. UTC | #1
On 9/20/21 15:24, Claudio Imbrenda wrote:
> Each secure guest must have a unique address space control element and
> we must avoid that new guests use the same ASCE, to avoid errors.
> Since the ASCE mostly consists of the topmost page table address (and
> flags), we must not return that memory to the pool unless the ASCE is
> no longer in use.
> 
> Only a successful Destroy Secure Configuration UVC will make the ASCE
> reusable again. If the Destroy Configuration UVC fails, the ASCE
> cannot be reused for a secure guest (either for the ASCE or for other
> memory areas). To avoid a collision, it must not be used again.
> 
> This is a permanent error and the page becomes in practice unusable, so
> we set it aside and leak it. On failure we already leak other memory
> that belongs to the ultravisor (i.e. the variable and base storage for
> a guest) and not leaking the topmost page table was an oversight.
> 
> This error should not happen unless the hardware is broken or KVM has
> some unknown serious bug.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>

Fixes tag?

> ---
>   arch/s390/include/asm/gmap.h |  2 ++
>   arch/s390/kvm/pv.c           |  4 ++-
>   arch/s390/mm/gmap.c          | 55 ++++++++++++++++++++++++++++++++++++
>   3 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index 40264f60b0da..746e18bf8984 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -148,4 +148,6 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
>   			     unsigned long gaddr, unsigned long vmaddr);
>   int gmap_mark_unmergeable(void);
>   void s390_reset_acc(struct mm_struct *mm);
> +void s390_remove_old_asce(struct gmap *gmap);
> +int s390_replace_asce(struct gmap *gmap);
>   #endif /* _ASM_S390_GMAP_H */
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 00d272d134c2..76b0d64ce8fa 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -168,9 +168,11 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	atomic_set(&kvm->mm->context.is_protected, 0);
>   	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
>   	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
> -	/* Inteded memory leak on "impossible" error */
> +	/* Intended memory leak on "impossible" error */

Rather unrelated

>   	if (!cc)
>   		kvm_s390_pv_dealloc_vm(kvm);
> +	else
> +		s390_replace_asce(kvm->arch.gmap);
>   	return cc ? -EIO : 0;

Might make more sense now to do an early return so we don't have the 
ternary if here.

>   }
>   
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 9bb2c7512cd5..5a138f6220c4 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -2706,3 +2706,58 @@ void s390_reset_acc(struct mm_struct *mm)
>   	mmput(mm);
>   }
>   EXPORT_SYMBOL_GPL(s390_reset_acc);
> +
> +/*
> + * Remove the topmost level of page tables from the list of page tables of
> + * the gmap.
> + * This means that it will not be freed when the VM is torn down, and needs
> + * to be handled separately by the caller, unless an intentional leak is
> + * intended.
> + */
> +void s390_remove_old_asce(struct gmap *gmap)
> +{
> +	struct page *old;
> +
> +	old = virt_to_page(gmap->table);
> +	spin_lock(&gmap->guest_table_lock);
> +	list_del(&old->lru);
> +	spin_unlock(&gmap->guest_table_lock);
> +	/* in case the ASCE needs to be "removed" multiple times */
> +	INIT_LIST_HEAD(&old->lru);
> +}
> +EXPORT_SYMBOL_GPL(s390_remove_old_asce);

Is this used anywhere else than below?
This can be static, no?

> +
> +/*
> + * Try to replace the current ASCE with another equivalent one.
> + * If the allocation of the new top level page table fails, the ASCE is not
> + * replaced.
> + * In any case, the old ASCE is removed from the list, therefore the caller
> + * has to make sure to save a pointer to it beforehands, unless an
> + * intentional leak is intended.
> + */
> +int s390_replace_asce(struct gmap *gmap)
> +{
> +	unsigned long asce;
> +	struct page *page;
> +	void *table;
> +
> +	s390_remove_old_asce(gmap);
> +
> +	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
> +	if (!page)
> +		return -ENOMEM;
> +	table = page_to_virt(page);
> +	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
> +
> +	spin_lock(&gmap->guest_table_lock);
> +	list_add(&page->lru, &gmap->crst_list);
> +	spin_unlock(&gmap->guest_table_lock);
> +
> +	asce = (gmap->asce & ~PAGE_MASK) | __pa(table);
> +	WRITE_ONCE(gmap->asce, asce);

Are you sure we don't need the mm in write lock?

> +	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);

This is usually changed with the context lock held.

> +	WRITE_ONCE(gmap->table, table);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(s390_replace_asce);
>
Claudio Imbrenda Oct. 12, 2021, 8:33 a.m. UTC | #2
On Tue, 12 Oct 2021 09:58:19 +0200
Janosch Frank <frankja@linux.ibm.com> wrote:

> On 9/20/21 15:24, Claudio Imbrenda wrote:
> > Each secure guest must have a unique address space control element and
> > we must avoid that new guests use the same ASCE, to avoid errors.
> > Since the ASCE mostly consists of the topmost page table address (and
> > flags), we must not return that memory to the pool unless the ASCE is
> > no longer in use.
> > 
> > Only a successful Destroy Secure Configuration UVC will make the ASCE
> > reusable again. If the Destroy Configuration UVC fails, the ASCE
> > cannot be reused for a secure guest (either for the ASCE or for other
> > memory areas). To avoid a collision, it must not be used again.
> > 
> > This is a permanent error and the page becomes in practice unusable, so
> > we set it aside and leak it. On failure we already leak other memory
> > that belongs to the ultravisor (i.e. the variable and base storage for
> > a guest) and not leaking the topmost page table was an oversight.
> > 
> > This error should not happen unless the hardware is broken or KVM has
> > some unknown serious bug.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>  
> 
> Fixes tag?

will add

> 
> > ---
> >   arch/s390/include/asm/gmap.h |  2 ++
> >   arch/s390/kvm/pv.c           |  4 ++-
> >   arch/s390/mm/gmap.c          | 55 ++++++++++++++++++++++++++++++++++++
> >   3 files changed, 60 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> > index 40264f60b0da..746e18bf8984 100644
> > --- a/arch/s390/include/asm/gmap.h
> > +++ b/arch/s390/include/asm/gmap.h
> > @@ -148,4 +148,6 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
> >   			     unsigned long gaddr, unsigned long vmaddr);
> >   int gmap_mark_unmergeable(void);
> >   void s390_reset_acc(struct mm_struct *mm);
> > +void s390_remove_old_asce(struct gmap *gmap);
> > +int s390_replace_asce(struct gmap *gmap);
> >   #endif /* _ASM_S390_GMAP_H */
> > diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> > index 00d272d134c2..76b0d64ce8fa 100644
> > --- a/arch/s390/kvm/pv.c
> > +++ b/arch/s390/kvm/pv.c
> > @@ -168,9 +168,11 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   	atomic_set(&kvm->mm->context.is_protected, 0);
> >   	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
> >   	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
> > -	/* Inteded memory leak on "impossible" error */
> > +	/* Intended memory leak on "impossible" error */  
> 
> Rather unrelated

it's a typo, might as well fix it here, since I'm touching this function

> >   	if (!cc)
> >   		kvm_s390_pv_dealloc_vm(kvm);
> > +	else
> > +		s390_replace_asce(kvm->arch.gmap);
> >   	return cc ? -EIO : 0;  
> 
> Might make more sense now to do an early return so we don't have the 
> ternary if here.

will do

> >   }
> >   
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 9bb2c7512cd5..5a138f6220c4 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> > @@ -2706,3 +2706,58 @@ void s390_reset_acc(struct mm_struct *mm)
> >   	mmput(mm);
> >   }
> >   EXPORT_SYMBOL_GPL(s390_reset_acc);
> > +
> > +/*
> > + * Remove the topmost level of page tables from the list of page tables of
> > + * the gmap.
> > + * This means that it will not be freed when the VM is torn down, and needs
> > + * to be handled separately by the caller, unless an intentional leak is
> > + * intended.
> > + */
> > +void s390_remove_old_asce(struct gmap *gmap)
> > +{
> > +	struct page *old;
> > +
> > +	old = virt_to_page(gmap->table);
> > +	spin_lock(&gmap->guest_table_lock);
> > +	list_del(&old->lru);
> > +	spin_unlock(&gmap->guest_table_lock);
> > +	/* in case the ASCE needs to be "removed" multiple times */
> > +	INIT_LIST_HEAD(&old->lru);
> > +}
> > +EXPORT_SYMBOL_GPL(s390_remove_old_asce);  
> 
> Is this used anywhere else than below?
> This can be static, no?

it's used in KVM in the subsequent patches of this series.

should I add the export when KVM needs the function, instead?

> > +
> > +/*
> > + * Try to replace the current ASCE with another equivalent one.
> > + * If the allocation of the new top level page table fails, the ASCE is not
> > + * replaced.
> > + * In any case, the old ASCE is removed from the list, therefore the caller
> > + * has to make sure to save a pointer to it beforehands, unless an
> > + * intentional leak is intended.
> > + */
> > +int s390_replace_asce(struct gmap *gmap)
> > +{
> > +	unsigned long asce;
> > +	struct page *page;
> > +	void *table;
> > +
> > +	s390_remove_old_asce(gmap);
> > +
> > +	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
> > +	if (!page)
> > +		return -ENOMEM;
> > +	table = page_to_virt(page);
> > +	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
> > +
> > +	spin_lock(&gmap->guest_table_lock);
> > +	list_add(&page->lru, &gmap->crst_list);
> > +	spin_unlock(&gmap->guest_table_lock);
> > +
> > +	asce = (gmap->asce & ~PAGE_MASK) | __pa(table);
> > +	WRITE_ONCE(gmap->asce, asce);  
> 
> Are you sure we don't need the mm in write lock?
> 
> > +	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);  
> 
> This is usually changed with the context lock held.

I had thought about it, and I realized that probably we would not need
it. The guest is not running at this point, and we are replacing an
ASCE with a different one pointing to the same page tables.

I can of course add a lock if you think it looks nicer, we are not in a
fast path after all

> > +	WRITE_ONCE(gmap->table, table);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(s390_replace_asce);
> >   
>
diff mbox series

Patch

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 40264f60b0da..746e18bf8984 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -148,4 +148,6 @@  void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
 			     unsigned long gaddr, unsigned long vmaddr);
 int gmap_mark_unmergeable(void);
 void s390_reset_acc(struct mm_struct *mm);
+void s390_remove_old_asce(struct gmap *gmap);
+int s390_replace_asce(struct gmap *gmap);
 #endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 00d272d134c2..76b0d64ce8fa 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -168,9 +168,11 @@  int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	atomic_set(&kvm->mm->context.is_protected, 0);
 	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
 	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
-	/* Inteded memory leak on "impossible" error */
+	/* Intended memory leak on "impossible" error */
 	if (!cc)
 		kvm_s390_pv_dealloc_vm(kvm);
+	else
+		s390_replace_asce(kvm->arch.gmap);
 	return cc ? -EIO : 0;
 }
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 9bb2c7512cd5..5a138f6220c4 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2706,3 +2706,58 @@  void s390_reset_acc(struct mm_struct *mm)
 	mmput(mm);
 }
 EXPORT_SYMBOL_GPL(s390_reset_acc);
+
+/*
+ * Remove the topmost level of page tables from the list of page tables of
+ * the gmap.
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless the caller intends to
+ * leak it.
+ */
+void s390_remove_old_asce(struct gmap *gmap)
+{
+	struct page *old;
+
+	old = virt_to_page(gmap->table);
+	spin_lock(&gmap->guest_table_lock);
+	list_del(&old->lru);
+	spin_unlock(&gmap->guest_table_lock);
+	/* in case the ASCE needs to be "removed" multiple times */
+	INIT_LIST_HEAD(&old->lru);
+}
+EXPORT_SYMBOL_GPL(s390_remove_old_asce);
+
+/*
+ * Try to replace the current ASCE with another equivalent one.
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced and -ENOMEM is returned; on success 0 is returned.
+ * In any case, the old ASCE is removed from the list, therefore the caller
+ * has to make sure to save a pointer to it beforehand, unless the caller
+ * intends to leak it.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+	unsigned long asce;
+	struct page *page;
+	void *table;
+
+	s390_remove_old_asce(gmap);
+
+	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+	if (!page)
+		return -ENOMEM;
+	table = page_to_virt(page);
+	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
+	spin_lock(&gmap->guest_table_lock);
+	list_add(&page->lru, &gmap->crst_list);
+	spin_unlock(&gmap->guest_table_lock);
+
+	asce = (gmap->asce & ~PAGE_MASK) | __pa(table);
+	WRITE_ONCE(gmap->asce, asce);
+	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+	WRITE_ONCE(gmap->table, table);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(s390_replace_asce);