| Message ID | 20250213200755.196832-3-imbrenda@linux.ibm.com (mailing list archive) |
|---|---|
| State | New |
| Series | KVM: s390: fix two newly introduced bugs |
On 13.02.25 21:07, Claudio Imbrenda wrote:
> Holding the pte lock for the page that is being converted to secure is
> needed to avoid races. A previous commit removed the locking, which
> caused issues. Fix by locking the pte again.
>
> Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")

If you found this because of my report about the changed locking,
consider adding a Suggested-by / Reported-by.

> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> ---
>  arch/s390/include/asm/uv.h |  2 +-
>  arch/s390/kernel/uv.c      | 19 +++++++++++++++++--
>  arch/s390/kvm/gmap.c       | 12 ++++++++----
>  3 files changed, 26 insertions(+), 7 deletions(-)

[...]

> @@ -127,8 +128,11 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
>
>  	page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
>  	mmap_read_lock(gmap->mm);
> -	if (page)
> -		rc = __gmap_make_secure(gmap, page, uvcb);
> +	vmaddr = gfn_to_hva(gmap->private, gpa_to_gfn(gaddr));
> +	if (kvm_is_error_hva(vmaddr))
> +		rc = -ENXIO;
> +	if (!rc && page)
> +		rc = __gmap_make_secure(gmap, page, vmaddr, uvcb);
>  	kvm_release_page_clean(page);
>  	mmap_read_unlock(gmap->mm);
>

You effectively make the code more complicated and inefficient than
before. Now you effectively walk the page table twice in the common
small-folio case ...

Can we just go back to the old handling that we had before here?
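For context on the "walk the page table twice" remark: in the posted version, gfn_to_page() already resolves the GFN to a page via a host page-table walk (GUP under the hood), and make_hva_secure() then walks the host page table a second time through get_locked_pte(). Below is a simplified sketch of the posted call path, with the folio locking and most error handling elided and the two walks marked; it mirrors the patch, it is not a proposed change.

```c
/*
 * Simplified sketch of the call path in the patch as posted, to show
 * where the two host page-table walks happen. Folio locking, refcount
 * checks and most error handling are elided.
 */
int gmap_make_secure_sketch(struct gmap *gmap, unsigned long gaddr, void *uvcb)
{
	struct kvm *kvm = gmap->private;
	unsigned long vmaddr;
	struct page *page;
	int rc = 0;

	/* Walk #1: resolve the GFN to a page (a GUP-style page-table walk). */
	page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
	mmap_read_lock(gmap->mm);
	vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
	if (kvm_is_error_hva(vmaddr))
		rc = -ENXIO;
	if (!rc && page)
		/* Walk #2: make_hva_secure() -> get_locked_pte() walks to the PTE again. */
		rc = __gmap_make_secure(gmap, page, vmaddr, uvcb);
	kvm_release_page_clean(page);
	mmap_read_unlock(gmap->mm);

	return rc;
}
```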
On 13.02.25 21:16, David Hildenbrand wrote:
> On 13.02.25 21:07, Claudio Imbrenda wrote:
>> Holding the pte lock for the page that is being converted to secure is
>> needed to avoid races. A previous commit removed the locking, which
>> caused issues. Fix by locking the pte again.
>>
>> Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
>
> If you found this because of my report about the changed locking,
> consider adding a Suggested-by / Reported-by.

[...]

> You effectively make the code more complicated and inefficient than
> before. Now you effectively walk the page table twice in the common
> small-folio case ...
>
> Can we just go back to the old handling that we had before here?

I'll note that there is still the possibility for a different race:
nothing guarantees that the page you looked up using gfn_to_hva() will
still be mapped when you perform the get_locked_pte(). Not sure what
would happen if a different page were mapped.

You could re-verify it is still there, but then, doing two page table
walks is still more than required in the common case where we can just
perform the conversion.
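One possible way to close that window, shown here only as a sketch: re-check under the PTL that the PTE still maps the folio that was looked up earlier, and return -EAGAIN so the caller retries if it does not. make_folio_secure_locked() is a hypothetical helper, not part of this patch.

```c
/*
 * Hypothetical sketch, assuming it sits next to __make_folio_secure() in
 * arch/s390/kernel/uv.c: re-verify under the PTE lock that the PTE still
 * maps the folio resolved earlier, and let the caller retry otherwise.
 */
static int make_folio_secure_locked(struct mm_struct *mm, unsigned long hva,
				    struct folio *expected_folio,
				    struct uv_cb_header *uvcb)
{
	spinlock_t *ptelock;
	pte_t *ptep;
	int rc;

	ptep = get_locked_pte(mm, hva, &ptelock);
	if (!ptep)
		return -ENXIO;
	if (!pte_present(*ptep) ||
	    page_folio(pte_page(*ptep)) != expected_folio)
		rc = -EAGAIN;	/* mapping changed under us, caller retries */
	else
		rc = __make_folio_secure(expected_folio, hva, uvcb);
	pte_unmap_unlock(ptep, ptelock);

	return rc;
}
```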
On Thu, 13 Feb 2025 21:16:03 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 13.02.25 21:07, Claudio Imbrenda wrote:
> > Holding the pte lock for the page that is being converted to secure is
> > needed to avoid races. A previous commit removed the locking, which
> > caused issues. Fix by locking the pte again.
> >
> > Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
>
> If you found this because of my report about the changed locking,
> consider adding a Suggested-by / Reported-by.

yes, sorry; I sent the patch in haste and forgot. Which one would you
prefer (or both?)

[...]

> > @@ -127,8 +128,11 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
> >
> >  	page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
> >  	mmap_read_lock(gmap->mm);
> > -	if (page)
> > -		rc = __gmap_make_secure(gmap, page, uvcb);
> > +	vmaddr = gfn_to_hva(gmap->private, gpa_to_gfn(gaddr));
> > +	if (kvm_is_error_hva(vmaddr))
> > +		rc = -ENXIO;
> > +	if (!rc && page)
> > +		rc = __gmap_make_secure(gmap, page, vmaddr, uvcb);
> >  	kvm_release_page_clean(page);
> >  	mmap_read_unlock(gmap->mm);
> >
>
> You effectively make the code more complicated and inefficient than
> before. Now you effectively walk the page table twice in the common
> small-folio case ...

I think in every case, but see below

> Can we just go back to the old handling that we had before here?

I'd rather not, this is needed to prepare for the next series (for
6.15) in a couple of weeks, where gmap gets completely removed from
s390/mm, and gmap dat tables will not share ptes with userspace anymore
(i.e. we will use mmu_notifiers, like all other archs)

I will remove the double walk, though, since there is no reason to keep
it in there
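As background on "we will use mmu_notifiers, like all other archs": instead of sharing PTEs with userspace, KVM keeps its own DAT tables and tears down the affected entries when the host mm changes. The planned s390 series is not shown here; the following is only a generic, illustrative sketch of the mmu_notifier registration pattern, with hypothetical names.

```c
#include <linux/mmu_notifier.h>

/* Illustrative only; names and the callback body are hypothetical. */
static int demo_invalidate_range_start(struct mmu_notifier *mn,
				       const struct mmu_notifier_range *range)
{
	/* Tear down / flush secondary mappings for [range->start, range->end). */
	return 0;
}

static const struct mmu_notifier_ops demo_mmu_notifier_ops = {
	.invalidate_range_start = demo_invalidate_range_start,
};

static struct mmu_notifier demo_mn = {
	.ops = &demo_mmu_notifier_ops,
};

static int demo_register(struct mm_struct *mm)
{
	/* Hooks demo_mn into mm so the callbacks fire on unmap, migration, etc. */
	return mmu_notifier_register(&demo_mn, mm);
}
```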
On 14.02.25 11:17, Claudio Imbrenda wrote:
> On Thu, 13 Feb 2025 21:16:03 +0100
> David Hildenbrand <david@redhat.com> wrote:
>
>> On 13.02.25 21:07, Claudio Imbrenda wrote:
>>> Holding the pte lock for the page that is being converted to secure is
>>> needed to avoid races. A previous commit removed the locking, which
>>> caused issues. Fix by locking the pte again.
>>>
>>> Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
>>
>> If you found this because of my report about the changed locking,
>> consider adding a Suggested-by / Reported-by.
>
> yes, sorry; I sent the patch in haste and forgot. Which one would you
> prefer (or both?)

Maybe Reported-by.

> [...]
>
>> You effectively make the code more complicated and inefficient than
>> before. Now you effectively walk the page table twice in the common
>> small-folio case ...
>
> I think in every case, but see below
>
>> Can we just go back to the old handling that we had before here?
>
> I'd rather not, this is needed to prepare for the next series (for
> 6.15) in a couple of weeks, where gmap gets completely removed from
> s390/mm, and gmap dat tables will not share ptes with userspace anymore
> (i.e. we will use mmu_notifiers, like all other archs)

I think for the conversion we would still:

GFN -> HVA

Walk to the folio mapped at HVA, lock the PTE and perform the conversion.

So even with memory notifiers, that should be fine, no?

So not necessarily "the old handling that we had before" but rather "the
old way of looking up what's mapped and performing the conversion under
the PTL".

For me to fix the refcount freezing properly on top of your work, we'll
need the PTL (esp. to exclude concurrent GUP-slow) etc.
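In terms of the helpers from this patch, the flow described above would look roughly like the sketch below, assuming make_hva_secure() keeps its posted signature and does the folio lookup, PTL and UV call internally. Note that, unlike the posted gmap_make_secure(), there is no separate gfn_to_page() lookup, so the common case needs only the one walk performed under the PTL.

```c
/*
 * Sketch only: GFN -> HVA, then conversion under the PTE lock.
 * make_hva_secure() is the helper introduced by this patch; the glue
 * around it (naming, locking placement) is illustrative.
 */
static int convert_gfn_to_secure(struct kvm *kvm, struct mm_struct *mm,
				 unsigned long gaddr, struct uv_cb_header *uvcb)
{
	unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
	int rc;

	if (kvm_is_error_hva(hva))
		return -ENXIO;

	mmap_read_lock(mm);
	/* Walks to the folio mapped at hva, takes the PTL, performs the UV call. */
	rc = make_hva_secure(mm, hva, uvcb);
	mmap_read_unlock(mm);

	return rc;
}
```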
On Fri, 14 Feb 2025 11:27:15 +0100
David Hildenbrand <david@redhat.com> wrote:

[...]

> I think for the conversion we would still:
>
> GFN -> HVA
>
> Walk to the folio mapped at HVA, lock the PTE and perform the conversion.
>
> So even with memory notifiers, that should be fine, no?

yes

> So not necessarily "the old handling that we had before" but rather "the
> old way of looking up what's mapped and performing the conversion under
> the PTL".

ahhh, yes

> For me to fix the refcount freezing properly on top of your work, we'll
> need the PTL (esp. to exclude concurrent GUP-slow) etc.

let's discuss this off-list
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index b11f5b6d0bd1..46fb0ef6f984 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -631,7 +631,7 @@ int uv_pin_shared(unsigned long paddr);
 int uv_destroy_folio(struct folio *folio);
 int uv_destroy_pte(pte_t pte);
 int uv_convert_from_secure_pte(pte_t pte);
-int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
+int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb);
 int uv_convert_from_secure(unsigned long paddr);
 int uv_convert_from_secure_folio(struct folio *folio);
 
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 9f05df2da2f7..de3c092da7b9 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -245,7 +245,7 @@ static int expected_folio_refs(struct folio *folio)
  * Context: The caller must hold exactly one extra reference on the folio
  *	    (it's the same logic as split_folio())
  */
-int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
+static int __make_folio_secure(struct folio *folio, unsigned long hva, struct uv_cb_header *uvcb)
 {
 	int expected, cc = 0;
 
@@ -277,7 +277,22 @@ int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 		return -EAGAIN;
 	return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
 }
-EXPORT_SYMBOL_GPL(make_folio_secure);
+
+int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb)
+{
+	spinlock_t *ptelock;
+	pte_t *ptep;
+	int rc;
+
+	ptep = get_locked_pte(mm, hva, &ptelock);
+	if (!ptep)
+		return -ENXIO;
+	rc = __make_folio_secure(page_folio(pte_page(*ptep)), hva, uvcb);
+	pte_unmap_unlock(ptep, ptelock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(make_hva_secure);
 
 /*
  * To be called with the folio locked or with an extra reference! This will
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index fc4d490d25a2..e56c0ab5fec7 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -55,7 +55,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
 	return atomic_read(&mm->context.protected_count) > 1;
 }
 
-static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
+static int __gmap_make_secure(struct gmap *gmap, struct page *page, unsigned long hva, void *uvcb)
 {
 	struct folio *folio = page_folio(page);
 	int rc;
@@ -83,7 +83,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
 		return -EAGAIN;
 	if (should_export_before_import(uvcb, gmap->mm))
 		uv_convert_from_secure(folio_to_phys(folio));
-	rc = make_folio_secure(folio, uvcb);
+	rc = make_hva_secure(gmap->mm, hva, uvcb);
 	folio_unlock(folio);
 
 	/*
@@ -120,6 +120,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
 int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
 {
 	struct kvm *kvm = gmap->private;
+	unsigned long vmaddr;
 	struct page *page;
 	int rc = 0;
 
@@ -127,8 +128,11 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
 
 	page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
 	mmap_read_lock(gmap->mm);
-	if (page)
-		rc = __gmap_make_secure(gmap, page, uvcb);
+	vmaddr = gfn_to_hva(gmap->private, gpa_to_gfn(gaddr));
+	if (kvm_is_error_hva(vmaddr))
+		rc = -ENXIO;
+	if (!rc && page)
+		rc = __gmap_make_secure(gmap, page, vmaddr, uvcb);
 	kvm_release_page_clean(page);
 	mmap_read_unlock(gmap->mm);
 
Holding the pte lock for the page that is being converted to secure is
needed to avoid races. A previous commit removed the locking, which
caused issues. Fix by locking the pte again.

Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 arch/s390/include/asm/uv.h |  2 +-
 arch/s390/kernel/uv.c      | 19 +++++++++++++++++--
 arch/s390/kvm/gmap.c       | 12 ++++++++----
 3 files changed, 26 insertions(+), 7 deletions(-)