[RFC] KVM: PPC: Book3S HV: add support for page faults in VM_IO|VM_PFNMAP vmas

Message ID 20180208160303.19016-1-clg@kaod.org (mailing list archive)
State New, archived

Commit Message

Cédric Le Goater Feb. 8, 2018, 4:03 p.m. UTC
On the POWER9 processor, the XIVE interrupt controller controls
interrupt sources using MMIO: to trigger events, to EOI, or to turn off
the sources. Priority management and interrupt acknowledgment are also
handled through MMIO in the presenter subengine.

These MMIO regions are exposed to guests in QEMU with a set of memory
mappings, similarly to VFIO, and it would be good to populate the VMAs
dynamically with the appropriate pages using a fault handler.
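
For illustration, the usual way a driver populates such a VM_IO |
VM_PFNMAP mapping on demand is a .fault handler that inserts the PFN
when the mapping is first touched. A minimal sketch, not from this
patch; the xive_esb_* names and the base PFN layout are hypothetical:

    /* Hypothetical sketch: names and PFN layout are made up. */
    static unsigned long xive_esb_base_pfn;	/* set at mmap time */

    static int xive_esb_fault(struct vm_fault *vmf)
    {
    	/* PFN of the MMIO page backing this offset of the mapping */
    	unsigned long pfn = xive_esb_base_pfn + vmf->pgoff;

    	if (vm_insert_pfn(vmf->vma, vmf->address, pfn))
    		return VM_FAULT_SIGBUS;

    	return VM_FAULT_NOPAGE;	/* PTE is installed, no struct page */
    }

    static const struct vm_operations_struct xive_esb_vm_ops = {
    	.fault = xive_esb_fault,
    };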

Largely inspired by Paolo's commit add6a0cd1c5b ("KVM: MMU: try to fix
up page faults before giving up"), this adds support for page faults
under KVM/PPC for memory mappings of host MMIO regions exposed to
guests.

If this is the right approach, could we externalize the
hva_to_pfn_remapped() routine so it can be used under kvm/ppc in both
the Radix tree and HPT MMU modes?

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

Comments

Benjamin Herrenschmidt Feb. 8, 2018, 10:19 p.m. UTC | #1
On Thu, 2018-02-08 at 17:03 +0100, Cédric Le Goater wrote:
>  
> +/*
> + * Stolen from virt/kvm/kvm_main.c
> + */

Just export it. It's annoying that we can't just use hva_to_pfn() ...

> +static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> +                              unsigned long addr, bool write_fault,
> +                              unsigned long *p_pfn)
> +{
> +       unsigned long pfn;
> +       int r;
> +
> +       r = follow_pfn(vma, addr, &pfn);
> +       if (r) {
> +               /*
> +                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> +                * not call the fault handler, so do it here.
> +                */
> +               bool unlocked = false;
> +
> +               r = fixup_user_fault(current, current->mm, addr,
> +                                    (write_fault ? FAULT_FLAG_WRITE : 0),
> +                                    &unlocked);
> +               if (unlocked)
> +                       return -EAGAIN;
> +               if (r)
> +                       return r;
> +
> +               r = follow_pfn(vma, addr, &pfn);
> +               if (r)
> +                       return r;
> +       }
> +
> +       /*
> +        * Get a reference here because callers of *hva_to_pfn* and
> +        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
> +        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
> +        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
> +        * simply do nothing for reserved pfns.
> +        *
> +        * Whoever called remap_pfn_range is also going to call e.g.
> +        * unmap_mapping_range before the underlying pages are freed,
> +        * causing a call to our MMU notifier.
> +        */
> +       kvm_get_pfn(pfn);
> +
> +       *p_pfn = pfn;
> +       return 0;
> +}
> +
>  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                    unsigned long ea, unsigned long dsisr)
>  {
> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                 vma = find_vma(current->mm, hva);
>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>                     (vma->vm_flags & VM_PFNMAP)) {
> -                       pfn = vma->vm_pgoff +
> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
> +                                                         &pfn);
> +                               if (ret == -EAGAIN)
> +                                       return RESUME_GUEST;
> +                       } else {
> +                               pfn = vma->vm_pgoff +
> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
> +                       }

I don't think the else case is useful. In fact you are checking
VM_PFNMAP twice...

>                         pgflags = pgprot_val(vma->vm_page_prot);
>                 }
>                 up_read(&current->mm->mmap_sem);
Cédric Le Goater Feb. 9, 2018, 7:14 a.m. UTC | #2
On 02/08/2018 11:19 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-02-08 at 17:03 +0100, Cédric Le Goater wrote:
>>  
>> +/*
>> + * Stolen from virt/kvm/kvm_main.c
>> + */
> 
> Just export it. It's annoying that we can't just use hva_to_pfn() ...

Yes. I might add a kvm_ prefix. I will check to see what the best
practice is.
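
For illustration, the export might look like this (a sketch; note the
upstream helper in virt/kvm/kvm_main.c also takes an async parameter
that this copy drops, which an export would need to reconcile):

    /* include/linux/kvm_host.h (sketch): */
    int kvm_hva_to_pfn_remapped(struct vm_area_struct *vma,
    			    unsigned long addr, bool write_fault,
    			    unsigned long *p_pfn);

    /* virt/kvm/kvm_main.c (sketch): rename, drop the static, then: */
    EXPORT_SYMBOL_GPL(kvm_hva_to_pfn_remapped);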

[ ... ]

>> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>                 vma = find_vma(current->mm, hva);
>>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>>                     (vma->vm_flags & VM_PFNMAP)) {
>> -                       pfn = vma->vm_pgoff +
>> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
>> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
>> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
>> +                                                         &pfn);
>> +                               if (ret == -EAGAIN)
>> +                                       return RESUME_GUEST;
>> +                       } else {
>> +                               pfn = vma->vm_pgoff +
>> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
>> +                       }
> 
> I don't think the else case is useful. In fact you are checking
> VM_PFNMAP twice...

Yes. I am not sure what the VM_PFNMAP case does. The code comes
from kvmppc_book3s_hv_page_fault(), I suppose.

Thanks,

C.
Paolo Bonzini Feb. 9, 2018, 3:01 p.m. UTC | #3
On 09/02/2018 08:14, Cédric Le Goater wrote:
>>> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>>                 vma = find_vma(current->mm, hva);
>>>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>>>                     (vma->vm_flags & VM_PFNMAP)) {
>>> -                       pfn = vma->vm_pgoff +
>>> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
>>> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
>>> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
>>> +                                                         &pfn);
>>> +                               if (ret == -EAGAIN)
>>> +                                       return RESUME_GUEST;
>>> +                       } else {
>>> +                               pfn = vma->vm_pgoff +
>>> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
>>> +                       }
>> I don't think the else case is useful. In fact you are checking
>> VM_PFNMAP twice...
> Yes. I am not sure what the VM_PFNMAP case does. The code comes
> from kvmppc_book3s_hv_page_fault(), I suppose.

Maybe the outer check should be for VM_IO|VM_PFNMAP?
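
For illustration, the fault path in the hunk above might then collapse
to something like this (a sketch; note that the -EAGAIN early return
also has to drop mmap_sem, which is still held at that point):

    		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
    		    (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
    			ret = hva_to_pfn_remapped(vma, hva, writing, &pfn);
    			if (ret == -EAGAIN) {
    				/* release the lock before retrying */
    				up_read(&current->mm->mmap_sem);
    				return RESUME_GUEST;
    			}
    			pgflags = pgprot_val(vma->vm_page_prot);
    		}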

(Also, why is this code reinventing most of hva_to_pfn?  Not asking you
to fix that of course, but perhaps there's something to improve in the
generic code too).

Thanks,

Paolo
Benjamin Herrenschmidt Feb. 9, 2018, 9:44 p.m. UTC | #4
On Fri, 2018-02-09 at 16:01 +0100, Paolo Bonzini wrote:
> > > I don't think the else case is useful. In fact you are checking
> > > VM_PFNMAP twice...
> > 
> > Yes. I am not sure what the VM_PFNMAP case does. The code comes
> > from kvmppc_book3s_hv_page_fault(), I suppose.
> 
> Maybe the outer check should be for VM_IO|VM_PFNMAP?
> 
> (Also, why is this code reinventing most of hva_to_pfn?  Not asking you
> to fix that of course, but perhaps there's something to improve in the
> generic code too).

I've been wondering the same thing, which led me to try to understand
hva_to_pfn, which resulted in skull marks in the wall ;-)

So hva_to_pfn does a complicated dance around 3 or 4 different variants
of gup, and it's really not obvious why; the "intent" doesn't appear to
be documented clearly. I think I somewhat figured out that it relates to
the async page fault stuff, but even then I don't actually know what
those are or what their purpose is :-)

But yeah, Paul's code more or less does its own thing with a simple
gup_fast, and is missing the hva_to_pfn_remapped case for VM_IO |
VM_PFNMAP. The short fix is to export hva_to_pfn_remapped and use it
via a fixed variant of Cédric's patch.

The longer term fix might be to understand if/how we can use
hva_to_pfn.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 9, 2018, 9:54 p.m. UTC | #5
On Fri, 2018-02-09 at 08:14 +0100, Cédric Le Goater wrote:
> > > @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> > >                  vma = find_vma(current->mm, hva);
> > >                  if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
> > >                      (vma->vm_flags & VM_PFNMAP)) {
> > > -                       pfn = vma->vm_pgoff +
> > > -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
> > > +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > > +                               ret = hva_to_pfn_remapped(vma, hva, writing,
> > > +                                                         &pfn);
> > > +                               if (ret == -EAGAIN)
> > > +                                       return RESUME_GUEST;
> > > +                       } else {
> > > +                               pfn = vma->vm_pgoff +
> > > +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
> > > +                       }
> > 
> > I don't think the else case is useful. In fact you are checking
> > VM_PFNMAP twice...
> 
> Yes. I am not sure what the VM_PFNMAP case does. The code comes
> from kvmppc_book3s_hv_page_fault(), I suppose.

You just need to reproduce the bottom of hva_to_pfn. I don't think
that PFNMAP case in Paul's code is quite right anyway; the generic
code will just go read the PTE, which is the right thing to do.
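
For reference, the bottom of hva_to_pfn in virt/kvm/kvm_main.c reads
roughly like this (a trimmed sketch of the generic code):

    retry:
    	vma = find_vma_intersection(current->mm, addr, addr + 1);

    	if (vma == NULL)
    		pfn = KVM_PFN_ERR_FAULT;
    	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
    		r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
    		if (r == -EAGAIN)
    			goto retry;
    		if (r < 0)
    			pfn = KVM_PFN_ERR_FAULT;
    	} else {
    		if (async && vma_is_valid(vma, write_fault))
    			*async = true;
    		pfn = KVM_PFN_ERR_FAULT;
    	}
    	up_read(&current->mm->mmap_sem);
    	return pfn;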

Cheers,
Ben.

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 58618f644c56..74e889575bf0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -291,6 +291,54 @@  static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	return ret;
 }
 
+/*
+ * Stolen from virt/kvm/kvm_main.c
+ */
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+			       unsigned long addr, bool write_fault,
+			       unsigned long *p_pfn)
+{
+	unsigned long pfn;
+	int r;
+
+	r = follow_pfn(vma, addr, &pfn);
+	if (r) {
+		/*
+		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+		 * not call the fault handler, so do it here.
+		 */
+		bool unlocked = false;
+
+		r = fixup_user_fault(current, current->mm, addr,
+				     (write_fault ? FAULT_FLAG_WRITE : 0),
+				     &unlocked);
+		if (unlocked)
+			return -EAGAIN;
+		if (r)
+			return r;
+
+		r = follow_pfn(vma, addr, &pfn);
+		if (r)
+			return r;
+	}
+
+	/*
+	 * Get a reference here because callers of *hva_to_pfn* and
+	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+	 * simply do nothing for reserved pfns.
+	 *
+	 * Whoever called remap_pfn_range is also going to call e.g.
+	 * unmap_mapping_range before the underlying pages are freed,
+	 * causing a call to our MMU notifier.
+	 */
+	kvm_get_pfn(pfn);
+
+	*p_pfn = pfn;
+	return 0;
+}
+
 int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   unsigned long ea, unsigned long dsisr)
 {
@@ -402,8 +450,15 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		vma = find_vma(current->mm, hva);
 		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
 		    (vma->vm_flags & VM_PFNMAP)) {
-			pfn = vma->vm_pgoff +
-				((hva - vma->vm_start) >> PAGE_SHIFT);
+			if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+				ret = hva_to_pfn_remapped(vma, hva, writing,
+							  &pfn);
+				if (ret == -EAGAIN)
+					return RESUME_GUEST;
+			} else {
+				pfn = vma->vm_pgoff +
+					((hva - vma->vm_start) >> PAGE_SHIFT);
+			}
 			pgflags = pgprot_val(vma->vm_page_prot);
 		}
 		up_read(&current->mm->mmap_sem);