
[2/3] KVM: arm/arm64: Re-check VMA on detecting a poisoned page

Message ID 20191211165651.7889-3-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm/arm64: user_mem_abort() assorted fixes

Commit Message

Marc Zyngier Dec. 11, 2019, 4:56 p.m. UTC
When we check for a poisoned page, we use the VMA to tell userspace
about the looming disaster. But we pass a pointer to this VMA
after having released the mmap_sem, which isn't a good idea.

Instead, re-check that we still have a VMA, and that this
VMA still points to a poisoned page. If the VMA isn't there,
userspace is playing with our nerves, so let's give it a -EFAULT
(it deserves it). If the PFN isn't poisoned anymore, let's restart
from the top and handle the fault again.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 virt/kvm/arm/mmu.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

Comments

Marc Zyngier Dec. 12, 2019, 11:33 a.m. UTC | #1
On 2019-12-11 16:56, Marc Zyngier wrote:
> When we check for a poisoned page, we use the VMA to tell userspace
> about the looming disaster. But we pass a pointer to this VMA
> after having released the mmap_sem, which isn't a good idea.
>
> Instead, re-check that we still have a VMA, and that this
> VMA still points to a poisoned page. If the VMA isn't there,
> userspace is playing with our nerves, so let's give it a -EFAULT
> (it deserves it). If the PFN isn't poisoned anymore, let's restart
> from the top and handle the fault again.
>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  virt/kvm/arm/mmu.c | 25 +++++++++++++++++++++++--
>  1 file changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 0b32a904a1bb..f73393f5ddb7 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1741,9 +1741,30 @@ static int user_mem_abort(struct kvm_vcpu
> *vcpu, phys_addr_t fault_ipa,
>
>  	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
> -		kvm_send_hwpoison_signal(hva, vma);
> -		return 0;
> +		/*
> +		 * Search for the VMA again, as it may have been
> +		 * removed in the interval...
> +		 */
> +		down_read(&current->mm->mmap_sem);
> +		vma = find_vma_intersection(current->mm, hva, hva + 1);
> +		if (vma) {
> +			/*
> +			 * Recheck for a poisoned page. If something changed
> +			 * behind our back, don't do a thing and take the
> +			 * fault again.
> +			 */
> +			pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
> +			if (pfn == KVM_PFN_ERR_HWPOISON)
> +				kvm_send_hwpoison_signal(hva, vma);
> +
> +			ret = 0;
> +		} else {
> +			ret = -EFAULT;
> +		}
> +		up_read(&current->mm->mmap_sem);
> +		return ret;
>  	}
> +
>  	if (is_error_noslot_pfn(pfn))
>  		return -EFAULT;

Revisiting this, I wonder if we're not better off just holding the mmap_sem
for a bit longer. Something like:

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 0b32a904a1bb..87d416d000c6 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1719,13 +1719,13 @@ static int user_mem_abort(struct kvm_vcpu 
*vcpu, phys_addr_t fault_ipa,
  	if (vma_pagesize == PMD_SIZE ||
  	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
  		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	up_read(&current->mm->mmap_sem);
-
  	/* We need minimum second+third level pages */
  	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
  				     KVM_NR_MEM_OBJS);
-	if (ret)
+	if (ret) {
+		up_read(&current->mm->mmap_sem);
  		return ret;
+	}

  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
  	/*
@@ -1742,8 +1742,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
  	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
  	if (pfn == KVM_PFN_ERR_HWPOISON) {
  		kvm_send_hwpoison_signal(hva, vma);
+		up_read(&current->mm->mmap_sem);
  		return 0;
  	}
+
+	up_read(&current->mm->mmap_sem);
+
  	if (is_error_noslot_pfn(pfn))
  		return -EFAULT;


James, what do you think?

         M.
James Morse Dec. 12, 2019, 3:34 p.m. UTC | #2
Hi Marc,

On 12/12/2019 11:33, Marc Zyngier wrote:
> On 2019-12-11 16:56, Marc Zyngier wrote:
>> When we check for a poisoned page, we use the VMA to tell userspace
>> about the looming disaster. But we pass a pointer to this VMA
>> after having released the mmap_sem, which isn't a good idea.

Sounds like a bug! The vma-size might not match the poisoned pfn.


>> Instead, re-check that we still have a VMA, and that this
>> VMA still points to a poisoned page. If the VMA isn't there,
>> userspace is playing with our nerves, so let's give it a -EFAULT
>> (it deserves it). If the PFN isn't poisoned anymore, let's restart
>> from the top and handle the fault again.


>>  virt/kvm/arm/mmu.c | 25 +++++++++++++++++++++++--
>>  1 file changed, 23 insertions(+), 2 deletions(-)

... yeah ...

>> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
>> index 0b32a904a1bb..f73393f5ddb7 100644
>> --- a/virt/kvm/arm/mmu.c
>> +++ b/virt/kvm/arm/mmu.c
>> @@ -1741,9 +1741,30 @@ static int user_mem_abort(struct kvm_vcpu
>> *vcpu, phys_addr_t fault_ipa,
>>
>>      pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>>      if (pfn == KVM_PFN_ERR_HWPOISON) {
>> -        kvm_send_hwpoison_signal(hva, vma);
>> -        return 0;
>> +        /*
>> +         * Search for the VMA again, as it may have been
>> +         * removed in the interval...
>> +         */
>> +        down_read(&current->mm->mmap_sem);
>> +        vma = find_vma_intersection(current->mm, hva, hva + 1);
>> +        if (vma) {
>> +            /*
>> +             * Recheck for a poisoned page. If something changed
>> +             * behind our back, don't do a thing and take the
>> +             * fault again.
>> +             */
>> +            pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>> +            if (pfn == KVM_PFN_ERR_HWPOISON)
>> +                kvm_send_hwpoison_signal(hva, vma);
>> +
>> +            ret = 0;
>> +        } else {
>> +            ret = -EFAULT;
>> +        }
>> +        up_read(&current->mm->mmap_sem);
>> +        return ret;
>>      }
>> +
>>      if (is_error_noslot_pfn(pfn))
>>          return -EFAULT;


> Revisiting this, I wonder if we're not better off just holding the mmap_sem
> for a bit longer. Something like:
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 0b32a904a1bb..87d416d000c6 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1719,13 +1719,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t
> fault_ipa,
>      if (vma_pagesize == PMD_SIZE ||
>          (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
>          gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
> -    up_read(&current->mm->mmap_sem);
> -
>      /* We need minimum second+third level pages */
>      ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
>                       KVM_NR_MEM_OBJS);
> -    if (ret)
> +    if (ret) {
> +        up_read(&current->mm->mmap_sem);
>          return ret;
> +    }
> 
>      mmu_seq = vcpu->kvm->mmu_notifier_seq;
>      /*
> @@ -1742,8 +1742,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t
> fault_ipa,
>      pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>      if (pfn == KVM_PFN_ERR_HWPOISON) {
>          kvm_send_hwpoison_signal(hva, vma);
> +        up_read(&current->mm->mmap_sem);
>          return 0;
>      }
> +
> +    up_read(&current->mm->mmap_sem);
> +
>      if (is_error_noslot_pfn(pfn))
>          return -EFAULT;
> 
> 
> James, what do you think?

(allocating from a kmemcache while holding current's mmap_sem. I don't want to think about
it!)

Can we be lazier? We want the VMA to get the size of the poisoned mapping correct in the
signal. The bug is that this could change when we drop the lock, before queuing the
signal, so we report hwpoison on the old vma's pfn with the new vma's size.

Can't it equally change when we drop the lock after queuing the signal? Any time before
the thread returns to user-space to take the signal gives us a stale value.

I think all that matters is the size goes with the pfn that was poisoned. If we look the
vma up by hva again, we have to check if the pfn has changed too... (which you are doing)

Can we stash the size in the existing mmap_sem region, and use that in
kvm_send_hwpoison_signal()? We know it matches the pfn we saw as poisoned.

The vma could be changed before/after we send the signal, but user-space can't know which.
This is user-space's problem for messing with the memslots while a vcpu is running.


How about (untested):
-------------------------%<-------------------------
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 38b4c910b6c3..80212d4935bd 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1591,16 +1591,8 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned
long size)
        __invalidate_icache_guest_page(pfn, size);
 }

-static void kvm_send_hwpoison_signal(unsigned long address,
-                                    struct vm_area_struct *vma)
+static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
 {
-       short lsb;
-
-       if (is_vm_hugetlb_page(vma))
-               lsb = huge_page_shift(hstate_vma(vma));
-       else
-               lsb = PAGE_SHIFT;
-
        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }

@@ -1673,6 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
        struct vm_area_struct *vma;
+       short stage1_vma_size;
        kvm_pfn_t pfn;
        pgprot_t mem_type = PAGE_S2;
        bool logging_active = memslot_is_logging(memslot);

@@ -1703,6 +1696,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                vma_pagesize = PAGE_SIZE;
        }

+       /* For signals due to hwpoison, we need to use the stage1 size */
+       if (is_vm_hugetlb_page(vma))
+               stage1_vma_size = huge_page_shift(hstate_vma(vma));
+       else
+               stage1_vma_size = PAGE_SHIFT;
+
        /*
         * The stage2 has a minimum of 2 level table (For arm64 see
         * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
@@ -1735,7 +1734,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,

        pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
-               kvm_send_hwpoison_signal(hva, vma);
+               kvm_send_hwpoison_signal(hva, stage1_vma_size);
                return 0;
        }
        if (is_error_noslot_pfn(pfn))
-------------------------%<-------------------------

It's possible this could even be the original output of vma_kernel_pagesize()... (Punit
supplied the original huge_page_shift(hstate_vma()) runes...)



Thanks,

James
Marc Zyngier Dec. 12, 2019, 3:40 p.m. UTC | #3
Hi James,

On 2019-12-12 15:34, James Morse wrote:
> Hi Marc,
>
> On 12/12/2019 11:33, Marc Zyngier wrote:
>> On 2019-12-11 16:56, Marc Zyngier wrote:
>>> When we check for a poisoned page, we use the VMA to tell userspace
>>> about the looming disaster. But we pass a pointer to this VMA
>>> after having released the mmap_sem, which isn't a good idea.
>
> Sounds like a bug! The vma-size might not match the poisoned pfn.
>
>
>>> Instead, re-check that we still have a VMA, and that this
>>> VMA still points to a poisoned page. If the VMA isn't there,
>>> userspace is playing with our nerves, so let's give it a -EFAULT
>>> (it deserves it). If the PFN isn't poisoned anymore, let's restart
>>> from the top and handle the fault again.
>
>
>>>  virt/kvm/arm/mmu.c | 25 +++++++++++++++++++++++--
>>>  1 file changed, 23 insertions(+), 2 deletions(-)
>
> ... yeah ...
>

[...]

> How about (untested):
> -------------------------%<-------------------------
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..80212d4935bd 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1591,16 +1591,8 @@ static void
> invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned
> long size)
>         __invalidate_icache_guest_page(pfn, size);
>  }
>
> -static void kvm_send_hwpoison_signal(unsigned long address,
> -                                    struct vm_area_struct *vma)
> +static void kvm_send_hwpoison_signal(unsigned long address, short 
> lsb)
>  {
> -       short lsb;
> -
> -       if (is_vm_hugetlb_page(vma))
> -               lsb = huge_page_shift(hstate_vma(vma));
> -       else
> -               lsb = PAGE_SHIFT;
> -
>         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, 
> current);
>  }
>
> @@ -1673,6 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu
> *vcpu, phys_addr_t fault_ipa,
>         struct kvm *kvm = vcpu->kvm;
>         struct kvm_mmu_memory_cache *memcache = 
> &vcpu->arch.mmu_page_cache;
>         struct vm_area_struct *vma;
> +       short stage1_vma_size;
>         kvm_pfn_t pfn;
>         pgprot_t mem_type = PAGE_S2;
>         bool logging_active = memslot_is_logging(memslot);
>
> @@ -1703,6 +1696,12 @@ static int user_mem_abort(struct kvm_vcpu
> *vcpu, phys_addr_t fault_ipa,
>                 vma_pagesize = PAGE_SIZE;
>         }
>
> +       /* For signals due to hwpoison, we need to use the stage1 
> size */
> +       if (is_vm_hugetlb_page(vma))
> +               stage1_vma_size = huge_page_shift(hstate_vma(vma));
> +       else
> +               stage1_vma_size = PAGE_SHIFT;
> +
>         /*
>          * The stage2 has a minimum of 2 level table (For arm64 see
>          * kvm_arm_setup_stage2()). Hence, we are guaranteed that we 
> can
> @@ -1735,7 +1734,7 @@ static int user_mem_abort(struct kvm_vcpu
> *vcpu, phys_addr_t fault_ipa,
>
>         pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>         if (pfn == KVM_PFN_ERR_HWPOISON) {
> -               kvm_send_hwpoison_signal(hva, vma);
> +               kvm_send_hwpoison_signal(hva, stage1_vma_size);
>                 return 0;
>         }
>         if (is_error_noslot_pfn(pfn))
> -------------------------%<-------------------------
>
> It's possible this could even be the original output of
> vma_kernel_pagesize()... (Punit supplied the original
> huge_page_shift(hstate_vma()) runes...)

I'd be happy with something along these lines. Any chance you could send
a proper patch?

Thanks,

         M.
Christoffer Dall Dec. 13, 2019, 9:22 a.m. UTC | #4
On Wed, Dec 11, 2019 at 04:56:49PM +0000, Marc Zyngier wrote:
> When we check for a poisoned page, we use the VMA to tell userspace
> about the looming disaster. But we pass a pointer to this VMA
> after having released the mmap_sem, which isn't a good idea.
> 
> Instead, re-check that we still have a VMA, and that this
> VMA still points to a poisoned page. If the VMA isn't there,
> userspace is playing with our nerves, so let's give it a -EFAULT
> (it deserves it). If the PFN isn't poisoned anymore, let's restart
> from the top and handle the fault again.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  virt/kvm/arm/mmu.c | 25 +++++++++++++++++++++++--
>  1 file changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 0b32a904a1bb..f73393f5ddb7 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1741,9 +1741,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  
>  	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
> -		kvm_send_hwpoison_signal(hva, vma);
> -		return 0;
> +		/*
> +		 * Search for the VMA again, as it may have been
> +		 * removed in the interval...
> +		 */
> +		down_read(&current->mm->mmap_sem);
> +		vma = find_vma_intersection(current->mm, hva, hva + 1);
> +		if (vma) {
> +			/*
> +			 * Recheck for a poisoned page. If something changed
> +			 * behind our back, don't do a thing and take the
> +			 * fault again.
> +			 */
> +			pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
> +			if (pfn == KVM_PFN_ERR_HWPOISON)
> +				kvm_send_hwpoison_signal(hva, vma);
> +
> +			ret = 0;
> +		} else {
> +			ret = -EFAULT;
> +		}
> +		up_read(&current->mm->mmap_sem);
> +		return ret;
>  	}
> +
>  	if (is_error_noslot_pfn(pfn))
>  		return -EFAULT;
>  
> -- 
> 2.20.1
> 

If I read this code correctly, then all we use the VMA for is to find
the page size used by the MMU to back the VMA, which we've already
established in the vma_pagesize (and possibly adjusted to something more
accurate based on our constraints in stage 2 which generated the error),
so all we need is the size and a way to convert that into a shift.

Not being 100% confident about the semantics of the lsb bit we pass to
user space (is it indicating the size of the mapping which caused the
error or the size of the mapping where user space could potentially
trigger an error?), or whether we care enough at that level, could we
consider something like the following instead?

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 38b4c910b6c3..2509d9dec42d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1592,15 +1592,9 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address,
-				     struct vm_area_struct *vma)
+				     unsigned long vma_pagesize)
 {
-	short lsb;
-
-	if (is_vm_hugetlb_page(vma))
-		lsb = huge_page_shift(hstate_vma(vma));
-	else
-		lsb = PAGE_SHIFT;
-
+	short lsb = __ffs(vma_pagesize);
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
@@ -1735,7 +1729,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(hva, vma);
+		kvm_send_hwpoison_signal(hva, vma_pagesize);
 		return 0;
 	}
 	if (is_error_noslot_pfn(pfn))


Thanks,

    Christoffer
Christoffer Dall Dec. 13, 2019, 9:25 a.m. UTC | #5
Hi James,

On Thu, Dec 12, 2019 at 03:34:31PM +0000, James Morse wrote:
> Hi Marc,
> 
> On 12/12/2019 11:33, Marc Zyngier wrote:
> > On 2019-12-11 16:56, Marc Zyngier wrote:

[...]

> 
> (allocating from a kmemcache while holding current's mmap_sem. I don't want to think about
> it!)
> 
> Can we be lazier? We want the VMA to get the size of the poisoned mapping correct in the
> signal. The bug is that this could change when we drop the lock, before queuing the
> signal, so we report hwpoison on the old vma's pfn with the new vma's size.
> 
> Can't it equally change when we drop the lock after queuing the signal? Any time before
> the thread returns to user-space to take the signal gives us a stale value.
> 
> I think all that matters is the size goes with the pfn that was poisoned. If we look the
> vma up by hva again, we have to check if the pfn has changed too... (which you are doing)
> 
> Can we stash the size in the existing mmap_sem region, and use that in
> kvm_send_hwpoison_signal()? We know it matches the pfn we saw as poisoned.
> 
> The vma could be changed before/after we send the signal, but user-space can't know which.
> This is user-space's problem for messing with the memslots while a vcpu is running.
> 

(I should clearly have expanded this thread before I replied to the
original patch...)

> 
> How about (untested):
> -------------------------%<-------------------------
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..80212d4935bd 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1591,16 +1591,8 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned
> long size)
>         __invalidate_icache_guest_page(pfn, size);
>  }
> 
> -static void kvm_send_hwpoison_signal(unsigned long address,
> -                                    struct vm_area_struct *vma)
> +static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
>  {
> -       short lsb;
> -
> -       if (is_vm_hugetlb_page(vma))
> -               lsb = huge_page_shift(hstate_vma(vma));
> -       else
> -               lsb = PAGE_SHIFT;
> -
>         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
>  }
> 
> @@ -1673,6 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         struct kvm *kvm = vcpu->kvm;
>         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>         struct vm_area_struct *vma;
> +       short stage1_vma_size;
>         kvm_pfn_t pfn;
>         pgprot_t mem_type = PAGE_S2;
>         bool logging_active = memslot_is_logging(memslot);
> 
> @@ -1703,6 +1696,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                 vma_pagesize = PAGE_SIZE;
>         }
> 
> +       /* For signals due to hwpoison, we need to use the stage1 size */
> +       if (is_vm_hugetlb_page(vma))
> +               stage1_vma_size = huge_page_shift(hstate_vma(vma));
> +       else
> +               stage1_vma_size = PAGE_SHIFT;
> +

But (see my patch) as far as I can tell, this is already what we have in
vma_pagesize, and do we really have to provide the stage 1 size to user
space if the fault happened within a smaller boundary?  Isn't that just
providing more precise information to the user?


Thanks,

    Christoffer
James Morse Dec. 16, 2019, 6:29 p.m. UTC | #6
Hi Christoffer,

On 13/12/2019 09:22, Christoffer Dall wrote:
> On Wed, Dec 11, 2019 at 04:56:49PM +0000, Marc Zyngier wrote:
>> When we check for a poisoned page, we use the VMA to tell userspace
>> about the looming disaster. But we pass a pointer to this VMA
>> after having released the mmap_sem, which isn't a good idea.
>>
>> Instead, re-check that we still have a VMA, and that this
>> VMA still points to a poisoned page. If the VMA isn't there,
>> userspace is playing with our nerves, so let's give it a -EFAULT
>> (it deserves it). If the PFN isn't poisoned anymore, let's restart
>> from the top and handle the fault again.

> If I read this code correctly, then all we use the VMA for is to find
> the page size used by the MMU to back the VMA, which we've already
> established in the vma_pagesize (and possibly adjusted to something more
> accurate based on our constraints in stage 2 which generated the error),
> so all we need is the size and a way to convert that into a shift.
> 
> Not being 100% confident about the semantics of the lsb bit we pass to
> user space (is it indicating the size of the mapping which caused the
> error or the size of the mapping where user space could potentially

It's the size of the hole that has opened up in the address map. The error was very likely
to be much smaller, but all we can do is unmap pages.
Transparent huge-pages are split up to minimise the impact. This code is for hugetlbfs
(?), whose pages are dedicated for that use, so they don't get split.

arch/arm64/mm/fault.c::do_page_fault() has:
|	lsb = PAGE_SHIFT;
|	if (fault & VM_FAULT_HWPOISON_LARGE)
|		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
|
|	arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,

(I assume it's a shift because bytes in the signal union are precious)
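
For reference, a minimal sketch (illustrative only, not part of any patch in this thread)
of how a VMM's SIGBUS handler might consume the signal queued by
kvm_send_hwpoison_signal(). The function names are made up; si_addr, si_addr_lsb and
BUS_MCEERR_AR are the standard siginfo fields that send_sig_mceerr() fills in:

#define _GNU_SOURCE	/* for BUS_MCEERR_AR / si_addr_lsb on older glibc */
#include <signal.h>
#include <stdio.h>

static void hwpoison_sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR) {
		/* si_addr_lsb is a shift: 2^lsb bytes around si_addr were unmapped */
		unsigned long size = 1UL << si->si_addr_lsb;

		/* fprintf() is not async-signal-safe; fine for a sketch */
		fprintf(stderr, "hwpoison: %lu bytes gone at %p\n", size, si->si_addr);
		/* a real VMM would record this and inject an error into the guest */
	}
}

static void install_hwpoison_handler(void)
{
	struct sigaction sa = {
		.sa_sigaction	= hwpoison_sigbus_handler,
		.sa_flags	= SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
}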


> trigger an error?), or whether we care enough at that level, could we
> consider something like the following instead?

> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..2509d9dec42d 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1592,15 +1592,9 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
>  }
>  
>  static void kvm_send_hwpoison_signal(unsigned long address,
> -				     struct vm_area_struct *vma)
> +				     unsigned long vma_pagesize)
>  {
> -	short lsb;
> -
> -	if (is_vm_hugetlb_page(vma))
> -		lsb = huge_page_shift(hstate_vma(vma));
> -	else
> -		lsb = PAGE_SHIFT;
> -
> +	short lsb = __ffs(vma_pagesize);
>  	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
>  }
>  
> @@ -1735,7 +1729,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  
>  	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
> -		kvm_send_hwpoison_signal(hva, vma);
> +		kvm_send_hwpoison_signal(hva, vma_pagesize);
>  		return 0;
>  	}
>  	if (is_error_noslot_pfn(pfn))

This changes the meaning: vma_pagesize is a value like 4K, not a shift like 12.

But yes, caching the shift value under the mmap_sem and passing it in is the
right-thing-to-do(tm). I have a patch which I'll post, once I remember how to test this thing!



Thanks,

James

Patch

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 0b32a904a1bb..f73393f5ddb7 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1741,9 +1741,30 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(hva, vma);
-		return 0;
+		/*
+		 * Search for the VMA again, as it may have been
+		 * removed in the interval...
+		 */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma_intersection(current->mm, hva, hva + 1);
+		if (vma) {
+			/*
+			 * Recheck for a poisoned page. If something changed
+			 * behind our back, don't do a thing and take the
+			 * fault again.
+			 */
+			pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+			if (pfn == KVM_PFN_ERR_HWPOISON)
+				kvm_send_hwpoison_signal(hva, vma);
+
+			ret = 0;
+		} else {
+			ret = -EFAULT;
+		}
+		up_read(&current->mm->mmap_sem);
+		return ret;
 	}
+
 	if (is_error_noslot_pfn(pfn))
 		return -EFAULT;