diff mbox series

[RFC] KVM: x86: Fix APIC page invalidation race

Message ID 20200606042627.61070-1-eiichi.tsukata@nutanix.com (mailing list archive)
State New, archived
Headers show
Series [RFC] KVM: x86: Fix APIC page invalidation race | expand

Commit Message

Eiichi Tsukata June 6, 2020, 4:26 a.m. UTC
Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
fix inappropriate APIC page invalidation by re-introducing arch specific
kvm_arch_mmu_notifier_invalidate_range() and calling it from
kvm_mmu_notifier_invalidate_range_start. But there could be the
following race because VMCS APIC address cache can be updated
*before* it is unmapped.

Race:
  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
  (KVM VCPU) vcpu_enter_guest()
  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
  (Invalidator) actually unmap page

Symptom:
  The above race can make Guest OS see already freed page and Guest OS
will see broken APIC register values. Especially, Windows OS checks
LAPIC modification so it can cause BSOD crash with BugCheck
CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we
previously saw in https://bugzilla.kernel.org/show_bug.cgi?id=197951 and
we are currently seeing in
https://bugzilla.redhat.com/show_bug.cgi?id=1751017.

To prevent Guest OS from accessing already freed page, this patch calls
kvm_arch_mmu_notifier_invalidate_range() from
kvm_mmu_notifier_invalidate_range() instead of ..._range_start().

Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation")
Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
---
 arch/x86/kvm/x86.c       |  7 ++-----
 include/linux/kvm_host.h |  4 ++--
 virt/kvm/kvm_main.c      | 26 ++++++++++++++++----------
 3 files changed, 20 insertions(+), 17 deletions(-)

Comments

Eiichi Tsukata June 6, 2020, 5 a.m. UTC | #1
Hello

The race window I mentioned in the commit message is pretty small. So it’s difficult to reproduce it.
But with the following ‘delay’ patch, it can be very easy to reproduce.

```
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..b6728bf80a7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/delay.h>
 
 #include <trace/events/kvm.h>
 
@@ -8161,8 +8162,10 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
         * Update it when it becomes invalid.
         */
        apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (start <= apic_address && apic_address < end)
+       if (start <= apic_address && apic_address < end) {
                kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+               mdelay(1000);
+       }
 
        return 0;
 }
```

Steps to Reproduce:
- start Windows VM(ex: Windows Server 2016) and watch YouTube video to stimulate VM_ENTER/EXIT
- `stress --vm X --vm-bytes Y` to make the APIC page swapped out
- Windows OS will crash with BugCheck 0x109

Thanks,

Eiichi

> On Jun 6, 2020, at 13:26, Eiichi Tsukata <eiichi.tsukata@nutanix.com> wrote:
> 
> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
> fix inappropriate APIC page invalidation by re-introducing arch specific
> kvm_arch_mmu_notifier_invalidate_range() and calling it from
> kvm_mmu_notifier_invalidate_range_start. But threre could be the
> following race because VMCS APIC address cache can be updated
> *before* it is unmapped.
> 
> Race:
>  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>  (KVM VCPU) vcpu_enter_guest()
>  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>  (Invalidator) actually unmap page
> 
> Symptom:
>  The above race can make Guest OS see already freed page and Guest OS
> will see broken APIC register values. Especially, Windows OS checks
> LAPIC modification so it can cause BSOD crash with BugCheck
> CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we
> previously saw in https://urldefense.proofpoint.com/v2/url?u=https-3A__bugzilla.kernel.org_show-5Fbug.cgi-3Fid-3D197951&d=DwIDAg&c=s883GpUCOChKOHiocYtGcg&r=dy01Dr4Ly8mhvnUdx1pZhhT1bkq4h9z5aVWu3paoZtk&m=0Tyk-14RQ4E7qUHEz3qfkUGJEUisqm5fr6wFgen6m9o&s=uTkyasbUNMoptgfsLkg3D5IDb_xxOSjklf2IfLLUzgI&e=  and
> we are currently seeing in
> https://urldefense.proofpoint.com/v2/url?u=https-3A__bugzilla.redhat.com_show-5Fbug.cgi-3Fid-3D1751017&d=DwIDAg&c=s883GpUCOChKOHiocYtGcg&r=dy01Dr4Ly8mhvnUdx1pZhhT1bkq4h9z5aVWu3paoZtk&m=0Tyk-14RQ4E7qUHEz3qfkUGJEUisqm5fr6wFgen6m9o&s=pyRkFbs1A9a9AXxWMqiDEOoGJGBbmF8uJdLu8vKSPCs&e= .
> 
> To prevent Guest OS from accessing already freed page, this patch calls
> kvm_arch_mmu_notifier_invalidate_range() from
> kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
> 
> Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation")
> Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
> ---
> arch/x86/kvm/x86.c       |  7 ++-----
> include/linux/kvm_host.h |  4 ++--
> virt/kvm/kvm_main.c      | 26 ++++++++++++++++----------
> 3 files changed, 20 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c17e6eb9ad43..1700aade39d1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8150,9 +8150,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
> 	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> }
> 
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end,
> -		bool blockable)
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +					    unsigned long start, unsigned long end)
> {
> 	unsigned long apic_address;
> 
> @@ -8163,8 +8162,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> 	if (start <= apic_address && apic_address < end)
> 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
> -
> -	return 0;
> }
> 
> void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 131cc1527d68..92efa39ea3d7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1406,8 +1406,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
> }
> #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
> 
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end, bool blockable);
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +					    unsigned long start, unsigned long end);
> 
> #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
> int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 731c1e517716..77aa91fb08d2 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
> static unsigned long long kvm_createvm_count;
> static unsigned long long kvm_active_vms;
> 
> -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end, bool blockable)
> +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +						   unsigned long start, unsigned long end)
> {
> -	return 0;
> }
> 
> bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
> @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> 	return container_of(mn, struct kvm, mmu_notifier);
> }
> 
> +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
> +					      struct mm_struct *mm,
> +					      unsigned long start, unsigned long end)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int idx;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
> +	srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> 					struct mm_struct *mm,
> 					unsigned long address,
> @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> {
> 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> 	int need_tlb_flush = 0, idx;
> -	int ret;
> 
> 	idx = srcu_read_lock(&kvm->srcu);
> 	spin_lock(&kvm->mmu_lock);
> @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> 		kvm_flush_remote_tlbs(kvm);
> 
> 	spin_unlock(&kvm->mmu_lock);
> -
> -	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
> -					range->end,
> -					mmu_notifier_range_blockable(range));
> -
> 	srcu_read_unlock(&kvm->srcu, idx);
> 
> -	return ret;
> +	return 0;
> }
> 
> static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
> 
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
> 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
> 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
> -- 
> 2.21.3
>
Paolo Bonzini June 8, 2020, 1:13 p.m. UTC | #2
On 06/06/20 06:26, Eiichi Tsukata wrote:
> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
> fix inappropriate APIC page invalidation by re-introducing arch specific
> kvm_arch_mmu_notifier_invalidate_range() and calling it from
> kvm_mmu_notifier_invalidate_range_start. But threre could be the
> following race because VMCS APIC address cache can be updated
> *before* it is unmapped.
> 
> Race:
>   (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>   (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>   (KVM VCPU) vcpu_enter_guest()
>   (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>   (Invalidator) actually unmap page
> 
> Symptom:
>   The above race can make Guest OS see already freed page and Guest OS
> will see broken APIC register values.

This is not exactly the issue.  The values in the APIC-access page do
not really matter, the problem is that the host physical address values
won't match between the page tables and the APIC-access page address.
Then the processor will not trap APIC accesses, and will instead show
the raw contents of the APIC-access page (zeroes), and cause the crash
as you mention below.

Still, the race explains the symptoms and the patch matches this text in
include/linux/mmu_notifier.h:

	 * If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().

where the "additional reference" is in the VMCS: because we have to
account for kvm_vcpu_reload_apic_access_page running between
invalidate_range_start() and invalidate_range_end(), we need to
implement invalidate_range().

The patch seems good, but I'd like Andrea Arcangeli to take a look as
well so I've CCed him.

Thank you very much!

Paolo

> Especially, Windows OS checks
> LAPIC modification so it can cause BSOD crash with BugCheck
> CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we
> previously saw in https://bugzilla.kernel.org/show_bug.cgi?id=197951 and
> we are currently seeing in
> https://bugzilla.redhat.com/show_bug.cgi?id=1751017.
> 
> To prevent Guest OS from accessing already freed page, this patch calls
> kvm_arch_mmu_notifier_invalidate_range() from
> kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
Eiichi Tsukata June 9, 2020, 1:04 a.m. UTC | #3
> On Jun 8, 2020, at 22:13, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 06/06/20 06:26, Eiichi Tsukata wrote:
>> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
>> fix inappropriate APIC page invalidation by re-introducing arch specific
>> kvm_arch_mmu_notifier_invalidate_range() and calling it from
>> kvm_mmu_notifier_invalidate_range_start. But threre could be the
>> following race because VMCS APIC address cache can be updated
>> *before* it is unmapped.
>> 
>> Race:
>>  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>>  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>>  (KVM VCPU) vcpu_enter_guest()
>>  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>>  (Invalidator) actually unmap page
>> 
>> Symptom:
>>  The above race can make Guest OS see already freed page and Guest OS
>> will see broken APIC register values.
> 
> This is not exactly the issue.  The values in the APIC-access page do
> not really matter, the problem is that the host physical address values
> won't match between the page tables and the APIC-access page address.
> Then the processor will not trap APIC accesses, and will instead show
> the raw contents of the APIC-access page (zeroes), and cause the crash
> as you mention below.
> 
> Still, the race explains the symptoms and the patch matches this text in
> include/linux/mmu_notifier.h:
> 
> 	 * If the subsystem
>         * can't guarantee that no additional references are taken to
>         * the pages in the range, it has to implement the
>         * invalidate_range() notifier to remove any references taken
>         * after invalidate_range_start().
> 
> where the "additional reference" is in the VMCS: because we have to
> account for kvm_vcpu_reload_apic_access_page running between
> invalidate_range_start() and invalidate_range_end(), we need to
> implement invalidate_range().
> 
> The patch seems good, but I'd like Andrea Arcangeli to take a look as
> well so I've CCed him.
> 
> Thank you very much!
> 
> Paolo
> 

Hello Paolo

Thanks for detailed explanation!
I’ll fix the commit message like this:

```
Symptom:
  The above race can cause mismatch between the page tables and the
APIC-access page address in VMCS. Then the processor will not trap APIC
accesses, and will instead show the raw contents of the APIC-access page
(zeroes). Especially, Windows OS checks LAPIC modification so it can cause
BSOD crash with BugCheck CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms
are the same as we previously saw in
https://bugzilla.kernel.org/show_bug.cgi?id=197951
and we are currently seeing in
https://bugzilla.redhat.com/show_bug.cgi?id=1751017.

To prevent mismatch between page tables and APIC-access page address,
this patch calls kvm_arch_mmu_notifier_invalidate_range() from
kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
We need to implement invalidate_range() because we have to
account for kvm_vcpu_reload_apic_access_page() running between
invalidate_range_start() and invalidate_range_end().
```


Best

Eiichi
Paolo Bonzini June 9, 2020, 9:54 a.m. UTC | #4
On 09/06/20 03:04, Eiichi Tsukata wrote:
> 
> 
>> On Jun 8, 2020, at 22:13, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> On 06/06/20 06:26, Eiichi Tsukata wrote:
>>> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
>>> fix inappropriate APIC page invalidation by re-introducing arch specific
>>> kvm_arch_mmu_notifier_invalidate_range() and calling it from
>>> kvm_mmu_notifier_invalidate_range_start. But threre could be the
>>> following race because VMCS APIC address cache can be updated
>>> *before* it is unmapped.
>>>
>>> Race:
>>>  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>>>  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>>>  (KVM VCPU) vcpu_enter_guest()
>>>  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>>>  (Invalidator) actually unmap page
>>>
>>> Symptom:
>>>  The above race can make Guest OS see already freed page and Guest OS
>>> will see broken APIC register values.
>>
>> This is not exactly the issue.  The values in the APIC-access page do
>> not really matter, the problem is that the host physical address values
>> won't match between the page tables and the APIC-access page address.
>> Then the processor will not trap APIC accesses, and will instead show
>> the raw contents of the APIC-access page (zeroes), and cause the crash
>> as you mention below.
>>
>> Still, the race explains the symptoms and the patch matches this text in
>> include/linux/mmu_notifier.h:
>>
>> 	 * If the subsystem
>>         * can't guarantee that no additional references are taken to
>>         * the pages in the range, it has to implement the
>>         * invalidate_range() notifier to remove any references taken
>>         * after invalidate_range_start().
>>
>> where the "additional reference" is in the VMCS: because we have to
>> account for kvm_vcpu_reload_apic_access_page running between
>> invalidate_range_start() and invalidate_range_end(), we need to
>> implement invalidate_range().
>>
>> The patch seems good, but I'd like Andrea Arcangeli to take a look as
>> well so I've CCed him.
>>
>> Thank you very much!
>>
>> Paolo
>>
> 
> Hello Paolo
> 
> Thanks for detailed explanation!
> I’ll fix the commit message like this:

No need to resend, the patch is good.  Here is my take on the commit message:

    Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried
    to fix inappropriate APIC page invalidation by re-introducing arch
    specific kvm_arch_mmu_notifier_invalidate_range() and calling it from
    kvm_mmu_notifier_invalidate_range_start. However, the patch left a
    possible race where the VMCS APIC address cache is updated *before*
    it is unmapped:
    
      (Invalidator) kvm_mmu_notifier_invalidate_range_start()
      (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
      (KVM VCPU) vcpu_enter_guest()
      (KVM VCPU) kvm_vcpu_reload_apic_access_page()
      (Invalidator) actually unmap page
    
    Because of the above race, there can be a mismatch between the
    host physical address stored in the APIC_ACCESS_PAGE VMCS field and
    the host physical address stored in the EPT entry for the APIC GPA
    (0xfee0000).  When this happens, the processor will not trap APIC
    accesses, and will instead show the raw contents of the APIC-access page.
    Because Windows OS periodically checks for unexpected modifications to
    the LAPIC register, this will show up as a BSOD crash with BugCheck
    CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in
    https://bugzilla.redhat.com/show_bug.cgi?id=1751017.
    
    The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range()
    cannot guarantee that no additional references are taken to the pages in
    the range before kvm_mmu_notifier_invalidate_range_end().  Fortunately,
    this case is supported by the MMU notifier API, as documented in
    include/linux/mmu_notifier.h:
    
             * If the subsystem
             * can't guarantee that no additional references are taken to
             * the pages in the range, it has to implement the
             * invalidate_range() notifier to remove any references taken
             * after invalidate_range_start().
    
    The fix therefore is to reload the APIC-access page field in the VMCS
    from kvm_mmu_notifier_invalidate_range() instead of ..._range_start().

Thanks,

Paolo
Eiichi Tsukata June 9, 2020, 1:36 p.m. UTC | #5
> On Jun 9, 2020, at 18:54, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> 
> No need to resend, the patch is good.  Here is my take on the commit message:

Thank you Paolo! Your commit message is much clearer.
I really appreciate your great job.

Best

Eiichi

> 
>    Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried
>    to fix inappropriate APIC page invalidation by re-introducing arch
>    specific kvm_arch_mmu_notifier_invalidate_range() and calling it from
>    kvm_mmu_notifier_invalidate_range_start. However, the patch left a
>    possible race where the VMCS APIC address cache is updated *before*
>    it is unmapped:
> 
>      (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>      (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>      (KVM VCPU) vcpu_enter_guest()
>      (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>      (Invalidator) actually unmap page
> 
>    Because of the above race, there can be a mismatch between the
>    host physical address stored in the APIC_ACCESS_PAGE VMCS field and
>    the host physical address stored in the EPT entry for the APIC GPA
>    (0xfee0000).  When this happens, the processor will not trap APIC
>    accesses, and will instead show the raw contents of the APIC-access page.
>    Because Windows OS periodically checks for unexpected modifications to
>    the LAPIC register, this will show up as a BSOD crash with BugCheck
>    CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in
>    https://urldefense.proofpoint.com/v2/url?u=https-3A__bugzilla.redhat.com_show-5Fbug.cgi-3Fid-3D1751017&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=dy01Dr4Ly8mhvnUdx1pZhhT1bkq4h9z5aVWu3paoZtk&m=yo6pYK4bnrvDLz_RwGyvwtJEY6oWkg-9XGlbPBq1B2g&s=7tKkV92x4mCNLQ7miJIanMVqokYNJdjrcK4Rqqb_h7s&e= .
> 
>    The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range()
>    cannot guarantee that no additional references are taken to the pages in
>    the range before kvm_mmu_notifier_invalidate_range_end().  Fortunately,
>    this case is supported by the MMU notifier API, as documented in
>    include/linux/mmu_notifier.h:
> 
>             * If the subsystem
>             * can't guarantee that no additional references are taken to
>             * the pages in the range, it has to implement the
>             * invalidate_range() notifier to remove any references taken
>             * after invalidate_range_start().
> 
>    The fix therefore is to reload the APIC-access page field in the VMCS
>    from kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
> 
> Thanks,
> 
> Paolo
Paolo Bonzini June 19, 2020, 12:12 p.m. UTC | #6
On 19/06/20 07:17, Xinlong Lin wrote:
>>     Because of the above race, there can be a mismatch between the
>>     host physical address stored in the APIC_ACCESS_PAGE VMCS field and
>>     the host physical address stored in the EPT entry for the APIC GPA
>>     (0xfee0000).  When this happens, the processor will not trap APIC 
> 
> Is the APIC GPA should be 0xfee00000?

Yes, there's a missing zero.

Paolo
diff mbox series

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..1700aade39d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8150,9 +8150,8 @@  static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end,
-		bool blockable)
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+					    unsigned long start, unsigned long end)
 {
 	unsigned long apic_address;
 
@@ -8163,8 +8162,6 @@  int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (start <= apic_address && apic_address < end)
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-
-	return 0;
 }
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 131cc1527d68..92efa39ea3d7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1406,8 +1406,8 @@  static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
 
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end, bool blockable);
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+					    unsigned long start, unsigned long end);
 
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 731c1e517716..77aa91fb08d2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -155,10 +155,9 @@  static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
-__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end, bool blockable)
+__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+						   unsigned long start, unsigned long end)
 {
-	return 0;
 }
 
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
@@ -384,6 +383,18 @@  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 	return container_of(mn, struct kvm, mmu_notifier);
 }
 
+static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long start, unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long address,
@@ -408,7 +419,6 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int need_tlb_flush = 0, idx;
-	int ret;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
@@ -425,14 +435,9 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		kvm_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
-
-	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
-					range->end,
-					mmu_notifier_range_blockable(range));
-
 	srcu_read_unlock(&kvm->srcu, idx);
 
-	return ret;
+	return 0;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -538,6 +543,7 @@  static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,