diff mbox series

[RESEND] KVM: x86/mmu: make apf token non-zero to fix bug

Message ID 20220222031239.1076682-1-zhangliang5@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RESEND] KVM: x86/mmu: make apf token non-zero to fix bug | expand

Commit Message

zhangliang (AG) Feb. 22, 2022, 3:12 a.m. UTC
In current async pagefault logic, when a page is ready, KVM relies on
kvm_arch_can_dequeue_async_page_present() to determine whether to deliver
a READY event to the Guest. This function test token value of struct
kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a
READY event is finished by Guest. If value is zero meaning that a READY
event is done, so the KVM can deliver another.
But the kvm_arch_setup_async_pf() may produce a valid token with zero
value, which is confused with previous mention and may lead the loss of
this READY event.

This bug may cause task blocked forever in Guest:
 INFO: task stress:7532 blocked for more than 1254 seconds.
       Not tainted 5.10.0 #16
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 task:stress          state:D stack:    0 pid: 7532 ppid:  1409
 flags:0x00000080
 Call Trace:
  __schedule+0x1e7/0x650
  schedule+0x46/0xb0
  kvm_async_pf_task_wait_schedule+0xad/0xe0
  ? exit_to_user_mode_prepare+0x60/0x70
  __kvm_handle_async_pf+0x4f/0xb0
  ? asm_exc_page_fault+0x8/0x30
  exc_page_fault+0x6f/0x110
  ? asm_exc_page_fault+0x8/0x30
  asm_exc_page_fault+0x1e/0x30
 RIP: 0033:0x402d00
 RSP: 002b:00007ffd31912500 EFLAGS: 00010206
 RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0
 RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0
 RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086
 R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000
 R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000

Signed-off-by: Liang Zhang <zhangliang5@huawei.com>
---
 arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Comments

Paolo Bonzini Feb. 22, 2022, 8:38 a.m. UTC | #1
On 2/22/22 04:12, Liang Zhang wrote:
> In current async pagefault logic, when a page is ready, KVM relies on
> kvm_arch_can_dequeue_async_page_present() to determine whether to deliver
> a READY event to the Guest. This function test token value of struct
> kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a
> READY event is finished by Guest. If value is zero meaning that a READY
> event is done, so the KVM can deliver another.
> But the kvm_arch_setup_async_pf() may produce a valid token with zero
> value, which is confused with previous mention and may lead the loss of
> this READY event.
> 
> This bug may cause task blocked forever in Guest:
>   INFO: task stress:7532 blocked for more than 1254 seconds.
>         Not tainted 5.10.0 #16
>   "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>   task:stress          state:D stack:    0 pid: 7532 ppid:  1409
>   flags:0x00000080
>   Call Trace:
>    __schedule+0x1e7/0x650
>    schedule+0x46/0xb0
>    kvm_async_pf_task_wait_schedule+0xad/0xe0
>    ? exit_to_user_mode_prepare+0x60/0x70
>    __kvm_handle_async_pf+0x4f/0xb0
>    ? asm_exc_page_fault+0x8/0x30
>    exc_page_fault+0x6f/0x110
>    ? asm_exc_page_fault+0x8/0x30
>    asm_exc_page_fault+0x1e/0x30
>   RIP: 0033:0x402d00
>   RSP: 002b:00007ffd31912500 EFLAGS: 00010206
>   RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0
>   RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0
>   RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086
>   R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000
>   R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000
> 
> Signed-off-by: Liang Zhang <zhangliang5@huawei.com>
> ---
>   arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++-
>   1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 593093b52395..8e24f73bf60b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
>   	walk_shadow_page_lockless_end(vcpu);
>   }
>   
> +static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
> +{
> +	/* make sure the token value is not 0 */
> +	u32 id = vcpu->arch.apf.id;
> +
> +	if (id << 12 == 0)
> +		vcpu->arch.apf.id = 1;
> +
> +	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> +}
> +
>   static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   				    gfn_t gfn)
>   {
>   	struct kvm_arch_async_pf arch;
>   
> -	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> +	arch.token = alloc_apf_token(vcpu);
>   	arch.gfn = gfn;
>   	arch.direct_map = vcpu->arch.mmu->direct_map;
>   	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);

Queued, thanks.

Paolo
Xinlong Lin March 10, 2022, 3:34 a.m. UTC | #2
On 2022/2/22 11:12, Liang Zhang wrote:
> In current async pagefault logic, when a page is ready, KVM relies on
> kvm_arch_can_dequeue_async_page_present() to determine whether to deliver
> a READY event to the Guest. This function test token value of struct
> kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a
> READY event is finished by Guest. If value is zero meaning that a READY
> event is done, so the KVM can deliver another.
> But the kvm_arch_setup_async_pf() may produce a valid token with zero
> value, which is confused with previous mention and may lead the loss of
> this READY event.
> 
> This bug may cause task blocked forever in Guest:
>   INFO: task stress:7532 blocked for more than 1254 seconds.
>         Not tainted 5.10.0 #16
>   "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>   task:stress          state:D stack:    0 pid: 7532 ppid:  1409
>   flags:0x00000080
>   Call Trace:
>    __schedule+0x1e7/0x650
>    schedule+0x46/0xb0
>    kvm_async_pf_task_wait_schedule+0xad/0xe0
>    ? exit_to_user_mode_prepare+0x60/0x70
>    __kvm_handle_async_pf+0x4f/0xb0
>    ? asm_exc_page_fault+0x8/0x30
>    exc_page_fault+0x6f/0x110
>    ? asm_exc_page_fault+0x8/0x30
>    asm_exc_page_fault+0x1e/0x30
>   RIP: 0033:0x402d00
>   RSP: 002b:00007ffd31912500 EFLAGS: 00010206
>   RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0
>   RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0
>   RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086
>   R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000
>   R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000
> 
> Signed-off-by: Liang Zhang <zhangliang5@huawei.com>
> ---
>   arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++-
>   1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 593093b52395..8e24f73bf60b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
>   	walk_shadow_page_lockless_end(vcpu);
>   }
>   
> +static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
> +{
> +	/* make sure the token value is not 0 */
> +	u32 id = vcpu->arch.apf.id;
> +
> +	if (id << 12 == 0)
> +		vcpu->arch.apf.id = 1;
> +
> +	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> +}
> +
>   static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   				    gfn_t gfn)
>   {
>   	struct kvm_arch_async_pf arch;
>   
> -	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
This patch is completely OK. But I have a question, can we simplify it to
arch.token = (++vcpu->arch.apf.id << 12) | vcpu->vcpu_id;
> +	arch.token = alloc_apf_token(vcpu);
>   	arch.gfn = gfn;
>   	arch.direct_map = vcpu->arch.mmu->direct_map;
>   	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
zhangliang (AG) March 10, 2022, 3:42 a.m. UTC | #3
No. Because '(++vcpu->arch.apf.id << 12)' may also produce zero value.

On 2022/3/10 11:34, Xinlong Lin wrote:
> 
> 
> On 2022/2/22 11:12, Liang Zhang wrote:
>> In current async pagefault logic, when a page is ready, KVM relies on
>> kvm_arch_can_dequeue_async_page_present() to determine whether to deliver
>> a READY event to the Guest. This function test token value of struct
>> kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a
>> READY event is finished by Guest. If value is zero meaning that a READY
>> event is done, so the KVM can deliver another.
>> But the kvm_arch_setup_async_pf() may produce a valid token with zero
>> value, which is confused with previous mention and may lead the loss of
>> this READY event.
>>
>> This bug may cause task blocked forever in Guest:
>>   INFO: task stress:7532 blocked for more than 1254 seconds.
>>         Not tainted 5.10.0 #16
>>   "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>   task:stress          state:D stack:    0 pid: 7532 ppid:  1409
>>   flags:0x00000080
>>   Call Trace:
>>    __schedule+0x1e7/0x650
>>    schedule+0x46/0xb0
>>    kvm_async_pf_task_wait_schedule+0xad/0xe0
>>    ? exit_to_user_mode_prepare+0x60/0x70
>>    __kvm_handle_async_pf+0x4f/0xb0
>>    ? asm_exc_page_fault+0x8/0x30
>>    exc_page_fault+0x6f/0x110
>>    ? asm_exc_page_fault+0x8/0x30
>>    asm_exc_page_fault+0x1e/0x30
>>   RIP: 0033:0x402d00
>>   RSP: 002b:00007ffd31912500 EFLAGS: 00010206
>>   RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0
>>   RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0
>>   RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086
>>   R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000
>>   R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000
>>
>> Signed-off-by: Liang Zhang <zhangliang5@huawei.com>
>> ---
>>   arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++-
>>   1 file changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 593093b52395..8e24f73bf60b 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
>>       walk_shadow_page_lockless_end(vcpu);
>>   }
>>   +static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
>> +{
>> +    /* make sure the token value is not 0 */
>> +    u32 id = vcpu->arch.apf.id;
>> +
>> +    if (id << 12 == 0)
>> +        vcpu->arch.apf.id = 1;
>> +
>> +    return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>> +}
>> +
>>   static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>                       gfn_t gfn)
>>   {
>>       struct kvm_arch_async_pf arch;
>>   -    arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> This patch is completely OK. But I have a question, can we simplify it to
> arch.token = (++vcpu->arch.apf.id << 12) | vcpu->vcpu_id;
>> +    arch.token = alloc_apf_token(vcpu);
>>       arch.gfn = gfn;
>>       arch.direct_map = vcpu->arch.mmu->direct_map;
>>       arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
> 
> .
Xinlong Lin March 10, 2022, 5:09 a.m. UTC | #4
Thank you and sorry for the noise.

On 2022/3/10 11:42, zhangliang (AG) wrote:
> No. Because '(++vcpu->arch.apf.id << 12)' may also produce zero value.
> 
> On 2022/3/10 11:34, Xinlong Lin wrote:
>>
>>
>> On 2022/2/22 11:12, Liang Zhang wrote:
>>> In current async pagefault logic, when a page is ready, KVM relies on
>>> kvm_arch_can_dequeue_async_page_present() to determine whether to deliver
>>> a READY event to the Guest. This function test token value of struct
>>> kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a
>>> READY event is finished by Guest. If value is zero meaning that a READY
>>> event is done, so the KVM can deliver another.
>>> But the kvm_arch_setup_async_pf() may produce a valid token with zero
>>> value, which is confused with previous mention and may lead the loss of
>>> this READY event.
>>>
>>> This bug may cause task blocked forever in Guest:
>>>    INFO: task stress:7532 blocked for more than 1254 seconds.
>>>          Not tainted 5.10.0 #16
>>>    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>    task:stress          state:D stack:    0 pid: 7532 ppid:  1409
>>>    flags:0x00000080
>>>    Call Trace:
>>>     __schedule+0x1e7/0x650
>>>     schedule+0x46/0xb0
>>>     kvm_async_pf_task_wait_schedule+0xad/0xe0
>>>     ? exit_to_user_mode_prepare+0x60/0x70
>>>     __kvm_handle_async_pf+0x4f/0xb0
>>>     ? asm_exc_page_fault+0x8/0x30
>>>     exc_page_fault+0x6f/0x110
>>>     ? asm_exc_page_fault+0x8/0x30
>>>     asm_exc_page_fault+0x1e/0x30
>>>    RIP: 0033:0x402d00
>>>    RSP: 002b:00007ffd31912500 EFLAGS: 00010206
>>>    RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0
>>>    RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0
>>>    RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086
>>>    R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000
>>>    R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000
>>>
>>> Signed-off-by: Liang Zhang <zhangliang5@huawei.com>
>>> ---
>>>    arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++-
>>>    1 file changed, 12 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>>> index 593093b52395..8e24f73bf60b 100644
>>> --- a/arch/x86/kvm/mmu/mmu.c
>>> +++ b/arch/x86/kvm/mmu/mmu.c
>>> @@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
>>>        walk_shadow_page_lockless_end(vcpu);
>>>    }
>>>    +static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
>>> +{
>>> +    /* make sure the token value is not 0 */
>>> +    u32 id = vcpu->arch.apf.id;
>>> +
>>> +    if (id << 12 == 0)
>>> +        vcpu->arch.apf.id = 1;
>>> +
>>> +    return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>>> +}
>>> +
>>>    static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>>                        gfn_t gfn)
>>>    {
>>>        struct kvm_arch_async_pf arch;
>>>    -    arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>> This patch is completely OK. But I have a question, can we simplify it to
>> arch.token = (++vcpu->arch.apf.id << 12) | vcpu->vcpu_id;
>>> +    arch.token = alloc_apf_token(vcpu);
>>>        arch.gfn = gfn;
>>>        arch.direct_map = vcpu->arch.mmu->direct_map;
>>>        arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
>>
>> .
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 593093b52395..8e24f73bf60b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3889,12 +3889,23 @@  static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
 	walk_shadow_page_lockless_end(vcpu);
 }
 
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+	/* make sure the token value is not 0 */
+	u32 id = vcpu->arch.apf.id;
+
+	if (id << 12 == 0)
+		vcpu->arch.apf.id = 1;
+
+	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 				    gfn_t gfn)
 {
 	struct kvm_arch_async_pf arch;
 
-	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+	arch.token = alloc_apf_token(vcpu);
 	arch.gfn = gfn;
 	arch.direct_map = vcpu->arch.mmu->direct_map;
 	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);