
[2/7] KVM: MMU: document clear_spte_count

Message ID 1371632965-20077-3-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Xiao Guangrong June 19, 2013, 9:09 a.m. UTC
Document it to Documentation/virtual/kvm/mmu.txt

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/mmu.txt | 4 ++++
 arch/x86/include/asm/kvm_host.h   | 5 +++++
 arch/x86/kvm/mmu.c                | 7 ++++---
 3 files changed, 13 insertions(+), 3 deletions(-)

Comments

Paolo Bonzini June 19, 2013, 11:32 a.m. UTC | #1
On 19/06/2013 11:09, Xiao Guangrong wrote:
> Document it to Documentation/virtual/kvm/mmu.txt

While reviewing the docs, I looked at the code.

Why can't this happen?

    CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
------------------------------------------------------------------------------
                                        write low
    read count
    read low
    read high
                                        write high
    check low and count
                                        update count

The check passes, but CPU 1 read a "torn" SPTE.

It seems like this is the same reason why seqlocks do two version
updates, one before and one after, and make the reader check "version &
~1".  But maybe I'm wrong.

Paolo

> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  Documentation/virtual/kvm/mmu.txt | 4 ++++
>  arch/x86/include/asm/kvm_host.h   | 5 +++++
>  arch/x86/kvm/mmu.c                | 7 ++++---
>  3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
> index 869abcc..ce6df51 100644
> --- a/Documentation/virtual/kvm/mmu.txt
> +++ b/Documentation/virtual/kvm/mmu.txt
> @@ -210,6 +210,10 @@ Shadow pages contain the following information:
>      A bitmap indicating which sptes in spt point (directly or indirectly) at
>      pages that may be unsynchronized.  Used to quickly locate all unsychronized
>      pages reachable from a given page.
> +  clear_spte_count:
> +    It is only used on 32bit host which helps us to detect whether updating the
> +    64bit spte is complete so that we can avoid reading the truncated value out
> +    of mmu-lock.
>  
>  Reverse map
>  ===========
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 966f265..1dac2c1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -226,6 +226,11 @@ struct kvm_mmu_page {
>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
>  
>  #ifdef CONFIG_X86_32
> +	/*
> +	 * Count after the page's spte has been cleared to avoid
> +	 * the truncated value is read out of mmu-lock.
> +	 * please see the comments in __get_spte_lockless().
> +	 */
>  	int clear_spte_count;
>  #endif
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index c87b19d..77d516c 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -464,9 +464,10 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
>  /*
>   * The idea using the light way get the spte on x86_32 guest is from
>   * gup_get_pte(arch/x86/mm/gup.c).
> - * The difference is we can not catch the spte tlb flush if we leave
> - * guest mode, so we emulate it by increase clear_spte_count when spte
> - * is cleared.
> + * The difference is we can not immediately catch the spte tlb since
> + * kvm may collapse tlb flush some times. Please see kvm_set_pte_rmapp.
> + *
> + * We emulate it by increase clear_spte_count when spte is cleared.
>   */
>  static u64 __get_spte_lockless(u64 *sptep)
>  {
> 

Paolo Bonzini June 19, 2013, 11:40 a.m. UTC | #2
On 19/06/2013 11:09, Xiao Guangrong wrote:
> Document it to Documentation/virtual/kvm/mmu.txt

Edits inline, please ack.

> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  Documentation/virtual/kvm/mmu.txt | 4 ++++
>  arch/x86/include/asm/kvm_host.h   | 5 +++++
>  arch/x86/kvm/mmu.c                | 7 ++++---
>  3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
> index 869abcc..ce6df51 100644
> --- a/Documentation/virtual/kvm/mmu.txt
> +++ b/Documentation/virtual/kvm/mmu.txt
> @@ -210,6 +210,10 @@ Shadow pages contain the following information:
>      A bitmap indicating which sptes in spt point (directly or indirectly) at
>      pages that may be unsynchronized.  Used to quickly locate all unsychronized
>      pages reachable from a given page.
> +  clear_spte_count:
> +    It is only used on 32bit host which helps us to detect whether updating the
> +    64bit spte is complete so that we can avoid reading the truncated value out
> +    of mmu-lock.

+    Only present on 32-bit hosts, where a 64-bit spte cannot be written
+    atomically.  The reader uses this while running out of the MMU lock
+    to detect in-progress updates and retry them until the writer has
+    finished the write.

>  Reverse map
>  ===========
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 966f265..1dac2c1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -226,6 +226,11 @@ struct kvm_mmu_page {
>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
>  
>  #ifdef CONFIG_X86_32
> +	/*
> +	 * Count after the page's spte has been cleared to avoid
> +	 * the truncated value is read out of mmu-lock.
> +	 * please see the comments in __get_spte_lockless().

+	 * Used out of the mmu-lock to avoid reading spte values while an
+	 * update is in progress; see the comments in __get_spte_lockless().

> +	 */
>  	int clear_spte_count;
>  #endif
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index c87b19d..77d516c 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -464,9 +464,10 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
>  /*
>   * The idea using the light way get the spte on x86_32 guest is from
>   * gup_get_pte(arch/x86/mm/gup.c).
> - * The difference is we can not catch the spte tlb flush if we leave
> - * guest mode, so we emulate it by increase clear_spte_count when spte
> - * is cleared.
> + * The difference is we can not immediately catch the spte tlb since
> + * kvm may collapse tlb flush some times. Please see kvm_set_pte_rmapp.
> + *
> + * We emulate it by increase clear_spte_count when spte is cleared.

+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte, which
+ * is done using clear_spte_count.

>   */
>  static u64 __get_spte_lockless(u64 *sptep)
>  {
> 

Xiao Guangrong June 19, 2013, 11:53 a.m. UTC | #3
On 06/19/2013 07:32 PM, Paolo Bonzini wrote:
> On 19/06/2013 11:09, Xiao Guangrong wrote:
>> Document it to Documentation/virtual/kvm/mmu.txt
> 
> While reviewing the docs, I looked at the code.
> 
> Why can't this happen?
> 
>     CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
> ------------------------------------------------------------------------------
>                                         write low
>     read count
>     read low
>     read high
>                                         write high
>     check low and count
>                                         update count
> 
> The check passes, but CPU 1 read a "torn" SPTE.

In this case, CPU 1 will read the "new low bits" and the "old high bits", right?
The P bit in the low bits is cleared by __update_clear_spte_slow, i.e., the spte
is not present, so the whole value is ignored.
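
Put differently: in this trace the low word written first has the P bit clear,
so a torn new-low/old-high value is seen as non-present and never used.  A
minimal sketch of the check (simplified to the raw P bit; the real
is_shadow_present_pte() in mmu.c tests more than this):

/* Simplified: mmu.c's is_shadow_present_pte() also excludes MMIO sptes. */
static bool spte_present(u64 spte)
{
        /* The P bit lives in the low 32 bits, which are written first. */
        return spte & 1;
}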

Paolo Bonzini June 19, 2013, 11:55 a.m. UTC | #4
On 19/06/2013 13:53, Xiao Guangrong wrote:
> On 06/19/2013 07:32 PM, Paolo Bonzini wrote:
>> On 19/06/2013 11:09, Xiao Guangrong wrote:
>>> Document it to Documentation/virtual/kvm/mmu.txt
>>
>> While reviewing the docs, I looked at the code.
>>
>> Why can't this happen?
>>
>>     CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
>> ------------------------------------------------------------------------------
>>                                         write low
>>     read count
>>     read low
>>     read high
>>                                         write high
>>     check low and count
>>                                         update count
>>
>> The check passes, but CPU 1 read a "torn" SPTE.
> 
> In this case, CPU 1 will read the "new low bits" and the "old high bits", right?
> The P bit in the low bits is cleared by __update_clear_spte_slow, i.e., the spte
> is not present, so the whole value is ignored.

Indeed that's what the comment says, too.  But then why do we need the
count at all?  The spte that is read is exactly the same before and
after the count is updated.

Paolo
Xiao Guangrong June 19, 2013, 12:25 p.m. UTC | #5
On 06/19/2013 07:55 PM, Paolo Bonzini wrote:
> On 19/06/2013 13:53, Xiao Guangrong wrote:
>> On 06/19/2013 07:32 PM, Paolo Bonzini wrote:
>>> On 19/06/2013 11:09, Xiao Guangrong wrote:
>>>> Document it to Documentation/virtual/kvm/mmu.txt
>>>
>>> While reviewing the docs, I looked at the code.
>>>
>>> Why can't this happen?
>>>
>>>     CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
>>> ------------------------------------------------------------------------------
>>>                                         write low
>>>     read count
>>>     read low
>>>     read high
>>>                                         write high
>>>     check low and count
>>>                                         update count
>>>
>>> The check passes, but CPU 1 read a "torn" SPTE.
>>
>> In this case, CPU 1 will read the "new low bits" and the "old high bits", right?
>> The P bit in the low bits is cleared by __update_clear_spte_slow, i.e., the spte
>> is not present, so the whole value is ignored.
> 
> Indeed that's what the comment says, too.  But then why do we need the
> count at all?  The spte that is read is exactly the same before and
> after the count is updated.

In order to detect an spte that is cleared and then made present again, so
that the lockless side does not see a present->present change; otherwise,
we can get this:

Say spte = 0xa11110001 (high 32bits = 0xa, low 32bit = 0x11110001)

CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
----------------------------------------------------------------------
read low: low= 0x11110001
                                    clear the spte, then spte = 0x0ull
read high: high = 0x0
                                    set spte to 0xb11110001 (high 32bits = 0xb,
                                    low 32bit = 0x11110001)

read low: 0x11110001 and see
it is not changed.

In this case, CPU 1 sees that the low bits have not changed, so it tries to
access the memory at 0x11110000.
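
The clear in the middle of that trace is what the counter catches: CPU 1
re-reads an identical low word, but clear_spte_count has changed underneath
it, so the comparison in __get_spte_lockless fails and the whole read is
retried.  A sketch of the counting side, paraphrased from mmu.c:

/* Only updates that make the spte non-present bump the counter. */
static void count_spte_clear(u64 *sptep, u64 spte)
{
        struct kvm_mmu_page *sp = page_header(__pa(sptep));

        if (is_shadow_present_pte(spte))
                return;

        /* Ensure the spte is fully written before the count changes. */
        smp_wmb();
        sp->clear_spte_count++;
}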

BTW, we are using the TLB to protect the lockless walk; the count can be
dropped after improving kvm_set_pte_rmapp, which is the only place that
changes an spte from present to present without a TLB flush.

Xiao Guangrong June 19, 2013, 12:39 p.m. UTC | #6
On 06/19/2013 07:40 PM, Paolo Bonzini wrote:
On 19/06/2013 11:09, Xiao Guangrong wrote:
>> Document it to Documentation/virtual/kvm/mmu.txt
> 
> Edits inline, please ack.

Looks good to me.

Thank you very much for bearing with my poor English.

Paolo Bonzini June 19, 2013, 12:41 p.m. UTC | #7
On 19/06/2013 14:25, Xiao Guangrong wrote:
> On 06/19/2013 07:55 PM, Paolo Bonzini wrote:
>> On 19/06/2013 13:53, Xiao Guangrong wrote:
>>> On 06/19/2013 07:32 PM, Paolo Bonzini wrote:
>>>> On 19/06/2013 11:09, Xiao Guangrong wrote:
>>>>> Document it to Documentation/virtual/kvm/mmu.txt
>>>>
>>>> While reviewing the docs, I looked at the code.
>>>>
>>>> Why can't this happen?
>>>>
>>>>     CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
>>>> ------------------------------------------------------------------------------
>>>>                                         write low
>>>>     read count
>>>>     read low
>>>>     read high
>>>>                                         write high
>>>>     check low and count
>>>>                                         update count
>>>>
>>>> The check passes, but CPU 1 read a "torn" SPTE.
>>>
>>> In this case, CPU 1 will read the "new low bits" and the "old high bits", right?
>>> The P bit in the low bits is cleared by __update_clear_spte_slow, i.e., the spte
>>> is not present, so the whole value is ignored.
>>
>> Indeed that's what the comment says, too.  But then why do we need the
>> count at all?  The spte that is read is exactly the same before and
>> after the count is updated.
> 
> In order to detect an spte that is cleared and then made present again, so
> that the lockless side does not see a present->present change; otherwise,
> we can get this:
> 
> Say spte = 0xa11110001 (high 32bits = 0xa, low 32bit = 0x11110001)
> 
> CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
> ----------------------------------------------------------------------
> read low: low= 0x11110001
>                                     clear the spte, then spte = 0x0ull
> read high: high = 0x0
>                                     set spte to 0xb11110001 (high 32bits = 0xb,
>                                     low 32bit = 0x11110001)
> 
> read low: 0x11110001 and see
> it is not changed.
> 
> In this case, CPU 1 sees that the low bits have not changed, so it tries to
> access the memory at 0x11110000.

Got it.  What about this in the comment to __get_spte_lockless:

 * The idea using the light way get the spte on x86_32 guest is from
 * gup_get_pte(arch/x86/mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running out of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * A race on changing present->non-present may get the old value for
 * the high part of the spte.  This is okay because the high part of
 * the spte is ignored for non-present spte.
 *
 * However, we must detect a present->present change and reread the
 * spte in case the change is in progress.  Because all such changes
 * are done in two steps (present->non-present and non-present->present),
 * it is enough to count the number of present->non-present updates,
 * which is done using clear_spte_count.
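
The retry loop this comment would sit above looks roughly like the following
(a sketch paraphrased from mmu.c; barrier placement as recalled, not
verbatim):

static u64 __get_spte_lockless(u64 *sptep)
{
        struct kvm_mmu_page *sp = page_header(__pa(sptep));
        union split_spte spte, *orig = (union split_spte *)sptep;
        int count;

retry:
        count = sp->clear_spte_count;
        smp_rmb();

        spte.spte_low = orig->spte_low;
        smp_rmb();

        spte.spte_high = orig->spte_high;
        smp_rmb();

        /* A changed low word or counter means an update raced; start over. */
        if (unlikely(spte.spte_low != orig->spte_low ||
                     count != sp->clear_spte_count))
                goto retry;

        return spte.spte;
}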

Paolo

> BTW, we are using the TLB to protect the lockless walk; the count can be
> dropped after improving kvm_set_pte_rmapp, which is the only place that
> changes an spte from present to present without a TLB flush.
> 

Xiao Guangrong June 19, 2013, 1:29 p.m. UTC | #8
On 06/19/2013 08:41 PM, Paolo Bonzini wrote:
> On 19/06/2013 14:25, Xiao Guangrong wrote:
>> On 06/19/2013 07:55 PM, Paolo Bonzini wrote:
>>> On 19/06/2013 13:53, Xiao Guangrong wrote:
>>>> On 06/19/2013 07:32 PM, Paolo Bonzini wrote:
>>>>> On 19/06/2013 11:09, Xiao Guangrong wrote:
>>>>>> Document it to Documentation/virtual/kvm/mmu.txt
>>>>>
>>>>> While reviewing the docs, I looked at the code.
>>>>>
>>>>> Why can't this happen?
>>>>>
>>>>>     CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
>>>>> ------------------------------------------------------------------------------
>>>>>                                         write low
>>>>>     read count
>>>>>     read low
>>>>>     read high
>>>>>                                         write high
>>>>>     check low and count
>>>>>                                         update count
>>>>>
>>>>> The check passes, but CPU 1 read a "torn" SPTE.
>>>>
>>>> In this case, CPU 1 will read the "new low bits" and the "old high bits", right?
>>>> The P bit in the low bits is cleared by __update_clear_spte_slow, i.e., the spte
>>>> is not present, so the whole value is ignored.
>>>
>>> Indeed that's what the comment says, too.  But then why do we need the
>>> count at all?  The spte that is read is exactly the same before and
>>> after the count is updated.
>>
>> In order to detect an spte that is cleared and then made present again, so
>> that the lockless side does not see a present->present change; otherwise,
>> we can get this:
>>
>> Say spte = 0xa11110001 (high 32bits = 0xa, low 32bit = 0x11110001)
>>
>> CPU 1: __get_spte_lockless          CPU 2: __update_clear_spte_slow
>> ----------------------------------------------------------------------
>> read low: low= 0x11110001
>>                                     clear the spte, then spte = 0x0ull
>> read high: high = 0x0
>>                                     set spte to 0xb11110001 (high 32bits = 0xb,
>>                                     low 32bit = 0x11110001)
>>
>> read low: 0x11110001 and see
>> it is not changed.
>>
>> In this case, CPU 1 sees that the low bits have not changed, so it tries to
>> access the memory at 0x11110000.
> 
> Got it.  What about this in the comment to __get_spte_lockless:
> 
>  * The idea using the light way get the spte on x86_32 guest is from
>  * gup_get_pte(arch/x86/mm/gup.c).
>  *
>  * An spte tlb flush may be pending, because kvm_set_pte_rmapp
>  * coalesces them and we are running out of the MMU lock.  Therefore
>  * we need to protect against in-progress updates of the spte.
>  *
>  * A race on changing present->non-present may get the old value for
>  * the high part of the spte.  This is okay because the high part of
>  * the spte is ignored for non-present spte.
>  *
>  * However, we must detect a present->present change and reread the
>  * spte in case the change is in progress.  Because all such changes
>  * are done in two steps (present->non-present and non-present->present),
>  * it is enough to count the number of present->non-present updates,
>  * which is done using clear_spte_count.

It is fantastic :)


Patch

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 869abcc..ce6df51 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -210,6 +210,10 @@  Shadow pages contain the following information:
     A bitmap indicating which sptes in spt point (directly or indirectly) at
     pages that may be unsynchronized.  Used to quickly locate all unsychronized
     pages reachable from a given page.
+  clear_spte_count:
+    It is only used on 32bit host which helps us to detect whether updating the
+    64bit spte is complete so that we can avoid reading the truncated value out
+    of mmu-lock.
 
 Reverse map
 ===========
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 966f265..1dac2c1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -226,6 +226,11 @@  struct kvm_mmu_page {
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
+	/*
+	 * Count after the page's spte has been cleared to avoid
+	 * the truncated value is read out of mmu-lock.
+	 * please see the comments in __get_spte_lockless().
+	 */
 	int clear_spte_count;
 #endif
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c87b19d..77d516c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -464,9 +464,10 @@  static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 /*
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ * The difference is we can not immediately catch the spte tlb since
+ * kvm may collapse tlb flush some times. Please see kvm_set_pte_rmapp.
+ *
+ * We emulate it by increase clear_spte_count when spte is cleared.
  */
 static u64 __get_spte_lockless(u64 *sptep)
 {