[RFC,v1,08/12] cputlb: introduce tlb_flush_* async work.

Message ID 1460730231-1184-10-git-send-email-alex.bennee@linaro.org (mailing list archive)
State New, archived

Commit Message

Alex Bennée April 15, 2016, 2:23 p.m. UTC
From: KONRAD Frederic <fred.konrad@greensocs.com>

Some architectures allow one VCPU to flush the TLBs of other VCPUs. This is
not a problem when we have only one thread for all VCPUs, but it needs to
become asynchronous work when we run in true multi-threaded mode.

This patch doesn't do anything to protect the other cputlb functions that
can be called in MTTCG mode to make cross-vCPU changes.
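
For illustration only (this hypothetical helper is not part of the patch),
a target emulating a broadcast TLB-invalidate-by-VA would now queue async
work on each vCPU instead of poking their TLBs directly:

    /* Hypothetical target helper, sketching the use case only: the
     * cross-vCPU flush is deferred via async_run_on_cpu() rather than
     * touching another vCPU's TLB from this thread. */
    static void helper_tlbi_va_broadcast(target_ulong addr)
    {
        tlb_flush_page_all(addr & TARGET_PAGE_MASK);
    }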

Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[AJB: remove need for g_malloc on defer, make check fixes]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v1
  - Remove tlb_flush_all; just do the check in tlb_flush.
  - remove the need for g_malloc
  - tlb_flush calls tlb_flush_nocheck directly if !cpu->created
---
 cputlb.c                | 61 ++++++++++++++++++++++++++++++++++++++-----------
 include/exec/exec-all.h |  1 +
 2 files changed, 49 insertions(+), 13 deletions(-)

Comments

Sergey Fedorov June 5, 2016, 4:39 p.m. UTC | #1
On 15/04/16 17:23, Alex Bennée wrote:
> diff --git a/cputlb.c b/cputlb.c
> index 1412049..42a3b07 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -56,22 +56,14 @@
>      } \
>  } while (0)
>  
> +/* We need a solution for stuffing 64 bit pointers in 32 bit ones if
> + * we care about this combination */
> +QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(void *));
> +
>  /* statistics */
>  int tlb_flush_count;
>  
> -/* NOTE:
> - * If flush_global is true (the usual case), flush all tlb entries.
> - * If flush_global is false, flush (at least) all tlb entries not
> - * marked global.
> - *
> - * Since QEMU doesn't currently implement a global/not-global flag
> - * for tlb entries, at the moment tlb_flush() will also flush all
> - * tlb entries in the flush_global == false case. This is OK because
> - * CPU architectures generally permit an implementation to drop
> - * entries from the TLB at any time, so flushing more entries than
> - * required is only an efficiency issue, not a correctness issue.
> - */
> -void tlb_flush(CPUState *cpu, int flush_global)
> +static void tlb_flush_nocheck(CPUState *cpu, int flush_global)
>  {
>      CPUArchState *env = cpu->env_ptr;
>  
> @@ -89,6 +81,34 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
>      tlb_flush_count++;
> +    /* atomic_mb_set(&cpu->pending_tlb_flush, 0); */
> +}
> +
> +static void tlb_flush_global_async_work(CPUState *cpu, void *opaque)
> +{
> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
> +}
> +
> +/* NOTE:
> + * If flush_global is true (the usual case), flush all tlb entries.
> + * If flush_global is false, flush (at least) all tlb entries not
> + * marked global.
> + *
> + * Since QEMU doesn't currently implement a global/not-global flag
> + * for tlb entries, at the moment tlb_flush() will also flush all
> + * tlb entries in the flush_global == false case. This is OK because
> + * CPU architectures generally permit an implementation to drop
> + * entries from the TLB at any time, so flushing more entries than
> + * required is only an efficiency issue, not a correctness issue.
> + */
> +void tlb_flush(CPUState *cpu, int flush_global)
> +{
> +    if (cpu->created) {

Why do we check for 'cpu->created' here? And why don't we do that in
tlb_flush_page_all()?

> +        async_run_on_cpu(cpu, tlb_flush_global_async_work,
> +                         GINT_TO_POINTER(flush_global));
> +    } else {
> +        tlb_flush_nocheck(cpu, flush_global);
> +    }
>  }
>  
>  static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
> @@ -222,6 +242,21 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
>      tb_flush_jmp_cache(cpu, addr);
>  }
>  
> +static void tlb_flush_page_async_work(CPUState *cpu, void *opaque)
> +{
> +    tlb_flush_page(cpu, GPOINTER_TO_UINT(opaque));
> +}
> +
> +void tlb_flush_page_all(target_ulong addr)
> +{
> +    CPUState *cpu;
> +
> +    CPU_FOREACH(cpu) {
> +        async_run_on_cpu(cpu, tlb_flush_page_async_work,
> +                         GUINT_TO_POINTER(addr));
> +    }
> +}
> +
>  /* update the TLBs so that writes to code in the virtual page 'addr'
>     can be detected */
>  void tlb_protect_code(ram_addr_t ram_addr)
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 9144ee0..f695577 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -190,6 +190,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>  void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr);
>  void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>                   uintptr_t retaddr);
> +void tlb_flush_page_all(target_ulong addr);
>  #else
>  static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
>  {

tlb_flush_by_mmuidx() and tlb_flush_page_by_mmuidx() want to be safe as
well.
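
Just to sketch what I mean (untested; packing the mmu_idx set into a
bitmap is my assumption, not something in this patch), the varargs could
be folded into a bitmap that fits in the opaque pointer, the same way
addr is passed above:

    static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, void *opaque)
    {
        unsigned long mmu_idx_bitmap = GPOINTER_TO_UINT(opaque);
        int mmu_idx;

        for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
            if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
                /* flush env->tlb_table[mmu_idx] and tlb_v_table[mmu_idx]
                 * here, as v_tlb_flush_by_mmuidx() does today */
            }
        }
    }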

Kind regards,
Sergey
Alex Bennée June 6, 2016, 8:54 a.m. UTC | #2
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 15/04/16 17:23, Alex Bennée wrote:
>> diff --git a/cputlb.c b/cputlb.c
>> index 1412049..42a3b07 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -56,22 +56,14 @@
>>      } \
>>  } while (0)
>>
>> +/* We need a solution for stuffing 64 bit pointers in 32 bit ones if
>> + * we care about this combination */
>> +QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(void *));
>> +
>>  /* statistics */
>>  int tlb_flush_count;
>>
>> -/* NOTE:
>> - * If flush_global is true (the usual case), flush all tlb entries.
>> - * If flush_global is false, flush (at least) all tlb entries not
>> - * marked global.
>> - *
>> - * Since QEMU doesn't currently implement a global/not-global flag
>> - * for tlb entries, at the moment tlb_flush() will also flush all
>> - * tlb entries in the flush_global == false case. This is OK because
>> - * CPU architectures generally permit an implementation to drop
>> - * entries from the TLB at any time, so flushing more entries than
>> - * required is only an efficiency issue, not a correctness issue.
>> - */
>> -void tlb_flush(CPUState *cpu, int flush_global)
>> +static void tlb_flush_nocheck(CPUState *cpu, int flush_global)
>>  {
>>      CPUArchState *env = cpu->env_ptr;
>>
>> @@ -89,6 +81,34 @@ void tlb_flush(CPUState *cpu, int flush_global)
>>      env->tlb_flush_addr = -1;
>>      env->tlb_flush_mask = 0;
>>      tlb_flush_count++;
>> +    /* atomic_mb_set(&cpu->pending_tlb_flush, 0); */
>> +}
>> +
>> +static void tlb_flush_global_async_work(CPUState *cpu, void *opaque)
>> +{
>> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
>> +}
>> +
>> +/* NOTE:
>> + * If flush_global is true (the usual case), flush all tlb entries.
>> + * If flush_global is false, flush (at least) all tlb entries not
>> + * marked global.
>> + *
>> + * Since QEMU doesn't currently implement a global/not-global flag
>> + * for tlb entries, at the moment tlb_flush() will also flush all
>> + * tlb entries in the flush_global == false case. This is OK because
>> + * CPU architectures generally permit an implementation to drop
>> + * entries from the TLB at any time, so flushing more entries than
>> + * required is only an efficiency issue, not a correctness issue.
>> + */
>> +void tlb_flush(CPUState *cpu, int flush_global)
>> +{
>> +    if (cpu->created) {
>
> Why do we check for 'cpu->created' here? And why don't we do that in
> tlb_flush_page_all()?

A bunch of random stuff gets kicked off at start-up which was getting in
the way (cf. arm_cpu_reset and watch/breakpoints). tlb_flush() is
rather liberally sprinkled around the init code of various CPUs.

>
>> +        async_run_on_cpu(cpu, tlb_flush_global_async_work,
>> +                         GINT_TO_POINTER(flush_global));
>> +    } else {
>> +        tlb_flush_nocheck(cpu, flush_global);
>> +    }
>>  }
>>
>>  static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
>> @@ -222,6 +242,21 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
>>      tb_flush_jmp_cache(cpu, addr);
>>  }
>>
>> +static void tlb_flush_page_async_work(CPUState *cpu, void *opaque)
>> +{
>> +    tlb_flush_page(cpu, GPOINTER_TO_UINT(opaque));
>> +}
>> +
>> +void tlb_flush_page_all(target_ulong addr)
>> +{
>> +    CPUState *cpu;
>> +
>> +    CPU_FOREACH(cpu) {
>> +        async_run_on_cpu(cpu, tlb_flush_page_async_work,
>> +                         GUINT_TO_POINTER(addr));
>> +    }
>> +}
>> +
>>  /* update the TLBs so that writes to code in the virtual page 'addr'
>>     can be detected */
>>  void tlb_protect_code(ram_addr_t ram_addr)
>> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
>> index 9144ee0..f695577 100644
>> --- a/include/exec/exec-all.h
>> +++ b/include/exec/exec-all.h
>> @@ -190,6 +190,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>>  void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr);
>>  void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>>                   uintptr_t retaddr);
>> +void tlb_flush_page_all(target_ulong addr);
>>  #else
>>  static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
>>  {
>
> tlb_flush_by_mmuidx() and tlb_flush_page_by_mmuidx() want to be safe as
> well.
>
> Kind regards,
> Sergey


--
Alex Bennée
Sergey Fedorov June 6, 2016, 10:04 a.m. UTC | #3
On 06/06/16 11:54, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 15/04/16 17:23, Alex Bennée wrote:
>>> diff --git a/cputlb.c b/cputlb.c
>>> index 1412049..42a3b07 100644
>>> --- a/cputlb.c
>>> +++ b/cputlb.c
(snip)
>>> @@ -89,6 +81,34 @@ void tlb_flush(CPUState *cpu, int flush_global)
>>>      env->tlb_flush_addr = -1;
>>>      env->tlb_flush_mask = 0;
>>>      tlb_flush_count++;
>>> +    /* atomic_mb_set(&cpu->pending_tlb_flush, 0); */
>>> +}
>>> +
>>> +static void tlb_flush_global_async_work(CPUState *cpu, void *opaque)
>>> +{
>>> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
>>> +}
>>> +
>>> +/* NOTE:
>>> + * If flush_global is true (the usual case), flush all tlb entries.
>>> + * If flush_global is false, flush (at least) all tlb entries not
>>> + * marked global.
>>> + *
>>> + * Since QEMU doesn't currently implement a global/not-global flag
>>> + * for tlb entries, at the moment tlb_flush() will also flush all
>>> + * tlb entries in the flush_global == false case. This is OK because
>>> + * CPU architectures generally permit an implementation to drop
>>> + * entries from the TLB at any time, so flushing more entries than
>>> + * required is only an efficiency issue, not a correctness issue.
>>> + */
>>> +void tlb_flush(CPUState *cpu, int flush_global)
>>> +{
>>> +    if (cpu->created) {
>> Why do we check for 'cpu->created' here? And why don't we do that in
>> tlb_flush_page_all()?
> A bunch of random stuff gets kicked off at start-up which was getting in
> the way (cf. arm_cpu_reset and watch/breakpoints). tlb_flush() is
> rather liberally sprinkled around the init code of various CPUs.

Wouldn't we race if we call tlb_flush() with no BQL held? We could just
queue an async job since we force each CPU thread to flush its work
queue before starting execution anyway.
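
I.e. (just a sketch of the alternative) the dispatch could become
unconditional:

    void tlb_flush(CPUState *cpu, int flush_global)
    {
        /* Always defer: each vCPU thread drains its work queue before
         * it starts executing, so flushes queued from init code still
         * land before any guest code runs. */
        async_run_on_cpu(cpu, tlb_flush_global_async_work,
                         GINT_TO_POINTER(flush_global));
    }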

Kind regards,
Sergey

>
>>> +        async_run_on_cpu(cpu, tlb_flush_global_async_work,
>>> +                         GINT_TO_POINTER(flush_global));
>>> +    } else {
>>> +        tlb_flush_nocheck(cpu, flush_global);
>>> +    }
>>>  }
>>>
>>>  static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
>>>
Patch

diff --git a/cputlb.c b/cputlb.c
index 1412049..42a3b07 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -56,22 +56,14 @@
     } \
 } while (0)
 
+/* We need a solution for stuffing 64 bit pointers in 32 bit ones if
+ * we care about this combination */
+QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(void *));
+
 /* statistics */
 int tlb_flush_count;
 
-/* NOTE:
- * If flush_global is true (the usual case), flush all tlb entries.
- * If flush_global is false, flush (at least) all tlb entries not
- * marked global.
- *
- * Since QEMU doesn't currently implement a global/not-global flag
- * for tlb entries, at the moment tlb_flush() will also flush all
- * tlb entries in the flush_global == false case. This is OK because
- * CPU architectures generally permit an implementation to drop
- * entries from the TLB at any time, so flushing more entries than
- * required is only an efficiency issue, not a correctness issue.
- */
-void tlb_flush(CPUState *cpu, int flush_global)
+static void tlb_flush_nocheck(CPUState *cpu, int flush_global)
 {
     CPUArchState *env = cpu->env_ptr;
 
@@ -89,6 +81,34 @@ void tlb_flush(CPUState *cpu, int flush_global)
     env->tlb_flush_addr = -1;
     env->tlb_flush_mask = 0;
     tlb_flush_count++;
+    /* atomic_mb_set(&cpu->pending_tlb_flush, 0); */
+}
+
+static void tlb_flush_global_async_work(CPUState *cpu, void *opaque)
+{
+    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
+}
+
+/* NOTE:
+ * If flush_global is true (the usual case), flush all tlb entries.
+ * If flush_global is false, flush (at least) all tlb entries not
+ * marked global.
+ *
+ * Since QEMU doesn't currently implement a global/not-global flag
+ * for tlb entries, at the moment tlb_flush() will also flush all
+ * tlb entries in the flush_global == false case. This is OK because
+ * CPU architectures generally permit an implementation to drop
+ * entries from the TLB at any time, so flushing more entries than
+ * required is only an efficiency issue, not a correctness issue.
+ */
+void tlb_flush(CPUState *cpu, int flush_global)
+{
+    if (cpu->created) {
+        async_run_on_cpu(cpu, tlb_flush_global_async_work,
+                         GINT_TO_POINTER(flush_global));
+    } else {
+        tlb_flush_nocheck(cpu, flush_global);
+    }
 }
 
 static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
@@ -222,6 +242,21 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
     tb_flush_jmp_cache(cpu, addr);
 }
 
+static void tlb_flush_page_async_work(CPUState *cpu, void *opaque)
+{
+    tlb_flush_page(cpu, GPOINTER_TO_UINT(opaque));
+}
+
+void tlb_flush_page_all(target_ulong addr)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        async_run_on_cpu(cpu, tlb_flush_page_async_work,
+                         GUINT_TO_POINTER(addr));
+    }
+}
+
 /* update the TLBs so that writes to code in the virtual page 'addr'
    can be detected */
 void tlb_protect_code(ram_addr_t ram_addr)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 9144ee0..f695577 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -190,6 +190,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
 void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr);
 void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
                  uintptr_t retaddr);
+void tlb_flush_page_all(target_ulong addr);
 #else
 static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
 {