
[8/8] riscv: ignore the SYS_RISCV_FLUSH_ICACHE_LOCAL flag

Message ID 20190822065612.28634-9-hch@lst.de (mailing list archive)
State New, archived
Series [1/8] riscv: fix the flags argument type for riscv_flush_icache

Commit Message

Christoph Hellwig Aug. 22, 2019, 6:56 a.m. UTC
The SYS_RISCV_FLUSH_ICACHE_LOCAL flag is built on the flawed assumption
that there is such a thing as a local cpu outside of in-kernel
preemption-disabled sections.  Just ignore the flag, as it can't be used
in a safe way.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/riscv/include/asm/cacheflush.h |  2 +-
 arch/riscv/mm/cacheflush.c          | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

Comments

Palmer Dabbelt Aug. 28, 2019, 1:10 a.m. UTC | #1
On Wed, 21 Aug 2019 23:56:12 PDT (-0700), Christoph Hellwig wrote:
> The SYS_RISCV_FLUSH_ICACHE_LOCAL flag is built on the flawed assumption
> that there is such a thing as a local cpu outside of in-kernel
> preemption-disabled sections.  Just ignore the flag, as it can't be used
> in a safe way.

This is meant to perform a context-local flush, not a cpu-local flush.  The 
whole point here is that userspace doesn't know anything about CPUs, just 
contexts -- that's why we have this deferred flush mechanism.  I think the 
logic is complicated but sound, and removing this will almost certainly lead to 
huge performance degradation.

Maybe I'm missing something, what is the specific issue?

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  arch/riscv/include/asm/cacheflush.h |  2 +-
>  arch/riscv/mm/cacheflush.c          | 13 ++++++++-----
>  2 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
> index b86ac3a4653a..3c18d956c970 100644
> --- a/arch/riscv/include/asm/cacheflush.h
> +++ b/arch/riscv/include/asm/cacheflush.h
> @@ -100,6 +100,6 @@ void flush_icache_all(void);
>  /*
>   * Bits in sys_riscv_flush_icache()'s flags argument.
>   */
> -#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL
> +#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL /* ignored */
>
>  #endif /* _ASM_RISCV_CACHEFLUSH_H */
> diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
> index 8f1134715fec..4b6ecc3431e2 100644
> --- a/arch/riscv/mm/cacheflush.c
> +++ b/arch/riscv/mm/cacheflush.c
> @@ -17,7 +17,7 @@ void flush_icache_all(void)
>  	sbi_remote_fence_i(NULL);
>  }
>
> -static void flush_icache_mm(bool local)
> +static void flush_icache_mm(void)
>  {
>  	unsigned int cpu = get_cpu();
>
> @@ -36,8 +36,7 @@ static void flush_icache_mm(bool local)
>  	 * still need to order this hart's writes with flush_icache_deferred().
>  	 */
>  	cpu = get_cpu();
> -	if (local ||
> -	    cpumask_any_but(mm_cpumask(current->mm), cpu) >= nr_cpu_ids) {
> +	if (cpumask_any_but(mm_cpumask(current->mm), cpu) >= nr_cpu_ids) {
>  		local_flush_icache_all();
>  		smp_mb();
>  	} else {
> @@ -50,7 +49,7 @@ static void flush_icache_mm(bool local)
>  	put_cpu();
>  }
>  #else
> -#define flush_icache_mm(local)	flush_icache_all()
> +#define flush_icache_mm()	flush_icache_all()
>  #endif /* CONFIG_SMP */
>
>  /*
> @@ -72,6 +71,10 @@ static void flush_icache_mm(bool local)
>   * remove flush for harts that are not currently executing a MM context and
>   * instead schedule a deferred local instruction cache flush to be performed
>   * before execution resumes on each hart.
> + *
> + * Note that we ignore the SYS_RISCV_FLUSH_ICACHE_LOCAL flag, as there is
> + * absolutely no way to ensure the local CPU is still the same by the time we
> + * return from the syscall.
>   */
>  SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
>  		unsigned long, flags)
> @@ -79,7 +82,7 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
>  	/* Check the reserved flags. */
>  	if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_LOCAL))
>  		return -EINVAL;
> -	flush_icache_mm(flags & SYS_RISCV_FLUSH_ICACHE_LOCAL);
> +	flush_icache_mm();
>  	return 0;
>  }
Christoph Hellwig Aug. 28, 2019, 6:09 a.m. UTC | #2
On Tue, Aug 27, 2019 at 06:10:33PM -0700, Palmer Dabbelt wrote:
> This is meant to perform a context-local flush, not a cpu-local flush.  The 
> whole point here is that userspace doesn't know anything about CPUs, just 
> contexts -- that's why we have this deferred flush mechanism.  I think the 
> logic is complicated but sound, and removing this will almost certainly 
> lead to huge performance degradation.

All calls to flush_icache_mm are local to the context.  Take a look at
what the current code does (condensed into code right after this list):

 - set all bits in context.icache_stale_mask
 - clear the current cpu from context.icache_stale_mask
 - flush the cpu local icache
 - create a local others mask containing every cpu running the context
   except for the current one
 - now if others is empty OR the local flag is set don't do anything
   but a memory barrier, else flush the other cpus
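
That flow looks roughly like this -- a condensed sketch of the pre-patch
SMP path, not the verbatim kernel source; the real code also translates
cpu ids into hart ids before the SBI call:

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <asm/cacheflush.h>
#include <asm/sbi.h>

static void flush_icache_mm_sketch(bool local)
{
	cpumask_t others;
	unsigned int cpu = get_cpu();
	cpumask_t *stale = &current->mm->context.icache_stale_mask;

	/* mark the icache stale for this context on every hart ... */
	cpumask_setall(stale);
	/* ... except this one, which gets flushed right away */
	cpumask_clear_cpu(cpu, stale);
	local_flush_icache_all();

	/* every other cpu currently running this context */
	cpumask_andnot(&others, mm_cpumask(current->mm), cpumask_of(cpu));

	if (cpumask_empty(&others) || local)
		smp_mb();	/* nothing to flush remotely, just order the writes */
	else
		sbi_remote_fence_i(others.bits);	/* flush the other harts now */

	put_cpu();
}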

>
> Maybe I'm missing something, what is the specific issue?

The issue is that the current implementation of
SYS_RISCV_FLUSH_ICACHE_LOCAL only flushes the icache of the currently
running core, which is an interface that can't be used correctly.

riscv_flush_icache without that flag, on the other hand, already just
flushes the caches for the cpus that run the current context, and then
causes a deferred flush if the context eventually gets run on another
cpu.
Palmer Dabbelt Sept. 3, 2019, 6:46 p.m. UTC | #3
On Tue, 27 Aug 2019 23:09:42 PDT (-0700), Christoph Hellwig wrote:
> On Tue, Aug 27, 2019 at 06:10:33PM -0700, Palmer Dabbelt wrote:
>> This is meant to perform a context-local flush, not a cpu-local flush.  The
>> whole point here is that userspace doesn't know anything about CPUs, just
>> contexts -- that's why we have this deferred flush mechanism.  I think the
>> logic is complicated but sound, and removing this will almost certainly
>> lead to huge performance degradation.
>
> All calls to flush_icache_mm are local to the context.  Take a look at
> what the current code does:
>
>  - set all bits in context.icache_stale_mask
>  - clear the current cpu from context.icache_stale_mask
>  - flush the cpu local icache
>  - create a local others mask containing every cpu running the context
>    except for the current one
>  - now if others is empty OR the local flag is set don't do anything
>    but a memory barrier, else flush the other cpus
>
>>
>> Maybe I'm missing something, what is the specific issue?
>
> The issue is that the current implementation of
> SYS_RISCV_FLUSH_ICACHE_LOCAL only flushes the icache of the currently
> running core, which is an interface that can't be used correctly.

This is used by userspace as a thread-local icache barrier: there's an 
immediate fence on the current hart, and one will be executed before that 
thread makes it to userspace on another hart.  As far as I can tell this is 
implemented correctly but not optimally: there's always a fence, but we emit an 
unnecessary fence when a different thread in the same context is scheduled on a 
different hart.
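
For reference, the userspace side of that barrier looks something like
this (a sketch: it assumes the uapi exposes __NR_riscv_flush_icache, that
the flag value matches the header in this patch, and
thread_local_icache_flush is just an illustrative name):

#include <unistd.h>
#include <sys/syscall.h>

#define SYS_RISCV_FLUSH_ICACHE_LOCAL	1UL	/* from asm/cacheflush.h */

/* make code the calling thread just wrote in [start, end) executable */
static void thread_local_icache_flush(void *start, void *end)
{
	syscall(__NR_riscv_flush_icache, start, end,
		SYS_RISCV_FLUSH_ICACHE_LOCAL);
}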

I suppose maybe we should attach the local fence mask to a task_struct instead of an 
mm_struct, which would trade off an extra load in the scheduler (to check both 
places) or more fences in the global case (on every thread getting scheduled) 
for fewer fences in the local case.  I feel like it's not really worth worrying 
about this for now.
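
The deferred half lives on the context switch path, roughly like this (a
sketch, assuming it matches flush_icache_deferred() in
arch/riscv/mm/context.c, which runs from switch_mm() -- that's the extra
scheduler load being traded against above):

static inline void flush_icache_deferred(struct mm_struct *mm)
{
#ifdef CONFIG_SMP
	unsigned int cpu = smp_processor_id();
	cpumask_t *mask = &mm->context.icache_stale_mask;

	/* a flush was recorded for this hart: do it before user code runs */
	if (cpumask_test_cpu(cpu, mask)) {
		cpumask_clear_cpu(cpu, mask);
		/* make the remote hart's writes visible before flushing */
		smp_mb();
		local_flush_icache_all();
	}
#endif
}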

The construct seems reasonable to me.

> riscv_flush_icache without that flag, on the other hand, already just
> flushes the caches for the cpus that run the current context, and then
> causes a deferred flush if the context eventually gets run on another
> cpu.
Christoph Hellwig Sept. 6, 2019, 5:07 p.m. UTC | #4
On Tue, Sep 03, 2019 at 11:46:33AM -0700, Palmer Dabbelt wrote:
> This is used by userspace as a thread-local icache barrier: there's an 
> immediate fence on the current hart, and one will be executed before that 
> thread makes it to userspace on another hart.  As far as I can tell this is 
> implemented correctly but not optimally: there's always a fence, but we 
> emit an unnecessary fence when a different thread in the same context is 
> scheduled on a different hart.
>
> I suppose maybe we should attach the local fence mask to a task_struct 
> instead of an mm_struct, which would trade off an extra load in the 
> scheduler (to check both places) or more fences in the global case (on 
> every thread getting scheduled) for fewer fences in the local case.  I feel 
> like it's not really worth worrying about this for now.
>
> The construct seems reasonable to me.

I haven't been able to poke holes into that idea yet, but I'll try
a bit more once I find a little time.
Palmer Dabbelt Sept. 13, 2019, 7:44 p.m. UTC | #5
On Fri, 06 Sep 2019 10:07:25 PDT (-0700), Christoph Hellwig wrote:
> On Tue, Sep 03, 2019 at 11:46:33AM -0700, Palmer Dabbelt wrote:
>> This is used by userspace as a thread-local icache barrier: there's an
>> immediate fence on the current hart, and one will be executed before that
>> thread makes it to userspace on another hart.  As far as I can tell this is
>> implemented correctly but not optimally: there's always a fence, but we
>> emit an unnecessary fence when a different thread in the same context is
>> scheduled on a different hart.
>>
>> I suppose maybe we should attach the local fence mask to a task_struct
>> instead of an mm_struct, which would trade off an extra load in the
>> scheduler (to check both places) or more fences in the global case (on
>> every thread getting scheduled) for fewer fences in the local case.  I feel
>> like it's not really worth worrying about this for now.
>>
>> The construct seems reasonable to me.
>
> I haven't been able to poke holes into that idea yet, but I'll try
> a bit more once I find a little time.

OK, well, LMK if you find anything -- it's in the user ABI, which is a bad 
place to have something fundamentally broken :)

Patch

diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index b86ac3a4653a..3c18d956c970 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -100,6 +100,6 @@  void flush_icache_all(void);
 /*
  * Bits in sys_riscv_flush_icache()'s flags argument.
  */
-#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL
+#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL /* ignored */
 
 #endif /* _ASM_RISCV_CACHEFLUSH_H */
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 8f1134715fec..4b6ecc3431e2 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -17,7 +17,7 @@  void flush_icache_all(void)
 	sbi_remote_fence_i(NULL);
 }
 
-static void flush_icache_mm(bool local)
+static void flush_icache_mm(void)
 {
 	unsigned int cpu = get_cpu();
 
@@ -36,8 +36,7 @@  static void flush_icache_mm(bool local)
 	 * still need to order this hart's writes with flush_icache_deferred().
 	 */
 	cpu = get_cpu();
-	if (local ||
-	    cpumask_any_but(mm_cpumask(current->mm), cpu) >= nr_cpu_ids) {
+	if (cpumask_any_but(mm_cpumask(current->mm), cpu) >= nr_cpu_ids) {
 		local_flush_icache_all();
 		smp_mb();
 	} else {
@@ -50,7 +49,7 @@  static void flush_icache_mm(bool local)
 	put_cpu();
 }
 #else
-#define flush_icache_mm(local)	flush_icache_all()
+#define flush_icache_mm()	flush_icache_all()
 #endif /* CONFIG_SMP */
 
 /*
@@ -72,6 +71,10 @@  static void flush_icache_mm(bool local)
  * remove flush for harts that are not currently executing a MM context and
  * instead schedule a deferred local instruction cache flush to be performed
  * before execution resumes on each hart.
+ *
+ * Note that we ignore the SYS_RISCV_FLUSH_ICACHE_LOCAL flag, as there is
+ * absolutely no way to ensure the local CPU is still the same by the time we
+ * return from the syscall.
  */
 SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
 		unsigned long, flags)
@@ -79,7 +82,7 @@  SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
 	/* Check the reserved flags. */
 	if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_LOCAL))
 		return -EINVAL;
-	flush_icache_mm(flags & SYS_RISCV_FLUSH_ICACHE_LOCAL);
+	flush_icache_mm();
 	return 0;
 }