diff mbox series

RISC-V: Issue a local tlb flush if possible.

Message ID 20190810014309.20838-1-atish.patra@wdc.com (mailing list archive)
State New, archived
Headers show
Series RISC-V: Issue a local tlb flush if possible. | expand

Commit Message

Atish Patra Aug. 10, 2019, 1:43 a.m. UTC
In RISC-V, tlb flush happens via SBI which is expensive.
If the target cpumask contains a local hartid, some cost
can be saved by issuing a local tlb flush as we do that
in OpenSBI anyways.

Signed-off-by: Atish Patra <atish.patra@wdc.com>
---
 arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

Comments

Anup Patel Aug. 10, 2019, 3:30 a.m. UTC | #1
On Sat, Aug 10, 2019 at 7:13 AM Atish Patra <atish.patra@wdc.com> wrote:
>
> In RISC-V, tlb flush happens via SBI which is expensive.
> If the target cpumask contains a local hartid, some cost
> can be saved by issuing a local tlb flush as we do that
> in OpenSBI anyways.
>
> Signed-off-by: Atish Patra <atish.patra@wdc.com>
> ---
>  arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
> index 687dd19735a7..b32ba4fa5888 100644
> --- a/arch/riscv/include/asm/tlbflush.h
> +++ b/arch/riscv/include/asm/tlbflush.h
> @@ -8,6 +8,7 @@
>  #define _ASM_RISCV_TLBFLUSH_H
>
>  #include <linux/mm_types.h>
> +#include <linux/sched.h>
>  #include <asm/smp.h>
>
>  /*
> @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
>                                      unsigned long size)
>  {
>         struct cpumask hmask;
> +       struct cpumask tmask;
> +       int cpuid = smp_processor_id();
>
>         cpumask_clear(&hmask);
> -       riscv_cpuid_to_hartid_mask(cmask, &hmask);
> -       sbi_remote_sfence_vma(hmask.bits, start, size);
> +       cpumask_clear(&tmask);
> +
> +       if (cmask)
> +               cpumask_copy(&tmask, cmask);
> +       else
> +               cpumask_copy(&tmask, cpu_online_mask);

This can be further simplified.

We can totally avoid tmask, cpumask_copy(), and cpumask_clear()
by directly updating hmask.

In addition to this patch, we should also handle the case of
empty hart mask in OpenSBI.

> +
> +       if (cpumask_test_cpu(cpuid, &tmask)) {
> +               /* Save trap cost by issuing a local tlb flush here */
> +               if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> +                       local_flush_tlb_all();
> +               else if (size == PAGE_SIZE)
> +                       local_flush_tlb_page(start);
> +               cpumask_clear_cpu(cpuid, &tmask);
> +       } else if (cpumask_empty(&tmask)) {
> +               /* cpumask is empty. So just do a local flush */
> +               local_flush_tlb_all();
> +               return;
> +       }
> +
> +       if (!cpumask_empty(&tmask)) {
> +               riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> +               sbi_remote_sfence_vma(hmask.bits, start, size);
> +       }
>  }
>
> -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
> -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
> +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
> +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)
>  #define flush_tlb_range(vma, start, end) \
>         remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
>  #define flush_tlb_mm(mm) \
> --
> 2.21.0
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv

Regards,
Anup
Atish Patra Aug. 10, 2019, 5:28 a.m. UTC | #2
On 8/9/19, 8:30 PM, "Anup Patel" <anup@brainfault.org> wrote:

    On Sat, Aug 10, 2019 at 7:13 AM Atish Patra <atish.patra@wdc.com> wrote:
    >
    > In RISC-V, tlb flush happens via SBI which is expensive.
    > If the target cpumask contains a local hartid, some cost
    > can be saved by issuing a local tlb flush as we do that
    > in OpenSBI anyways.
    >
    > Signed-off-by: Atish Patra <atish.patra@wdc.com>
    > ---
    >  arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
    >  1 file changed, 29 insertions(+), 4 deletions(-)
    >
    > diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
    > index 687dd19735a7..b32ba4fa5888 100644
    > --- a/arch/riscv/include/asm/tlbflush.h
    > +++ b/arch/riscv/include/asm/tlbflush.h
    > @@ -8,6 +8,7 @@
    >  #define _ASM_RISCV_TLBFLUSH_H
    >
    >  #include <linux/mm_types.h>
    > +#include <linux/sched.h>
    >  #include <asm/smp.h>
    >
    >  /*
    > @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
    >                                      unsigned long size)
    >  {
    >         struct cpumask hmask;
    > +       struct cpumask tmask;
    > +       int cpuid = smp_processor_id();
    >
    >         cpumask_clear(&hmask);
    > -       riscv_cpuid_to_hartid_mask(cmask, &hmask);
    > -       sbi_remote_sfence_vma(hmask.bits, start, size);
    > +       cpumask_clear(&tmask);
    > +
    > +       if (cmask)
    > +               cpumask_copy(&tmask, cmask);
    > +       else
    > +               cpumask_copy(&tmask, cpu_online_mask);
    
    This can be further simplified.
    
    We can totally avoid tmask, cpumask_copy(), and cpumask_clear()
    by directly updating hmask.
    
Ahh yes. Thanks for pointing out.

    In addition to this patch, we should also handle the case of
    empty hart mask in OpenSBI.
    
Yes. I have a few other fixes as well (around fifo race condition and local flushing in OpenSBI).
I will send them soon.

Regards,
Atish
    > +
    > +       if (cpumask_test_cpu(cpuid, &tmask)) {
    > +               /* Save trap cost by issuing a local tlb flush here */
    > +               if ((start == 0 && size == -1) || (size > PAGE_SIZE))
    > +                       local_flush_tlb_all();
    > +               else if (size == PAGE_SIZE)
    > +                       local_flush_tlb_page(start);
    > +               cpumask_clear_cpu(cpuid, &tmask);
    > +       } else if (cpumask_empty(&tmask)) {
    > +               /* cpumask is empty. So just do a local flush */
    > +               local_flush_tlb_all();
    > +               return;
    > +       }
    > +
    > +       if (!cpumask_empty(&tmask)) {
    > +               riscv_cpuid_to_hartid_mask(&tmask, &hmask);
    > +               sbi_remote_sfence_vma(hmask.bits, start, size);
    > +       }
    >  }
    >
    > -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
    > -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
    > +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
    > +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)
    >  #define flush_tlb_range(vma, start, end) \
    >         remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
    >  #define flush_tlb_mm(mm) \
    > --
    > 2.21.0
    >
    >
    > _______________________________________________
    > linux-riscv mailing list
    > linux-riscv@lists.infradead.org
    > http://lists.infradead.org/mailman/listinfo/linux-riscv
    
    Regards,
    Anup
Andreas Schwab Aug. 10, 2019, 6:37 a.m. UTC | #3
On Aug 09 2019, Atish Patra <atish.patra@wdc.com> wrote:

> @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
>  				     unsigned long size)
>  {
>  	struct cpumask hmask;
> +	struct cpumask tmask;
> +	int cpuid = smp_processor_id();
>  
>  	cpumask_clear(&hmask);
> -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
> -	sbi_remote_sfence_vma(hmask.bits, start, size);
> +	cpumask_clear(&tmask);
> +
> +	if (cmask)
> +		cpumask_copy(&tmask, cmask);
> +	else
> +		cpumask_copy(&tmask, cpu_online_mask);
> +
> +	if (cpumask_test_cpu(cpuid, &tmask)) {
> +		/* Save trap cost by issuing a local tlb flush here */
> +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> +			local_flush_tlb_all();
> +		else if (size == PAGE_SIZE)
> +			local_flush_tlb_page(start);
> +		cpumask_clear_cpu(cpuid, &tmask);
> +	} else if (cpumask_empty(&tmask)) {
> +		/* cpumask is empty. So just do a local flush */
> +		local_flush_tlb_all();
> +		return;
> +	}
> +
> +	if (!cpumask_empty(&tmask)) {
> +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> +		sbi_remote_sfence_vma(hmask.bits, start, size);
> +	}
>  }

I think it's becoming too big to be inline.

Andreas.
Atish Patra Aug. 10, 2019, 9:21 a.m. UTC | #4
On 8/9/19, 6:43 PM, "Atish Patra" <Atish.Patra@wdc.com> wrote:

    In RISC-V, tlb flush happens via SBI which is expensive.
    If the target cpumask contains a local hartid, some cost
    can be saved by issuing a local tlb flush as we do that
    in OpenSBI anyways.
    
    Signed-off-by: Atish Patra <atish.patra@wdc.com>
    ---
     arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
     1 file changed, 29 insertions(+), 4 deletions(-)
    
    diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
    index 687dd19735a7..b32ba4fa5888 100644
    --- a/arch/riscv/include/asm/tlbflush.h
    +++ b/arch/riscv/include/asm/tlbflush.h
    @@ -8,6 +8,7 @@
     #define _ASM_RISCV_TLBFLUSH_H
     
     #include <linux/mm_types.h>
    +#include <linux/sched.h>
     #include <asm/smp.h>
     
     /*
    @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
     				     unsigned long size)
     {
     	struct cpumask hmask;
    +	struct cpumask tmask;
    +	int cpuid = smp_processor_id();
     
     	cpumask_clear(&hmask);
    -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
    -	sbi_remote_sfence_vma(hmask.bits, start, size);
    +	cpumask_clear(&tmask);
    +
    +	if (cmask)
    +		cpumask_copy(&tmask, cmask);
    +	else
    +		cpumask_copy(&tmask, cpu_online_mask);
    +
    +	if (cpumask_test_cpu(cpuid, &tmask)) {
    +		/* Save trap cost by issuing a local tlb flush here */
    +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
    +			local_flush_tlb_all();
    +		else if (size == PAGE_SIZE)
    +			local_flush_tlb_page(start);
    +		cpumask_clear_cpu(cpuid, &tmask);
    +	} else if (cpumask_empty(&tmask)) {
    +		/* cpumask is empty. So just do a local flush */
    +		local_flush_tlb_all();
    +		return;
    +	}
    +
    +	if (!cpumask_empty(&tmask)) {
    +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
    +		sbi_remote_sfence_vma(hmask.bits, start, size);
    +	}
     }
     
    -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
    -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
    +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
    +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)

I did not see Paul's patch to fix this. I will remove this in v2 and rebase on top of Paul's patch.

Regards,
Atish
     #define flush_tlb_range(vma, start, end) \
     	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
     #define flush_tlb_mm(mm) \
    -- 
    2.21.0
Christoph Hellwig Aug. 12, 2019, 2:56 p.m. UTC | #5
I agree with the comment that we really should move this out of line
now, and also that we can simplify it further, which also includes
not bothering with the SBI call if we were the only online CPU.
I also think we need to use get_cpu/put_cpu to be preemption safe.

Also why would we need to do a local flush if we have a mask that
doesn't include the local CPU?

How about something like:

void __riscv_flush_tlb(struct cpumask *cpumask, unsigned long start,
		unsigned long size)
{
	unsigned int cpu;

	if (!cpumask)
		cpumask = cpu_online_mask;

	cpu = get_cpu();
	if (!cpumask || cpumask_test_cpu(cpu, cpumask) {
		if ((start == 0 && size == -1) || size > PAGE_SIZE)
			local_flush_tlb_all();
		else if (size == PAGE_SIZE)
			local_flush_tlb_page(start);
		cpumask_clear_cpu(cpuid, cpumask);
	}

	if (!cpumask_empty(cpumask)) {
	  	struct cpumask hmask;

		riscv_cpuid_to_hartid_mask(cpumask, &hmask);
		sbi_remote_sfence_vma(hmask.bits, start, size);
	}
	put_cpu();
}
Troy Benjegerdes Aug. 12, 2019, 3:36 p.m. UTC | #6
> On Aug 9, 2019, at 8:43 PM, Atish Patra <atish.patra@wdc.com> wrote:
> 
> In RISC-V, tlb flush happens via SBI which is expensive.
> If the target cpumask contains a local hartid, some cost
> can be saved by issuing a local tlb flush as we do that
> in OpenSBI anyways.

Is there anything other than convention and current usage that prevents
the kernel from natively handling TLB flushes without ever making the SBI
call?

Someone is eventually going to want to run the linux kernel in machine mode,
likely for performance and/or security reasons, and this will require flushing TLBs
natively anyway.


> 
> Signed-off-by: Atish Patra <atish.patra@wdc.com>
> ---
> arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
> 1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
> index 687dd19735a7..b32ba4fa5888 100644
> --- a/arch/riscv/include/asm/tlbflush.h
> +++ b/arch/riscv/include/asm/tlbflush.h
> @@ -8,6 +8,7 @@
> #define _ASM_RISCV_TLBFLUSH_H
> 
> #include <linux/mm_types.h>
> +#include <linux/sched.h>
> #include <asm/smp.h>
> 
> /*
> @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
> 				     unsigned long size)
> {
> 	struct cpumask hmask;
> +	struct cpumask tmask;
> +	int cpuid = smp_processor_id();
> 
> 	cpumask_clear(&hmask);
> -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
> -	sbi_remote_sfence_vma(hmask.bits, start, size);
> +	cpumask_clear(&tmask);
> +
> +	if (cmask)
> +		cpumask_copy(&tmask, cmask);
> +	else
> +		cpumask_copy(&tmask, cpu_online_mask);
> +
> +	if (cpumask_test_cpu(cpuid, &tmask)) {
> +		/* Save trap cost by issuing a local tlb flush here */
> +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> +			local_flush_tlb_all();
> +		else if (size == PAGE_SIZE)
> +			local_flush_tlb_page(start);
> +		cpumask_clear_cpu(cpuid, &tmask);
> +	} else if (cpumask_empty(&tmask)) {
> +		/* cpumask is empty. So just do a local flush */
> +		local_flush_tlb_all();
> +		return;
> +	}
> +
> +	if (!cpumask_empty(&tmask)) {
> +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> +		sbi_remote_sfence_vma(hmask.bits, start, size);
> +	}
> }
> 
> -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
> -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
> +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
> +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)
> #define flush_tlb_range(vma, start, end) \
> 	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
> #define flush_tlb_mm(mm) \
> -- 
> 2.21.0
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Atish Patra Aug. 12, 2019, 5:13 p.m. UTC | #7
On Mon, 2019-08-12 at 10:36 -0500, Troy Benjegerdes wrote:
> > On Aug 9, 2019, at 8:43 PM, Atish Patra <atish.patra@wdc.com>
> > wrote:
> > 
> > In RISC-V, tlb flush happens via SBI which is expensive.
> > If the target cpumask contains a local hartid, some cost
> > can be saved by issuing a local tlb flush as we do that
> > in OpenSBI anyways.
> 
> Is there anything other than convention and current usage that
> prevents
> the kernel from natively handling TLB flushes without ever making the
> SBI
> call?
> 
> Someone is eventually going to want to run the linux kernel in
> machine mode,
> likely for performance and/or security reasons, and this will require
> flushing TLBs
> natively anyway.
> 

The support is already added by Christoph in nommu series.

https://lkml.org/lkml/2019/6/10/935

The idea is to just send IPIs directly in Linux. The same approach is
not good in Supervisor mode until we can get rid of IPIs via SBI all
together. Otherwise, every tlbflush will be even more expensive as it
has to comeback to S mode and then execute sfence.vma.

> 
> > Signed-off-by: Atish Patra <atish.patra@wdc.com>
> > ---
> > arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++-
> > ---
> > 1 file changed, 29 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/riscv/include/asm/tlbflush.h
> > b/arch/riscv/include/asm/tlbflush.h
> > index 687dd19735a7..b32ba4fa5888 100644
> > --- a/arch/riscv/include/asm/tlbflush.h
> > +++ b/arch/riscv/include/asm/tlbflush.h
> > @@ -8,6 +8,7 @@
> > #define _ASM_RISCV_TLBFLUSH_H
> > 
> > #include <linux/mm_types.h>
> > +#include <linux/sched.h>
> > #include <asm/smp.h>
> > 
> > /*
> > @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct
> > cpumask *cmask, unsigned long start,
> > 				     unsigned long size)
> > {
> > 	struct cpumask hmask;
> > +	struct cpumask tmask;
> > +	int cpuid = smp_processor_id();
> > 
> > 	cpumask_clear(&hmask);
> > -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
> > -	sbi_remote_sfence_vma(hmask.bits, start, size);
> > +	cpumask_clear(&tmask);
> > +
> > +	if (cmask)
> > +		cpumask_copy(&tmask, cmask);
> > +	else
> > +		cpumask_copy(&tmask, cpu_online_mask);
> > +
> > +	if (cpumask_test_cpu(cpuid, &tmask)) {
> > +		/* Save trap cost by issuing a local tlb flush here */
> > +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> > +			local_flush_tlb_all();
> > +		else if (size == PAGE_SIZE)
> > +			local_flush_tlb_page(start);
> > +		cpumask_clear_cpu(cpuid, &tmask);
> > +	} else if (cpumask_empty(&tmask)) {
> > +		/* cpumask is empty. So just do a local flush */
> > +		local_flush_tlb_all();
> > +		return;
> > +	}
> > +
> > +	if (!cpumask_empty(&tmask)) {
> > +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> > +		sbi_remote_sfence_vma(hmask.bits, start, size);
> > +	}
> > }
> > 
> > -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
> > -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
> > +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
> > +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr,
> > (addr) + PAGE_SIZE)
> > #define flush_tlb_range(vma, start, end) \
> > 	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) -
> > (start))
> > #define flush_tlb_mm(mm) \
> > -- 
> > 2.21.0
> > 
> > 
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
Christoph Hellwig Aug. 12, 2019, 5:55 p.m. UTC | #8
On Mon, Aug 12, 2019 at 10:36:25AM -0500, Troy Benjegerdes wrote:
> Is there anything other than convention and current usage that prevents
> the kernel from natively handling TLB flushes without ever making the SBI
> call?

Yes and no.

In all existing RISC-V implementation remote TLB flushes are simply
implementing using IPIs.  So you could trivially implement remote TLB
flush using IPIs, and in fact Gary Guo posted a series to do that a
while ago.

But: the RISC privileged spec requires that IPIs are only issued from
M-mode and only delivered to M-mode.  So what would be a trivial MMIO
write plus interupt to wakeup the remote hart actually turns into
a dance requiring multiple context switches between privilege levels,
and without additional optimizations that will be even slower than the
current SBI based implementation.

I've started a prototype implementation and spec edits to relax this
and allow direct IPIs from S-mode to S-mode, which will speed up IPIs
by about an order of magnitude, and I hope this will be how future
RISC-V implementations work.

> Someone is eventually going to want to run the linux kernel in machine mode,
> likely for performance and/or security reasons, and this will require flushing TLBs
> natively anyway.

The nommu ports run in M-mode.  But running a MMU-enabled port in M-mode
is rather painful if not impossible (trust me, I've tried) due to how
the privileged spec says that M-mode generally runs without address
translation.  There is a workaround using the MPRV bit in mstatus, but
even that just uses the address translation for loads and stores, and
not for the program text.
Atish Patra Aug. 13, 2019, 12:15 a.m. UTC | #9
On Mon, 2019-08-12 at 07:56 -0700, Christoph Hellwig wrote:
> I agree with the comment that we really should move this out of line
> now, and 

sure.

> also that we can simplify it further, which also includes
> not bothering with the SBI call if we were the only online CPU.

I already had that optimization in my patch.

> I also thing we need to use get_cpu/put_cpu to be preemption safe.
> 
ok. I will update that.

> Also why would we need to do a local flush if we have a mask that
> doesn't include the local CPU?
> 

I thought if it receives an empty cpumask, then it should at least do a
local flush is the expected behavior. Are you saying that we should
just skip all and return ? 


> How about something like:
> 
> void __riscv_flush_tlb(struct cpumask *cpumask, unsigned long start,
> 		unsigned long size)
> {
> 	unsigned int cpu;
> 
> 	if (!cpumask)
> 		cpumask = cpu_online_mask;
> 
> 	cpu = get_cpu();
> 	if (!cpumask || cpumask_test_cpu(cpu, cpumask) {
> 		if ((start == 0 && size == -1) || size > PAGE_SIZE)
> 			local_flush_tlb_all();
> 		else if (size == PAGE_SIZE)
> 			local_flush_tlb_page(start);
> 		cpumask_clear_cpu(cpuid, cpumask);

This will modify the original cpumask which is not correct. clear part
has to be done on hmask to avoid a copy.

Regards,
Atish
> 	}
> 
> 	if (!cpumask_empty(cpumask)) {
> 	  	struct cpumask hmask;
> 
> 		riscv_cpuid_to_hartid_mask(cpumask, &hmask);
> 		sbi_remote_sfence_vma(hmask.bits, start, size);
> 	}
> 	put_cpu();
> }
Christoph Hellwig Aug. 13, 2019, 2:30 p.m. UTC | #10
On Tue, Aug 13, 2019 at 12:15:15AM +0000, Atish Patra wrote:
> I thought if it recieves an empty cpumask, then it should at least do a
> local flush is the expected behavior. Are you saying that we should
> just skip all and return ? 

How could we ever receive an empty cpu mask?  I think it could only
be empty without the current cpu.  At least that is my reading of
the callers and a few other implementations.

> > 	if (!cpumask || cpumask_test_cpu(cpu, cpumask) {
> > 		if ((start == 0 && size == -1) || size > PAGE_SIZE)
> > 			local_flush_tlb_all();
> > 		else if (size == PAGE_SIZE)
> > 			local_flush_tlb_page(start);
> > 		cpumask_clear_cpu(cpuid, cpumask);
> 
> This will modify the original cpumask which is not correct. clear part
> has to be done on hmask to avoid a copy.

Indeed.  But looking at the x86 tlbflush implementation it seems like we
could use cpumask_any_but() to avoid having to modify the passed in
cpumask.
Paul Walmsley Aug. 13, 2019, 6:25 p.m. UTC | #11
Hi Atish,

On Fri, 9 Aug 2019, Atish Patra wrote:

> In RISC-V, tlb flush happens via SBI which is expensive.
> If the target cpumask contains a local hartid, some cost
> can be saved by issuing a local tlb flush as we do that
> in OpenSBI anyways.
> 
> Signed-off-by: Atish Patra <atish.patra@wdc.com>

A few brief comments/questions beyond the ones that others have mentioned:

1. At some point, some RISC-V systems may implement this SBI call in 
hardware, rather than in software.  Then this might wind up becoming a 
de-optimization.  I don't think there's anything to do about that in code 
right now, but it might be worth adding a comment, and thinking about how 
that case might be handled in the platform specification group.

2. If this patch masks or reduces the likelihood of hitting the 
TLB-related crashes that we're seeing, we probably will want to hold off 
on merging this one until we're relatively certain that those other 
problems have been fixed. 



> ---
>  arch/riscv/include/asm/tlbflush.h | 33 +++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
> index 687dd19735a7..b32ba4fa5888 100644
> --- a/arch/riscv/include/asm/tlbflush.h
> +++ b/arch/riscv/include/asm/tlbflush.h
> @@ -8,6 +8,7 @@
>  #define _ASM_RISCV_TLBFLUSH_H
>  
>  #include <linux/mm_types.h>
> +#include <linux/sched.h>
>  #include <asm/smp.h>
>  
>  /*
> @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
>  				     unsigned long size)
>  {
>  	struct cpumask hmask;
> +	struct cpumask tmask;
> +	int cpuid = smp_processor_id();
>  
>  	cpumask_clear(&hmask);
> -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
> -	sbi_remote_sfence_vma(hmask.bits, start, size);
> +	cpumask_clear(&tmask);
> +
> +	if (cmask)
> +		cpumask_copy(&tmask, cmask);
> +	else
> +		cpumask_copy(&tmask, cpu_online_mask);
> +
> +	if (cpumask_test_cpu(cpuid, &tmask)) {
> +		/* Save trap cost by issuing a local tlb flush here */
> +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> +			local_flush_tlb_all();
> +		else if (size == PAGE_SIZE)
> +			local_flush_tlb_page(start);
> +		cpumask_clear_cpu(cpuid, &tmask);
> +	} else if (cpumask_empty(&tmask)) {
> +		/* cpumask is empty. So just do a local flush */
> +		local_flush_tlb_all();
> +		return;
> +	}
> +
> +	if (!cpumask_empty(&tmask)) {
> +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> +		sbi_remote_sfence_vma(hmask.bits, start, size);
> +	}
>  }
>  
> -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
> -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
> +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
> +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)
>  #define flush_tlb_range(vma, start, end) \
>  	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
>  #define flush_tlb_mm(mm) \
> -- 
> 2.21.0
> 
> 


- Paul
Atish Patra Aug. 14, 2019, 1:49 a.m. UTC | #12
On Tue, 2019-08-13 at 11:25 -0700, Paul Walmsley wrote:
> Hi Atish,
> 
> On Fri, 9 Aug 2019, Atish Patra wrote:
> 
> > In RISC-V, tlb flush happens via SBI which is expensive.
> > If the target cpumask contains a local hartid, some cost
> > can be saved by issuing a local tlb flush as we do that
> > in OpenSBI anyways.
> > 
> > Signed-off-by: Atish Patra <atish.patra@wdc.com>
> 
> A few brief comments/questions beyond the ones that others have
> mentioned:
> 
> 1. At some point, some RISC-V systems may implement this SBI call in 
> hardware, rather than in software.  Then this might wind up becoming
> a 
> de-optimization.  I don't think there's anything to do about that in
> code 
> right now, but it might be worth adding a comment, and thinking about
> how 
> that case might be handled in the platform specification group.

I am fine with adding a comment. But I did not understand why this
would be deoptimization. I not exactly sure about the syntax of future
TLB flush instructions. But if we have a hardware instruction for
remote tlb flushing, it should be executed only for the harts other
than current hart. Right ? 

> 2. If this patch masks or reduces the likelihood of hitting the 
> TLB-related crashes that we're seeing, we probably will want to hold
> off 
> on merging this one until we're relatively certain that those other 
> problems have been fixed. 
> 

Yeah sure. I don't see any stress-ng error but still chasing down the
glibc error.

Regards,
Atish
> 
> 
> > ---
> >  arch/riscv/include/asm/tlbflush.h | 33
> > +++++++++++++++++++++++++++----
> >  1 file changed, 29 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/riscv/include/asm/tlbflush.h
> > b/arch/riscv/include/asm/tlbflush.h
> > index 687dd19735a7..b32ba4fa5888 100644
> > --- a/arch/riscv/include/asm/tlbflush.h
> > +++ b/arch/riscv/include/asm/tlbflush.h
> > @@ -8,6 +8,7 @@
> >  #define _ASM_RISCV_TLBFLUSH_H
> >  
> >  #include <linux/mm_types.h>
> > +#include <linux/sched.h>
> >  #include <asm/smp.h>
> >  
> >  /*
> > @@ -46,14 +47,38 @@ static inline void remote_sfence_vma(struct
> > cpumask *cmask, unsigned long start,
> >  				     unsigned long size)
> >  {
> >  	struct cpumask hmask;
> > +	struct cpumask tmask;
> > +	int cpuid = smp_processor_id();
> >  
> >  	cpumask_clear(&hmask);
> > -	riscv_cpuid_to_hartid_mask(cmask, &hmask);
> > -	sbi_remote_sfence_vma(hmask.bits, start, size);
> > +	cpumask_clear(&tmask);
> > +
> > +	if (cmask)
> > +		cpumask_copy(&tmask, cmask);
> > +	else
> > +		cpumask_copy(&tmask, cpu_online_mask);
> > +
> > +	if (cpumask_test_cpu(cpuid, &tmask)) {
> > +		/* Save trap cost by issuing a local tlb flush here */
> > +		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
> > +			local_flush_tlb_all();
> > +		else if (size == PAGE_SIZE)
> > +			local_flush_tlb_page(start);
> > +		cpumask_clear_cpu(cpuid, &tmask);
> > +	} else if (cpumask_empty(&tmask)) {
> > +		/* cpumask is empty. So just do a local flush */
> > +		local_flush_tlb_all();
> > +		return;
> > +	}
> > +
> > +	if (!cpumask_empty(&tmask)) {
> > +		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
> > +		sbi_remote_sfence_vma(hmask.bits, start, size);
> > +	}
> >  }
> >  
> > -#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
> > -#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
> > +#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
> > +#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr,
> > (addr) + PAGE_SIZE)
> >  #define flush_tlb_range(vma, start, end) \
> >  	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) -
> > (start))
> >  #define flush_tlb_mm(mm) \
> > -- 
> > 2.21.0
> > 
> > 
> 
> - Paul
Atish Patra Aug. 15, 2019, 8:37 p.m. UTC | #13
On Tue, 2019-08-13 at 07:30 -0700, hch@infradead.org wrote:
> On Tue, Aug 13, 2019 at 12:15:15AM +0000, Atish Patra wrote:
> > I thought if it receives an empty cpumask, then it should at least
> > do a
> > local flush is the expected behavior. Are you saying that we should
> > just skip all and return ? 
> 
> How could we ever receive an empty cpu mask?  I think it could only
> be empty without the current cpu.  At least that is my reading of
> the callers and a few other implementations.
> 

We get ton of them. Here is the stack dump.

[   16.735814] [<ffffffe000035498>] walk_stackframe+0x0/0xa0^M
298436 [   16.819037] [<ffffffe0000355f8>] show_stack+0x2a/0x34^M
298437 [   16.899648] [<ffffffe00067b54c>] dump_stack+0x62/0x7c^M
298438 [   16.977402] [<ffffffe0000ef6f6>] tlb_flush_mmu+0x14a/0x150^M
298439 [   17.054197] [<ffffffe0000ef7a4>] tlb_finish_mmu+0x42/0x72^M
298440 [   17.129986] [<ffffffe0000ede7c>] exit_mmap+0x8e/0xfa^M
298441 [   17.203669] [<ffffffe000037d54>] mmput.part.3+0x1a/0xc4^M
298442 [   17.275985] [<ffffffe000037e1e>] mmput+0x20/0x28^M
298443 [   17.345248] [<ffffffe0001143c2>] flush_old_exec+0x418/0x5f8^M
298444 [   17.415370] [<ffffffe000158408>]
load_elf_binary+0x262/0xc84^M
298445 [   17.483641] [<ffffffe000114614>]
search_binary_handler.part.7+0x72/0x172^M
298446 [   17.552078] [<ffffffe000114bb2>]
__do_execve_file+0x40c/0x56a^M
298447 [   17.617932] [<ffffffe00011503e>] sys_execve+0x26/0x32^M
298448 [   17.682164] [<ffffffe00003437e>] ret_from_syscall+0x0/0xe^M

It looks like it is in the path of clearing the old traces of already
run script or program.  I am not sure if the cpumask is supposed to be
empty in this path.

Probably we should just issue tlb flush all for all CPUs instead of
just the local CPU.

> > > 	if (!cpumask || cpumask_test_cpu(cpu, cpumask) {
> > > 		if ((start == 0 && size == -1) || size > PAGE_SIZE)
> > > 			local_flush_tlb_all();
> > > 		else if (size == PAGE_SIZE)
> > > 			local_flush_tlb_page(start);
> > > 		cpumask_clear_cpu(cpuid, cpumask);
> > 
> > This will modify the original cpumask which is not correct. clear
> > part
> > has to be done on hmask to avoid a copy.
> 
> Indeed.  But looking at the x86 tlbflush implementation it seems like
> we
> could use cpumask_any_but() to avoid having to modify the passed in
> cpumask.

Looking at the x86 code, it uses cpumask_any_but to just test if there
any other cpu present apart from the current one.

If yes, it calls smp_call_function_many which ignores the current cpu
and execute tlb flush code on all other cpus.

For RISC-V, it has to still send the cpumask containing local cpu and
M-mode software may do a local tlb flush again for no reason.


Regards,
Atish
Christoph Hellwig Aug. 19, 2019, 2:46 p.m. UTC | #14
On Thu, Aug 15, 2019 at 08:37:04PM +0000, Atish Patra wrote:
> We get ton of them. Here is the stack dump.

Looks like we might not need to flush anything at all here as the
mm_struct was never scheduled to run on any cpu?
Anup Patel Aug. 19, 2019, 3:09 p.m. UTC | #15
On Mon, Aug 19, 2019 at 8:16 PM hch@infradead.org <hch@infradead.org> wrote:
>
> On Thu, Aug 15, 2019 at 08:37:04PM +0000, Atish Patra wrote:
> > We get ton of them. Here is the stack dump.
>
> Looks like we might not need to flush anything at all here as the
> mm_struct was never scheduled to run on any cpu?

If we were using ASID then yes we don't need to flush anything
but currently we don't use ASID due to lack of HW support and
HW can certainly do speculatively page table walks so flushing
local TLB when MM mask is empty might help.

This just my theory and we need to stress test more.

Regards,
Anup
Christoph Hellwig Aug. 19, 2019, 3:10 p.m. UTC | #16
On Mon, Aug 19, 2019 at 08:39:02PM +0530, Anup Patel wrote:
> If we were using ASID then yes we don't need to flush anything
> but currently we don't use ASID due to lack of HW support and
> HW can certainly do speculatively page table walks so flushing
> local TLB when MM mask is empty might help.
> 
> This just my theory and we need to stress test more.

Well, when we context switch away from a mm we always flush the
local tlb.  So either the mm_struct has never been scheduled in,
or we already did a local_tlb_flush and we context switched it up.
Atish Patra Aug. 20, 2019, 12:02 a.m. UTC | #17
On Mon, 2019-08-19 at 08:10 -0700, hch@infradead.org wrote:
> On Mon, Aug 19, 2019 at 08:39:02PM +0530, Anup Patel wrote:
> > If we were using ASID then yes we don't need to flush anything
> > but currently we don't use ASID due to lack of HW support and
> > HW can certainly do speculatively page table walks so flushing
> > local TLB when MM mask is empty might help.
> > 
> > This just my theory and we need to stress test more.
> 
> Well, when we context switch away from a mm we always flush the
> local tlb.  So either the mm_struct has never been scheduled in,

Looking at the stack dump, it looks like this is the case. cpumask is
empty possibly after a fork/exec situation where forked child is being
replaced with actual program that is about to run.

I also looked at x86 & powerpc implementation which doesn't seem to do
anything special if cpumask is empty.

I will send a v2 with no tlb flushing if cpumask is empty.

> or we already did a local_tlb_flush and we context switched it up.


Regards,
Atish
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 687dd19735a7..b32ba4fa5888 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -8,6 +8,7 @@ 
 #define _ASM_RISCV_TLBFLUSH_H
 
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 #include <asm/smp.h>
 
 /*
@@ -46,14 +47,38 @@  static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
 				     unsigned long size)
 {
 	struct cpumask hmask;
+	struct cpumask tmask;
+	int cpuid = smp_processor_id();
 
 	cpumask_clear(&hmask);
-	riscv_cpuid_to_hartid_mask(cmask, &hmask);
-	sbi_remote_sfence_vma(hmask.bits, start, size);
+	cpumask_clear(&tmask);
+
+	if (cmask)
+		cpumask_copy(&tmask, cmask);
+	else
+		cpumask_copy(&tmask, cpu_online_mask);
+
+	if (cpumask_test_cpu(cpuid, &tmask)) {
+		/* Save trap cost by issuing a local tlb flush here */
+		if ((start == 0 && size == -1) || (size > PAGE_SIZE))
+			local_flush_tlb_all();
+		else if (size == PAGE_SIZE)
+			local_flush_tlb_page(start);
+		cpumask_clear_cpu(cpuid, &tmask);
+	} else if (cpumask_empty(&tmask)) {
+		/* cpumask is empty. So just do a local flush */
+		local_flush_tlb_all();
+		return;
+	}
+
+	if (!cpumask_empty(&tmask)) {
+		riscv_cpuid_to_hartid_mask(&tmask, &hmask);
+		sbi_remote_sfence_vma(hmask.bits, start, size);
+	}
 }
 
-#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
-#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
+#define flush_tlb_all() remote_sfence_vma(NULL, 0, -1)
+#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, (addr) + PAGE_SIZE)
 #define flush_tlb_range(vma, start, end) \
 	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
 #define flush_tlb_mm(mm) \