Message ID | 20240714-loongson64-cevt-r4k-v1-1-98afed7260aa@flygoat.com (mailing list archive) |
---|---|
State | Accepted |
Commit | fa165f919016829e542e37782a3452512dffa5ea |
Headers | show |
Series | MIPS: Loongson64: Switch to SYNC_R4K | expand |
On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote: > > Nowadays SYNC_R4K is performing better than Loongson64's > custom sync mechanism. Loongson64's preciseness is significantly better than SYNC_R4K. Huacai > > Switch to SYNC_R4K to improve performance and reduce code > duplication. > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> > --- > Last minute for 6.11 :-) > --- > arch/mips/Kconfig | 1 + > arch/mips/include/asm/smp.h | 1 - > arch/mips/loongson64/smp.c | 35 ++--------------------------------- > 3 files changed, 3 insertions(+), 34 deletions(-) > > diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig > index 1236ea122061..e163059dd4d3 100644 > --- a/arch/mips/Kconfig > +++ b/arch/mips/Kconfig > @@ -478,6 +478,7 @@ config MACH_LOONGSON64 > select BOARD_SCACHE > select CSRC_R4K > select CEVT_R4K > + select SYNC_R4K > select FORCE_PCI > select ISA > select I8259 > diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h > index bc2c240f414b..2427d76f953f 100644 > --- a/arch/mips/include/asm/smp.h > +++ b/arch/mips/include/asm/smp.h > @@ -50,7 +50,6 @@ extern int __cpu_logical_map[NR_CPUS]; > #define SMP_CALL_FUNCTION 0x2 > /* Octeon - Tell another core to flush its icache */ > #define SMP_ICACHE_FLUSH 0x4 > -#define SMP_ASK_C0COUNT 0x8 > > /* Mask of CPUs which are currently definitely operating coherently */ > extern cpumask_t cpu_coherent_mask; > diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c > index 66d049cdcf14..147acd972a07 100644 > --- a/arch/mips/loongson64/smp.c > +++ b/arch/mips/loongson64/smp.c > @@ -33,7 +33,6 @@ static void __iomem *ipi_clear0_regs[16]; > static void __iomem *ipi_status0_regs[16]; > static void __iomem *ipi_en0_regs[16]; > static void __iomem *ipi_mailbox_buf[16]; > -static uint32_t core0_c0count[NR_CPUS]; > > static u32 (*ipi_read_clear)(int cpu); > static void (*ipi_write_action)(int cpu, u32 action); > @@ -382,11 +381,10 @@ loongson3_send_ipi_mask(const struct 
cpumask *mask, unsigned int action) > ipi_write_action(cpu_logical_map(i), (u32)action); > } > > - > static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id) > { > - int i, cpu = smp_processor_id(); > - unsigned int action, c0count; > + int cpu = smp_processor_id(); > + unsigned int action; > > action = ipi_read_clear(cpu); > > @@ -399,26 +397,14 @@ static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id) > irq_exit(); > } > > - if (action & SMP_ASK_C0COUNT) { > - BUG_ON(cpu != 0); > - c0count = read_c0_count(); > - c0count = c0count ? c0count : 1; > - for (i = 1; i < nr_cpu_ids; i++) > - core0_c0count[i] = c0count; > - nudge_writes(); /* Let others see the result ASAP */ > - } > - > return IRQ_HANDLED; > } > > -#define MAX_LOOPS 800 > /* > * SMP init and finish on secondary CPUs > */ > static void loongson3_init_secondary(void) > { > - int i; > - uint32_t initcount; > unsigned int cpu = smp_processor_id(); > unsigned int imask = STATUSF_IP7 | STATUSF_IP6 | > STATUSF_IP3 | STATUSF_IP2; > @@ -432,23 +418,6 @@ static void loongson3_init_secondary(void) > cpu_logical_map(cpu) % loongson_sysconf.cores_per_package); > cpu_data[cpu].package = > cpu_logical_map(cpu) / loongson_sysconf.cores_per_package; > - > - i = 0; > - core0_c0count[cpu] = 0; > - loongson3_send_ipi_single(0, SMP_ASK_C0COUNT); > - while (!core0_c0count[cpu]) { > - i++; > - cpu_relax(); > - } > - > - if (i > MAX_LOOPS) > - i = MAX_LOOPS; > - if (cpu_data[cpu].package) > - initcount = core0_c0count[cpu] + i; > - else /* Local access is faster for loops */ > - initcount = core0_c0count[cpu] + i/2; > - > - write_c0_count(initcount); > } > > static void loongson3_smp_finish(void) > > --- > base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233 > change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c > > Best regards, > -- > Jiaxun Yang <jiaxun.yang@flygoat.com> >
在2024年7月14日七月 上午10:54,Huacai Chen写道: > On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote: >> >> Nowadays SYNC_R4K is performing better than Loongson64's >> custom sync mechanism. > Loongson64's preciseness is significantly better than SYNC_R4K. My updated implementation[1] uses a multi-pass regression methodology which can compensate for communication overhead. I had measured delta with EJTAG DINT on Loongson-2K and it can be reduced to as low as 4 cycles while ASK_C0_COUNT has ~50 cycles delta. [1]: https://lore.kernel.org/linux-mips/20240511-mips-clks-v1-0-ddb4a10ee9f9@flygoat.com/ Thanks - Jiaxun > > Huacai
在2024年7月14日七月 上午11:00,Jiaxun Yang写道: > 在2024年7月14日七月 上午10:54,Huacai Chen写道: >> On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote: >>> >>> Nowadays SYNC_R4K is performing better than Loongson64's >>> custom sync mechanism. >> Loongson64's preciseness is significantly better than SYNC_R4K. > > My updated implementation[1] uses a multi-pass regression methodology > which can compensite communication overhead. I had measured delta > with EJTAG DINT on Loongson-2K and it can be reduced to as low as 4 > cycles while ASK_C0_COUNT has ~50 cycles delta. > > [1]: > https://lore.kernel.org/linux-mips/20240511-mips-clks-v1-0-ddb4a10ee9f9@flygoat.com/ This is currently in mips-next. > > Thanks > - Jiaxun > >> >> Huacai > > > -- > - Jiaxun
在2024年7月14日七月 上午10:41,Jiaxun Yang写道: > Nowadays SYNC_R4K is performing better than Loongson64's > custom sync mechanism. > > Switch to SYNC_R4K to improve performance and reduce code > duplication. > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> > --- > Last minute for 6.11 :-) Hi Thomas, Could you please apply this to 6.11 PR, or 6.11 fixes? This is technically a left over of previous clock source series, and it does help on preventing random RCU stall for multi-node Loongson-3 systems. Thanks - Jiaxun > --- > arch/mips/Kconfig | 1 + > arch/mips/include/asm/smp.h | 1 - > arch/mips/loongson64/smp.c | 35 ++--------------------------------- > 3 files changed, 3 insertions(+), 34 deletions(-) > > diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig > index 1236ea122061..e163059dd4d3 100644 > --- a/arch/mips/Kconfig > +++ b/arch/mips/Kconfig > @@ -478,6 +478,7 @@ config MACH_LOONGSON64 > select BOARD_SCACHE > select CSRC_R4K > select CEVT_R4K > + select SYNC_R4K > select FORCE_PCI > select ISA > select I8259 > diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h > index bc2c240f414b..2427d76f953f 100644 > --- a/arch/mips/include/asm/smp.h > +++ b/arch/mips/include/asm/smp.h > @@ -50,7 +50,6 @@ extern int __cpu_logical_map[NR_CPUS]; > #define SMP_CALL_FUNCTION 0x2 > /* Octeon - Tell another core to flush its icache */ > #define SMP_ICACHE_FLUSH 0x4 > -#define SMP_ASK_C0COUNT 0x8 > > /* Mask of CPUs which are currently definitely operating coherently */ > extern cpumask_t cpu_coherent_mask; > diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c > index 66d049cdcf14..147acd972a07 100644 > --- a/arch/mips/loongson64/smp.c > +++ b/arch/mips/loongson64/smp.c > @@ -33,7 +33,6 @@ static void __iomem *ipi_clear0_regs[16]; > static void __iomem *ipi_status0_regs[16]; > static void __iomem *ipi_en0_regs[16]; > static void __iomem *ipi_mailbox_buf[16]; > -static uint32_t core0_c0count[NR_CPUS]; > > static u32 (*ipi_read_clear)(int cpu); > 
static void (*ipi_write_action)(int cpu, u32 action); > @@ -382,11 +381,10 @@ loongson3_send_ipi_mask(const struct cpumask > *mask, unsigned int action) > ipi_write_action(cpu_logical_map(i), (u32)action); > } > > - > static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id) > { > - int i, cpu = smp_processor_id(); > - unsigned int action, c0count; > + int cpu = smp_processor_id(); > + unsigned int action; > > action = ipi_read_clear(cpu); > > @@ -399,26 +397,14 @@ static irqreturn_t loongson3_ipi_interrupt(int > irq, void *dev_id) > irq_exit(); > } > > - if (action & SMP_ASK_C0COUNT) { > - BUG_ON(cpu != 0); > - c0count = read_c0_count(); > - c0count = c0count ? c0count : 1; > - for (i = 1; i < nr_cpu_ids; i++) > - core0_c0count[i] = c0count; > - nudge_writes(); /* Let others see the result ASAP */ > - } > - > return IRQ_HANDLED; > } > > -#define MAX_LOOPS 800 > /* > * SMP init and finish on secondary CPUs > */ > static void loongson3_init_secondary(void) > { > - int i; > - uint32_t initcount; > unsigned int cpu = smp_processor_id(); > unsigned int imask = STATUSF_IP7 | STATUSF_IP6 | > STATUSF_IP3 | STATUSF_IP2; > @@ -432,23 +418,6 @@ static void loongson3_init_secondary(void) > cpu_logical_map(cpu) % loongson_sysconf.cores_per_package); > cpu_data[cpu].package = > cpu_logical_map(cpu) / loongson_sysconf.cores_per_package; > - > - i = 0; > - core0_c0count[cpu] = 0; > - loongson3_send_ipi_single(0, SMP_ASK_C0COUNT); > - while (!core0_c0count[cpu]) { > - i++; > - cpu_relax(); > - } > - > - if (i > MAX_LOOPS) > - i = MAX_LOOPS; > - if (cpu_data[cpu].package) > - initcount = core0_c0count[cpu] + i; > - else /* Local access is faster for loops */ > - initcount = core0_c0count[cpu] + i/2; > - > - write_c0_count(initcount); > } > > static void loongson3_smp_finish(void) > > --- > base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233 > change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c > > Best regards, > -- > Jiaxun Yang <jiaxun.yang@flygoat.com>
On Thu, Jul 18, 2024 at 03:34:30PM +0800, Jiaxun Yang wrote: > > > 在2024年7月14日七月 上午10:41,Jiaxun Yang写道: > > Nowadays SYNC_R4K is performing better than Loongson64's > > custom sync mechanism. > > > > Switch to SYNC_R4K to improve performance and reduce code > > duplication. > > > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> > > --- > > Last minute for 6.11 :-) > > Hi Thomas, > > Could you please apply this to 6.11 PR, or 6.11 fixes? > > This is technically a left over of previous clock source series, and it does help > on preventing random RCU stall for multi-node Loongson-3 systems. If Huacai is OK with it, I'll add it to a second PR for 6.11. Thomas.
Reviewed-by: Huacai Chen <chenhuacai@loongson.cn> On Fri, Jul 19, 2024 at 1:29 AM Thomas Bogendoerfer <tsbogend@alpha.franken.de> wrote: > > On Thu, Jul 18, 2024 at 03:34:30PM +0800, Jiaxun Yang wrote: > > > > > > 在2024年7月14日七月 上午10:41,Jiaxun Yang写道: > > > Nowadays SYNC_R4K is performing better than Loongson64's > > > custom sync mechanism. > > > > > > Switch to SYNC_R4K to improve performance and reduce code > > > duplication. > > > > > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> > > > --- > > > Last minute for 6.11 :-) > > > > Hi Thomas, > > > > Could you please apply this to 6.11 PR, or 6.11 fixes? > > > > This is technically a left over of previous clock source series, and it does help > > on preventing random RCU stall for multi-node Loongson-3 systems. > > if Huacai is ok with it, I'll add it to a second PR for 6.11. > > Thomas. > > -- > Crap can work. Given enough thrust pigs will fly, but it's not necessarily a > good idea. [ RFC1925, 2.3 ]
On Sun, Jul 14, 2024 at 10:41:05AM +0800, Jiaxun Yang wrote: > Nowadays SYNC_R4K is performing better than Loongson64's > custom sync mechanism. > > Switch to SYNC_R4K to improve performance and reduce code > duplication. > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> > --- > Last minute for 6.11 :-) > --- > arch/mips/Kconfig | 1 + > arch/mips/include/asm/smp.h | 1 - > arch/mips/loongson64/smp.c | 35 ++--------------------------------- > 3 files changed, 3 insertions(+), 34 deletions(-) applied to mips-next. Thomas.
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 1236ea122061..e163059dd4d3 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -478,6 +478,7 @@ config MACH_LOONGSON64 select BOARD_SCACHE select CSRC_R4K select CEVT_R4K + select SYNC_R4K select FORCE_PCI select ISA select I8259 diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index bc2c240f414b..2427d76f953f 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -50,7 +50,6 @@ extern int __cpu_logical_map[NR_CPUS]; #define SMP_CALL_FUNCTION 0x2 /* Octeon - Tell another core to flush its icache */ #define SMP_ICACHE_FLUSH 0x4 -#define SMP_ASK_C0COUNT 0x8 /* Mask of CPUs which are currently definitely operating coherently */ extern cpumask_t cpu_coherent_mask; diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index 66d049cdcf14..147acd972a07 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -33,7 +33,6 @@ static void __iomem *ipi_clear0_regs[16]; static void __iomem *ipi_status0_regs[16]; static void __iomem *ipi_en0_regs[16]; static void __iomem *ipi_mailbox_buf[16]; -static uint32_t core0_c0count[NR_CPUS]; static u32 (*ipi_read_clear)(int cpu); static void (*ipi_write_action)(int cpu, u32 action); @@ -382,11 +381,10 @@ loongson3_send_ipi_mask(const struct cpumask *mask, unsigned int action) ipi_write_action(cpu_logical_map(i), (u32)action); } - static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id) { - int i, cpu = smp_processor_id(); - unsigned int action, c0count; + int cpu = smp_processor_id(); + unsigned int action; action = ipi_read_clear(cpu); @@ -399,26 +397,14 @@ static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id) irq_exit(); } - if (action & SMP_ASK_C0COUNT) { - BUG_ON(cpu != 0); - c0count = read_c0_count(); - c0count = c0count ? 
c0count : 1; - for (i = 1; i < nr_cpu_ids; i++) - core0_c0count[i] = c0count; - nudge_writes(); /* Let others see the result ASAP */ - } - return IRQ_HANDLED; } -#define MAX_LOOPS 800 /* * SMP init and finish on secondary CPUs */ static void loongson3_init_secondary(void) { - int i; - uint32_t initcount; unsigned int cpu = smp_processor_id(); unsigned int imask = STATUSF_IP7 | STATUSF_IP6 | STATUSF_IP3 | STATUSF_IP2; @@ -432,23 +418,6 @@ static void loongson3_init_secondary(void) cpu_logical_map(cpu) % loongson_sysconf.cores_per_package); cpu_data[cpu].package = cpu_logical_map(cpu) / loongson_sysconf.cores_per_package; - - i = 0; - core0_c0count[cpu] = 0; - loongson3_send_ipi_single(0, SMP_ASK_C0COUNT); - while (!core0_c0count[cpu]) { - i++; - cpu_relax(); - } - - if (i > MAX_LOOPS) - i = MAX_LOOPS; - if (cpu_data[cpu].package) - initcount = core0_c0count[cpu] + i; - else /* Local access is faster for loops */ - initcount = core0_c0count[cpu] + i/2; - - write_c0_count(initcount); } static void loongson3_smp_finish(void)
Nowadays SYNC_R4K is performing better than Loongson64's custom sync mechanism. Switch to SYNC_R4K to improve performance and reduce code duplication. Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com> --- Last minute for 6.11 :-) --- arch/mips/Kconfig | 1 + arch/mips/include/asm/smp.h | 1 - arch/mips/loongson64/smp.c | 35 ++--------------------------------- 3 files changed, 3 insertions(+), 34 deletions(-) --- base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233 change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c Best regards,