diff mbox series

MIPS: Loongson64: Switch to SYNC_R4K

Message ID 20240714-loongson64-cevt-r4k-v1-1-98afed7260aa@flygoat.com (mailing list archive)
State Accepted
Commit fa165f919016829e542e37782a3452512dffa5ea
Headers show
Series MIPS: Loongson64: Switch to SYNC_R4K | expand

Commit Message

Jiaxun Yang July 14, 2024, 2:41 a.m. UTC
Nowadays SYNC_R4K is performing better than Loongson64's
custom sync mechanism.

Switch to SYNC_R4K to improve performance and reduce code
duplication.

Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---
Last minute for 6.11 :-)
---
 arch/mips/Kconfig           |  1 +
 arch/mips/include/asm/smp.h |  1 -
 arch/mips/loongson64/smp.c  | 35 ++---------------------------------
 3 files changed, 3 insertions(+), 34 deletions(-)


---
base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233
change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c

Best regards,

Comments

Huacai Chen July 14, 2024, 2:54 a.m. UTC | #1
On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
>
> Nowadays SYNC_R4K is performing better than Loongson64's
> custom sync mechanism.
The precision of Loongson64's mechanism is significantly better than that of SYNC_R4K.

Huacai

>
> Switch to SYNC_R4K to improve performance and reduce code
> duplication.
>
> Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> ---
> Last minute for 6.11 :-)
> ---
>  arch/mips/Kconfig           |  1 +
>  arch/mips/include/asm/smp.h |  1 -
>  arch/mips/loongson64/smp.c  | 35 ++---------------------------------
>  3 files changed, 3 insertions(+), 34 deletions(-)
>
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 1236ea122061..e163059dd4d3 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -478,6 +478,7 @@ config MACH_LOONGSON64
>         select BOARD_SCACHE
>         select CSRC_R4K
>         select CEVT_R4K
> +       select SYNC_R4K
>         select FORCE_PCI
>         select ISA
>         select I8259
> diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
> index bc2c240f414b..2427d76f953f 100644
> --- a/arch/mips/include/asm/smp.h
> +++ b/arch/mips/include/asm/smp.h
> @@ -50,7 +50,6 @@ extern int __cpu_logical_map[NR_CPUS];
>  #define SMP_CALL_FUNCTION      0x2
>  /* Octeon - Tell another core to flush its icache */
>  #define SMP_ICACHE_FLUSH       0x4
> -#define SMP_ASK_C0COUNT                0x8
>
>  /* Mask of CPUs which are currently definitely operating coherently */
>  extern cpumask_t cpu_coherent_mask;
> diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c
> index 66d049cdcf14..147acd972a07 100644
> --- a/arch/mips/loongson64/smp.c
> +++ b/arch/mips/loongson64/smp.c
> @@ -33,7 +33,6 @@ static void __iomem *ipi_clear0_regs[16];
>  static void __iomem *ipi_status0_regs[16];
>  static void __iomem *ipi_en0_regs[16];
>  static void __iomem *ipi_mailbox_buf[16];
> -static uint32_t core0_c0count[NR_CPUS];
>
>  static u32 (*ipi_read_clear)(int cpu);
>  static void (*ipi_write_action)(int cpu, u32 action);
> @@ -382,11 +381,10 @@ loongson3_send_ipi_mask(const struct cpumask *mask, unsigned int action)
>                 ipi_write_action(cpu_logical_map(i), (u32)action);
>  }
>
> -
>  static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id)
>  {
> -       int i, cpu = smp_processor_id();
> -       unsigned int action, c0count;
> +       int cpu = smp_processor_id();
> +       unsigned int action;
>
>         action = ipi_read_clear(cpu);
>
> @@ -399,26 +397,14 @@ static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id)
>                 irq_exit();
>         }
>
> -       if (action & SMP_ASK_C0COUNT) {
> -               BUG_ON(cpu != 0);
> -               c0count = read_c0_count();
> -               c0count = c0count ? c0count : 1;
> -               for (i = 1; i < nr_cpu_ids; i++)
> -                       core0_c0count[i] = c0count;
> -               nudge_writes(); /* Let others see the result ASAP */
> -       }
> -
>         return IRQ_HANDLED;
>  }
>
> -#define MAX_LOOPS 800
>  /*
>   * SMP init and finish on secondary CPUs
>   */
>  static void loongson3_init_secondary(void)
>  {
> -       int i;
> -       uint32_t initcount;
>         unsigned int cpu = smp_processor_id();
>         unsigned int imask = STATUSF_IP7 | STATUSF_IP6 |
>                              STATUSF_IP3 | STATUSF_IP2;
> @@ -432,23 +418,6 @@ static void loongson3_init_secondary(void)
>                      cpu_logical_map(cpu) % loongson_sysconf.cores_per_package);
>         cpu_data[cpu].package =
>                 cpu_logical_map(cpu) / loongson_sysconf.cores_per_package;
> -
> -       i = 0;
> -       core0_c0count[cpu] = 0;
> -       loongson3_send_ipi_single(0, SMP_ASK_C0COUNT);
> -       while (!core0_c0count[cpu]) {
> -               i++;
> -               cpu_relax();
> -       }
> -
> -       if (i > MAX_LOOPS)
> -               i = MAX_LOOPS;
> -       if (cpu_data[cpu].package)
> -               initcount = core0_c0count[cpu] + i;
> -       else /* Local access is faster for loops */
> -               initcount = core0_c0count[cpu] + i/2;
> -
> -       write_c0_count(initcount);
>  }
>
>  static void loongson3_smp_finish(void)
>
> ---
> base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233
> change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c
>
> Best regards,
> --
> Jiaxun Yang <jiaxun.yang@flygoat.com>
>
Jiaxun Yang July 14, 2024, 3 a.m. UTC | #2
在2024年7月14日七月 上午10:54,Huacai Chen写道:
> On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
>>
>> Nowadays SYNC_R4K is performing better than Loongson64's
>> custom sync mechanism.
> The precision of Loongson64's mechanism is significantly better than that of SYNC_R4K.

My updated implementation[1] uses a multi-pass regression methodology
which can compensate for communication overhead. I measured the delta
with EJTAG DINT on Loongson-2K, and it can be reduced to as low as 4
cycles, while ASK_C0_COUNT has a delta of ~50 cycles.

[1]: https://lore.kernel.org/linux-mips/20240511-mips-clks-v1-0-ddb4a10ee9f9@flygoat.com/

Thanks
- Jiaxun

>
> Huacai
Jiaxun Yang July 14, 2024, 3:02 a.m. UTC | #3
在2024年7月14日七月 上午11:00,Jiaxun Yang写道:
> 在2024年7月14日七月 上午10:54,Huacai Chen写道:
>> On Sun, Jul 14, 2024 at 10:41 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
>>>
>>> Nowadays SYNC_R4K is performing better than Loongson64's
>>> custom sync mechanism.
>> The precision of Loongson64's mechanism is significantly better than that of SYNC_R4K.
>
> My updated implementation[1] uses a multi-pass regression methodology
> which can compensate for communication overhead. I measured the delta
> with EJTAG DINT on Loongson-2K, and it can be reduced to as low as 4
> cycles, while ASK_C0_COUNT has a delta of ~50 cycles.
>
> [1]: 
> https://lore.kernel.org/linux-mips/20240511-mips-clks-v1-0-ddb4a10ee9f9@flygoat.com/

This is currently in mips-next.

>
> Thanks
> - Jiaxun
>
>>
>> Huacai
>
>
> -- 
> - Jiaxun
Jiaxun Yang July 18, 2024, 7:34 a.m. UTC | #4
在2024年7月14日七月 上午10:41,Jiaxun Yang写道:
> Nowadays SYNC_R4K is performing better than Loongson64's
> custom sync mechanism.
>
> Switch to SYNC_R4K to improve performance and reduce code
> duplication.
>
> Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> ---
> Last minute for 6.11 :-)

Hi Thomas,

Could you please apply this to 6.11 PR, or 6.11 fixes?

This is technically a leftover from the previous clock source series, and it does help
prevent random RCU stalls on multi-node Loongson-3 systems.

Thanks
- Jiaxun

> ---
>  arch/mips/Kconfig           |  1 +
>  arch/mips/include/asm/smp.h |  1 -
>  arch/mips/loongson64/smp.c  | 35 ++---------------------------------
>  3 files changed, 3 insertions(+), 34 deletions(-)
>
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 1236ea122061..e163059dd4d3 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -478,6 +478,7 @@ config MACH_LOONGSON64
>  	select BOARD_SCACHE
>  	select CSRC_R4K
>  	select CEVT_R4K
> +	select SYNC_R4K
>  	select FORCE_PCI
>  	select ISA
>  	select I8259
> diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
> index bc2c240f414b..2427d76f953f 100644
> --- a/arch/mips/include/asm/smp.h
> +++ b/arch/mips/include/asm/smp.h
> @@ -50,7 +50,6 @@ extern int __cpu_logical_map[NR_CPUS];
>  #define SMP_CALL_FUNCTION	0x2
>  /* Octeon - Tell another core to flush its icache */
>  #define SMP_ICACHE_FLUSH	0x4
> -#define SMP_ASK_C0COUNT		0x8
> 
>  /* Mask of CPUs which are currently definitely operating coherently */
>  extern cpumask_t cpu_coherent_mask;
> diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c
> index 66d049cdcf14..147acd972a07 100644
> --- a/arch/mips/loongson64/smp.c
> +++ b/arch/mips/loongson64/smp.c
> @@ -33,7 +33,6 @@ static void __iomem *ipi_clear0_regs[16];
>  static void __iomem *ipi_status0_regs[16];
>  static void __iomem *ipi_en0_regs[16];
>  static void __iomem *ipi_mailbox_buf[16];
> -static uint32_t core0_c0count[NR_CPUS];
> 
>  static u32 (*ipi_read_clear)(int cpu);
>  static void (*ipi_write_action)(int cpu, u32 action);
> @@ -382,11 +381,10 @@ loongson3_send_ipi_mask(const struct cpumask 
> *mask, unsigned int action)
>  		ipi_write_action(cpu_logical_map(i), (u32)action);
>  }
> 
> -
>  static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id)
>  {
> -	int i, cpu = smp_processor_id();
> -	unsigned int action, c0count;
> +	int cpu = smp_processor_id();
> +	unsigned int action;
> 
>  	action = ipi_read_clear(cpu);
> 
> @@ -399,26 +397,14 @@ static irqreturn_t loongson3_ipi_interrupt(int 
> irq, void *dev_id)
>  		irq_exit();
>  	}
> 
> -	if (action & SMP_ASK_C0COUNT) {
> -		BUG_ON(cpu != 0);
> -		c0count = read_c0_count();
> -		c0count = c0count ? c0count : 1;
> -		for (i = 1; i < nr_cpu_ids; i++)
> -			core0_c0count[i] = c0count;
> -		nudge_writes(); /* Let others see the result ASAP */
> -	}
> -
>  	return IRQ_HANDLED;
>  }
> 
> -#define MAX_LOOPS 800
>  /*
>   * SMP init and finish on secondary CPUs
>   */
>  static void loongson3_init_secondary(void)
>  {
> -	int i;
> -	uint32_t initcount;
>  	unsigned int cpu = smp_processor_id();
>  	unsigned int imask = STATUSF_IP7 | STATUSF_IP6 |
>  			     STATUSF_IP3 | STATUSF_IP2;
> @@ -432,23 +418,6 @@ static void loongson3_init_secondary(void)
>  		     cpu_logical_map(cpu) % loongson_sysconf.cores_per_package);
>  	cpu_data[cpu].package =
>  		cpu_logical_map(cpu) / loongson_sysconf.cores_per_package;
> -
> -	i = 0;
> -	core0_c0count[cpu] = 0;
> -	loongson3_send_ipi_single(0, SMP_ASK_C0COUNT);
> -	while (!core0_c0count[cpu]) {
> -		i++;
> -		cpu_relax();
> -	}
> -
> -	if (i > MAX_LOOPS)
> -		i = MAX_LOOPS;
> -	if (cpu_data[cpu].package)
> -		initcount = core0_c0count[cpu] + i;
> -	else /* Local access is faster for loops */
> -		initcount = core0_c0count[cpu] + i/2;
> -
> -	write_c0_count(initcount);
>  }
> 
>  static void loongson3_smp_finish(void)
>
> ---
> base-commit: 0b58e108042b0ed28a71cd7edf5175999955b233
> change-id: 20240714-loongson64-cevt-r4k-eb74d4ad984c
>
> Best regards,
> -- 
> Jiaxun Yang <jiaxun.yang@flygoat.com>
Thomas Bogendoerfer July 18, 2024, 5:28 p.m. UTC | #5
On Thu, Jul 18, 2024 at 03:34:30PM +0800, Jiaxun Yang wrote:
> 
> 
> 在2024年7月14日七月 上午10:41,Jiaxun Yang写道:
> > Nowadays SYNC_R4K is performing better than Loongson64's
> > custom sync mechanism.
> >
> > Switch to SYNC_R4K to improve performance and reduce code
> > duplication.
> >
> > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> > ---
> > Last minute for 6.11 :-)
> 
> Hi Thomas,
> 
> Could you please apply this to 6.11 PR, or 6.11 fixes?
> 
> This is technically a leftover from the previous clock source series, and it does help
> prevent random RCU stalls on multi-node Loongson-3 systems.

if Huacai is ok with it, I'll add it to a second PR for 6.11.

Thomas.
Huacai Chen July 23, 2024, 2:53 a.m. UTC | #6
Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>

On Fri, Jul 19, 2024 at 1:29 AM Thomas Bogendoerfer
<tsbogend@alpha.franken.de> wrote:
>
> On Thu, Jul 18, 2024 at 03:34:30PM +0800, Jiaxun Yang wrote:
> >
> >
> > 在2024年7月14日七月 上午10:41,Jiaxun Yang写道:
> > > Nowadays SYNC_R4K is performing better than Loongson64's
> > > custom sync mechanism.
> > >
> > > Switch to SYNC_R4K to improve performance and reduce code
> > > duplication.
> > >
> > > Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> > > ---
> > > Last minute for 6.11 :-)
> >
> > Hi Thomas,
> >
> > Could you please apply this to 6.11 PR, or 6.11 fixes?
> >
> > This is technically a leftover from the previous clock source series, and it does help
> > prevent random RCU stalls on multi-node Loongson-3 systems.
>
> if Huacai is ok with it, I'll add it to a second PR for 6.11.
>
> Thomas.
>
> --
> Crap can work. Given enough thrust pigs will fly, but it's not necessarily a
> good idea.                                                [ RFC1925, 2.3 ]
Thomas Bogendoerfer July 23, 2024, 7:32 a.m. UTC | #7
On Sun, Jul 14, 2024 at 10:41:05AM +0800, Jiaxun Yang wrote:
> Nowadays SYNC_R4K is performing better than Loongson64's
> custom sync mechanism.
> 
> Switch to SYNC_R4K to improve performance and reduce code
> duplication.
> 
> Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> ---
> Last minute for 6.11 :-)
> ---
>  arch/mips/Kconfig           |  1 +
>  arch/mips/include/asm/smp.h |  1 -
>  arch/mips/loongson64/smp.c  | 35 ++---------------------------------
>  3 files changed, 3 insertions(+), 34 deletions(-)

applied to mips-next.

Thomas.
diff mbox series

Patch

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 1236ea122061..e163059dd4d3 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -478,6 +478,7 @@  config MACH_LOONGSON64
 	select BOARD_SCACHE
 	select CSRC_R4K
 	select CEVT_R4K
+	select SYNC_R4K
 	select FORCE_PCI
 	select ISA
 	select I8259
diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
index bc2c240f414b..2427d76f953f 100644
--- a/arch/mips/include/asm/smp.h
+++ b/arch/mips/include/asm/smp.h
@@ -50,7 +50,6 @@  extern int __cpu_logical_map[NR_CPUS];
 #define SMP_CALL_FUNCTION	0x2
 /* Octeon - Tell another core to flush its icache */
 #define SMP_ICACHE_FLUSH	0x4
-#define SMP_ASK_C0COUNT		0x8
 
 /* Mask of CPUs which are currently definitely operating coherently */
 extern cpumask_t cpu_coherent_mask;
diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c
index 66d049cdcf14..147acd972a07 100644
--- a/arch/mips/loongson64/smp.c
+++ b/arch/mips/loongson64/smp.c
@@ -33,7 +33,6 @@  static void __iomem *ipi_clear0_regs[16];
 static void __iomem *ipi_status0_regs[16];
 static void __iomem *ipi_en0_regs[16];
 static void __iomem *ipi_mailbox_buf[16];
-static uint32_t core0_c0count[NR_CPUS];
 
 static u32 (*ipi_read_clear)(int cpu);
 static void (*ipi_write_action)(int cpu, u32 action);
@@ -382,11 +381,10 @@  loongson3_send_ipi_mask(const struct cpumask *mask, unsigned int action)
 		ipi_write_action(cpu_logical_map(i), (u32)action);
 }
 
-
 static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id)
 {
-	int i, cpu = smp_processor_id();
-	unsigned int action, c0count;
+	int cpu = smp_processor_id();
+	unsigned int action;
 
 	action = ipi_read_clear(cpu);
 
@@ -399,26 +397,14 @@  static irqreturn_t loongson3_ipi_interrupt(int irq, void *dev_id)
 		irq_exit();
 	}
 
-	if (action & SMP_ASK_C0COUNT) {
-		BUG_ON(cpu != 0);
-		c0count = read_c0_count();
-		c0count = c0count ? c0count : 1;
-		for (i = 1; i < nr_cpu_ids; i++)
-			core0_c0count[i] = c0count;
-		nudge_writes(); /* Let others see the result ASAP */
-	}
-
 	return IRQ_HANDLED;
 }
 
-#define MAX_LOOPS 800
 /*
  * SMP init and finish on secondary CPUs
  */
 static void loongson3_init_secondary(void)
 {
-	int i;
-	uint32_t initcount;
 	unsigned int cpu = smp_processor_id();
 	unsigned int imask = STATUSF_IP7 | STATUSF_IP6 |
 			     STATUSF_IP3 | STATUSF_IP2;
@@ -432,23 +418,6 @@  static void loongson3_init_secondary(void)
 		     cpu_logical_map(cpu) % loongson_sysconf.cores_per_package);
 	cpu_data[cpu].package =
 		cpu_logical_map(cpu) / loongson_sysconf.cores_per_package;
-
-	i = 0;
-	core0_c0count[cpu] = 0;
-	loongson3_send_ipi_single(0, SMP_ASK_C0COUNT);
-	while (!core0_c0count[cpu]) {
-		i++;
-		cpu_relax();
-	}
-
-	if (i > MAX_LOOPS)
-		i = MAX_LOOPS;
-	if (cpu_data[cpu].package)
-		initcount = core0_c0count[cpu] + i;
-	else /* Local access is faster for loops */
-		initcount = core0_c0count[cpu] + i/2;
-
-	write_c0_count(initcount);
 }
 
 static void loongson3_smp_finish(void)