diff mbox series

[1/7] MIPS: sync-r4k: Rework to be many cores firendly

Message ID 20200817034701.3515721-2-jiaxun.yang@flygoat.com (mailing list archive)
State Deferred
Headers show
Series R4000 clock enhancements for Loongson | expand

Commit Message

Jiaxun Yang Aug. 17, 2020, 3:46 a.m. UTC
The original sync-r4k did a good job on reducing jitter by determine
the "next time value", but it has a limitation that when we have lots
of cores, the timewrap on CPU0 will become unaccpetable. That will also
happen when CPU Hotplug is enabled and the counter needs to be synchronised
at every plug event.

Here we reworked the whole procdure. Now the synchronise event on CPU0
is triggered by smp call function, and we won't touch the count on CPU0
at all.

Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---
 arch/mips/include/asm/r4k-timer.h |   5 --
 arch/mips/kernel/smp.c            |   2 -
 arch/mips/kernel/sync-r4k.c       | 143 +++++++++++++-----------------
 3 files changed, 61 insertions(+), 89 deletions(-)

Comments

kernel test robot Aug. 17, 2020, 6:04 a.m. UTC | #1
Hi Jiaxun,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.9-rc1 next-20200817]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jiaxun-Yang/R4000-clock-enhancements-for-Loongson/20200817-115351
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9123e3a74ec7b934a4a099e98af6a61c2f80bbf5
config: mips-allyesconfig (attached as .config)
compiler: mips-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/mips/kernel/sync-r4k.c:28:6: warning: no previous prototype for 'synchronise_count_master' [-Wmissing-prototypes]
      28 | void synchronise_count_master(void *unused)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~

# https://github.com/0day-ci/linux/commit/6c760a310d6f07efede56e732f3de2e2a944e361
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Jiaxun-Yang/R4000-clock-enhancements-for-Loongson/20200817-115351
git checkout 6c760a310d6f07efede56e732f3de2e2a944e361
vim +/synchronise_count_master +28 arch/mips/kernel/sync-r4k.c

    27	
  > 28	void synchronise_count_master(void *unused)
    29	{
    30		unsigned long flags;
    31		long delta;
    32		int i;
    33	
    34		if (atomic_read(&sync_stage) != STAGE_START)
    35			BUG();
    36	
    37		local_irq_save(flags);
    38	
    39		cur_count = read_c0_count();
    40		smp_wmb();
    41		atomic_inc(&sync_stage); /* inc to STAGE_MASTER_READY */
    42	
    43		for (i = 0; i < MAX_LOOPS; i++) {
    44			cur_count = read_c0_count();
    45			smp_wmb();
    46			if (atomic_read(&sync_stage) == STAGE_SLAVE_SYNCED)
    47				break;
    48		}
    49	
    50		delta = read_c0_count() - fini_count;
    51	
    52		local_irq_restore(flags);
    53	
    54		if (i == MAX_LOOPS)
    55			pr_err("sync-r4k: Master: synchronise timeout\n");
    56		else
    57			pr_info("sync-r4k: Master: synchronise succeed, maximum delta: %ld\n", delta);
    58	
    59		return;
    60	}
    61	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Peter Zijlstra Aug. 17, 2020, 7:55 a.m. UTC | #2
On Mon, Aug 17, 2020 at 11:46:40AM +0800, Jiaxun Yang wrote:
> Here we reworked the whole procdure. Now the synchronise event on CPU0
> is triggered by smp call function, and we won't touch the count on CPU0
> at all.

Are you telling me, that in 2020 you're building chips that need
horrible crap like this ?!?

> +#define MAX_LOOPS	1000
> +
> +void synchronise_count_master(void *unused)
>  {
>  	unsigned long flags;
> +	long delta;
> +	int i;
>  
> +	if (atomic_read(&sync_stage) != STAGE_START)
> +		BUG();

	BUG_ON(atomic_read(&sync_state) != STAGE_START);

>  
>  	local_irq_save(flags);

That's silly, replace with: lockdep_assert_hardirqs_disabled().

>  
> +	cur_count = read_c0_count();
> +	smp_wmb();
> +	atomic_inc(&sync_stage); /* inc to STAGE_MASTER_READY */

memory barriers require a comment that describes the ordering. This
includes at least 2 variables and at least 2 code paths (*) -- afaict
your code does NOT have a matching barrier, see below.

>  
> +	for (i = 0; i < MAX_LOOPS; i++) {
> +		cur_count = read_c0_count();
>  		smp_wmb();
> -		atomic_inc(&count_count_stop);
> +		if (atomic_read(&sync_stage) == STAGE_SLAVE_SYNCED)
> +			break;
>  	}
> +
> +	delta = read_c0_count() - fini_count;
>  
>  	local_irq_restore(flags);
>  
> +	if (i == MAX_LOOPS)
> +		pr_err("sync-r4k: Master: synchronise timeout\n");
> +	else
> +		pr_info("sync-r4k: Master: synchronise succeed, maximum delta: %ld\n", delta);
> +
> +	return;
>  }
>  
>  void synchronise_count_slave(int cpu)
>  {
>  	int i;
>  	unsigned long flags;
> +	call_single_data_t csd;
>  
> +	raw_spin_lock(&sync_r4k_lock);

Why should this be a raw_spnilock_t ?

>  
> +	/* Let variables get attention from cache */
> +	for (i = 0; i < MAX_LOOPS; i++) {
> +		cur_count++;
> +		fini_count += cur_count;
> +		cur_count += fini_count;
>  	}

What does this actually do? You're going to bounce those variables
between this CPU and CPU-0.

> +
> +	atomic_set(&sync_stage, STAGE_START);
> +	csd.func = synchronise_count_master;
> +
> +	/* Master count is always CPU0 */
> +	if (smp_call_function_single_async(0, &csd)) {

This is diguisting.

It also requires a comment on how the on-stack csd is correct (it is,
but it really needs a comment).

> +		pr_err("sync-r4k: Salve: Failed to call master\n");
> +		raw_spin_unlock(&sync_r4k_lock);
> +		return;
> +	}
> +
> +	local_irq_save(flags);
> +
> +	/* Wait until master ready */
> +	while (atomic_read(&sync_stage) != STAGE_MASTER_READY)
> +		cpu_relax();

This really wants to be:

	atomic_cond_read_acquire(&&sync_stage, VAL == STAGE_MASTER_READY);

Because, afaict the smp_wmb() (*) in synchronize_count_master() order
against this here and we need to guarantee we read @sync_stage _before_
@cur_count.

> +
> +	write_c0_count(cur_count);
> +	fini_count = read_c0_count();
> +	smp_wmb();
> +	atomic_inc(&sync_stage); /* inc to STAGE_SLAVE_SYNCED */
>  
>  	local_irq_restore(flags);
> +
> +	raw_spin_unlock(&sync_r4k_lock);
>  }


Furthermore, afaict there isn't actually any concurrency on @sync_stage,
so atomic_t isn't required, Using smp_store_release() to change state
might be far more natural.
kernel test robot Aug. 21, 2020, 3:32 p.m. UTC | #3
Hi Jiaxun,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.9-rc1 next-20200821]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jiaxun-Yang/R4000-clock-enhancements-for-Loongson/20200817-115351
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9123e3a74ec7b934a4a099e98af6a61c2f80bbf5
config: mips-randconfig-r014-20200820 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project b587ca93be114d07ec3bf654add97d7872325281)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install mips cross compiling tool for clang build
        # apt-get install binutils-mips-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=mips 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/mips/kernel/sync-r4k.c:28:6: warning: no previous prototype for function 'synchronise_count_master' [-Wmissing-prototypes]
   void synchronise_count_master(void *unused)
        ^
   arch/mips/kernel/sync-r4k.c:28:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void synchronise_count_master(void *unused)
   ^
   static 
   1 warning generated.

# https://github.com/0day-ci/linux/commit/6c760a310d6f07efede56e732f3de2e2a944e361
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Jiaxun-Yang/R4000-clock-enhancements-for-Loongson/20200817-115351
git checkout 6c760a310d6f07efede56e732f3de2e2a944e361
vim +/synchronise_count_master +28 arch/mips/kernel/sync-r4k.c

    27	
  > 28	void synchronise_count_master(void *unused)
    29	{
    30		unsigned long flags;
    31		long delta;
    32		int i;
    33	
    34		if (atomic_read(&sync_stage) != STAGE_START)
    35			BUG();
    36	
    37		local_irq_save(flags);
    38	
    39		cur_count = read_c0_count();
    40		smp_wmb();
    41		atomic_inc(&sync_stage); /* inc to STAGE_MASTER_READY */
    42	
    43		for (i = 0; i < MAX_LOOPS; i++) {
    44			cur_count = read_c0_count();
    45			smp_wmb();
    46			if (atomic_read(&sync_stage) == STAGE_SLAVE_SYNCED)
    47				break;
    48		}
    49	
    50		delta = read_c0_count() - fini_count;
    51	
    52		local_irq_restore(flags);
    53	
    54		if (i == MAX_LOOPS)
    55			pr_err("sync-r4k: Master: synchronise timeout\n");
    56		else
    57			pr_info("sync-r4k: Master: synchronise succeed, maximum delta: %ld\n", delta);
    58	
    59		return;
    60	}
    61	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/arch/mips/include/asm/r4k-timer.h b/arch/mips/include/asm/r4k-timer.h
index afe9e0e03fe9..2789d2fc5e23 100644
--- a/arch/mips/include/asm/r4k-timer.h
+++ b/arch/mips/include/asm/r4k-timer.h
@@ -12,15 +12,10 @@ 
 
 #ifdef CONFIG_SYNC_R4K
 
-extern void synchronise_count_master(int cpu);
 extern void synchronise_count_slave(int cpu);
 
 #else
 
-static inline void synchronise_count_master(int cpu)
-{
-}
-
 static inline void synchronise_count_slave(int cpu)
 {
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 48d84d5fcc36..ead9ac883241 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -444,8 +444,6 @@  int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 		return -EIO;
 	}
 
-	synchronise_count_master(cpu);
-
 	/* Wait for CPU to finish startup & mark itself online before return */
 	wait_for_completion(&cpu_running);
 	return 0;
diff --git a/arch/mips/kernel/sync-r4k.c b/arch/mips/kernel/sync-r4k.c
index abdd7aaa3311..c3e36d6d57fa 100644
--- a/arch/mips/kernel/sync-r4k.c
+++ b/arch/mips/kernel/sync-r4k.c
@@ -1,122 +1,101 @@ 
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Count register synchronisation.
- *
- * All CPUs will have their count registers synchronised to the CPU0 next time
- * value. This can cause a small timewarp for CPU0. All other CPU's should
- * not have done anything significant (but they may have had interrupts
- * enabled briefly - prom_smp_finish() should not be responsible for enabling
- * interrupts...)
  */
 
 #include <linux/kernel.h>
 #include <linux/irqflags.h>
 #include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
 
 #include <asm/r4k-timer.h>
 #include <linux/atomic.h>
 #include <asm/barrier.h>
 #include <asm/mipsregs.h>
 
-static unsigned int initcount = 0;
-static atomic_t count_count_start = ATOMIC_INIT(0);
-static atomic_t count_count_stop = ATOMIC_INIT(0);
+#define STAGE_START		0
+#define STAGE_MASTER_READY	1
+#define STAGE_SLAVE_SYNCED	2
 
-#define COUNTON 100
-#define NR_LOOPS 3
+static unsigned int cur_count;
+static unsigned int fini_count;
+static atomic_t sync_stage = ATOMIC_INIT(0);
+static DEFINE_RAW_SPINLOCK(sync_r4k_lock);
 
-void synchronise_count_master(int cpu)
+#define MAX_LOOPS	1000
+
+void synchronise_count_master(void *unused)
 {
-	int i;
 	unsigned long flags;
+	long delta;
+	int i;
 
-	pr_info("Synchronize counters for CPU %u: ", cpu);
+	if (atomic_read(&sync_stage) != STAGE_START)
+		BUG();
 
 	local_irq_save(flags);
 
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronised and
-	 * the master and slaves each set their cycle counters to a known
-	 * value all at once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		/* slaves loop on '!= 2' */
-		while (atomic_read(&count_count_start) != 1)
-			mb();
-		atomic_set(&count_count_stop, 0);
-		smp_wmb();
+	cur_count = read_c0_count();
+	smp_wmb();
+	atomic_inc(&sync_stage); /* inc to STAGE_MASTER_READY */
 
-		/* Let the slave writes its count register */
-		atomic_inc(&count_count_start);
-
-		/* Count will be initialised to current timer */
-		if (i == 1)
-			initcount = read_c0_count();
-
-		/*
-		 * Everyone initialises count in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_c0_count(initcount);
-
-		/*
-		 * Wait for slave to leave the synchronization point:
-		 */
-		while (atomic_read(&count_count_stop) != 1)
-			mb();
-		atomic_set(&count_count_start, 0);
+	for (i = 0; i < MAX_LOOPS; i++) {
+		cur_count = read_c0_count();
 		smp_wmb();
-		atomic_inc(&count_count_stop);
+		if (atomic_read(&sync_stage) == STAGE_SLAVE_SYNCED)
+			break;
 	}
-	/* Arrange for an interrupt in a short while */
-	write_c0_compare(read_c0_count() + COUNTON);
+
+	delta = read_c0_count() - fini_count;
 
 	local_irq_restore(flags);
 
-	/*
-	 * i386 code reported the skew here, but the
-	 * count registers were almost certainly out of sync
-	 * so no point in alarming people
-	 */
-	pr_cont("done.\n");
+	if (i == MAX_LOOPS)
+		pr_err("sync-r4k: Master: synchronise timeout\n");
+	else
+		pr_info("sync-r4k: Master: synchronise succeed, maximum delta: %ld\n", delta);
+
+	return;
 }
 
 void synchronise_count_slave(int cpu)
 {
 	int i;
 	unsigned long flags;
+	call_single_data_t csd;
 
-	local_irq_save(flags);
+	raw_spin_lock(&sync_r4k_lock);
 
-	/*
-	 * Not every cpu is online at the time this gets called,
-	 * so we first wait for the master to say everyone is ready
-	 */
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&count_count_start);
-		while (atomic_read(&count_count_start) != 2)
-			mb();
-
-		/*
-		 * Everyone initialises count in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_c0_count(initcount);
-
-		atomic_inc(&count_count_stop);
-		while (atomic_read(&count_count_stop) != 2)
-			mb();
+	/* Let variables get attention from cache */
+	for (i = 0; i < MAX_LOOPS; i++) {
+		cur_count++;
+		fini_count += cur_count;
+		cur_count += fini_count;
 	}
-	/* Arrange for an interrupt in a short while */
-	write_c0_compare(read_c0_count() + COUNTON);
+
+	atomic_set(&sync_stage, STAGE_START);
+	csd.func = synchronise_count_master;
+
+	/* Master count is always CPU0 */
+	if (smp_call_function_single_async(0, &csd)) {
+		pr_err("sync-r4k: Salve: Failed to call master\n");
+		raw_spin_unlock(&sync_r4k_lock);
+		return;
+	}
+
+	local_irq_save(flags);
+
+	/* Wait until master ready */
+	while (atomic_read(&sync_stage) != STAGE_MASTER_READY)
+		cpu_relax();
+
+	write_c0_count(cur_count);
+	fini_count = read_c0_count();
+	smp_wmb();
+	atomic_inc(&sync_stage); /* inc to STAGE_SLAVE_SYNCED */
 
 	local_irq_restore(flags);
+
+	raw_spin_unlock(&sync_r4k_lock);
 }
-#undef NR_LOOPS