[RFC,v3,4/4] x86/time: re-arrange struct calibration_rendezvous

Message ID 56d70757-a887-6824-18f4-93b1f244e44b@suse.com (mailing list archive)
State Superseded
Series x86/time: calibration rendezvous adjustments

Commit Message

Jan Beulich Feb. 9, 2021, 12:57 p.m. UTC
To reduce latency on time_calibration_tsc_rendezvous()'s last loop
iteration, separate the fields written on the last iteration far enough
from the crucial field read by all CPUs on that iteration that they end
up in distinct cache lines. Prefetch this field on an earlier iteration.
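
For illustration, the same layout idea in stand-alone C11 (a stripped-down
hypothetical sketch, not the Xen code: the cpumask and the conditional
alignment of the actual patch are omitted, alignas() stands in for Xen's
__aligned(), atomic_int/int64_t for atomic_t/s_time_t, and the 64-byte
line size is assumed):

    #include <stdalign.h>
    #include <stdatomic.h>
    #include <stdint.h>

    #define CACHE_LINE 64 /* assumed line size; Xen uses SMP_CACHE_BYTES */

    struct rendezvous {
        atomic_int semaphore; /* written by all CPUs on every iteration */
        int64_t master_stime; /* written by CPU0 on the last iteration */
        /*
         * Read by all CPUs on the last iteration: forcing it onto its
         * own cache line means the writes above never invalidate it.
         */
        alignas(CACHE_LINE) uint64_t master_tsc_stamp;
    };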

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
---
While readability would likely suffer, we may want to consider avoiding
the prefetch on at least the first two iterations (where the field still
gets, or may get, written to by CPU0). This could, e.g., be:

    switch ( i )
    {
    case 0:
        write_tsc(r->master_tsc_stamp);
        break;
    case 1:
        prefetch(&r->master_tsc_stamp);
        break;
    }

Of course it would also be nice to avoid the pretty likely branch
misprediction on the last iteration. But with the static prediction
hints having been rather short-lived in the architecture, I don't see
any good means to do so.
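
To see when the conditional __aligned() in the patch below actually pads,
plug in representative (assumed) numbers for a 64-bit build with
NR_CPUS = 256 and SMP_CACHE_BYTES = 64:

    BITS_TO_LONGS(256) * sizeof(long)  /*  4 * 8 = 32 bytes of cpumask */
        + sizeof(atomic_t)             /* +  4                         */
        + sizeof(s_time_t)             /* +  8  => 44                  */

44 < 64, so master_tsc_stamp would otherwise share a cache line with
semaphore and master_stime, and __aligned(SMP_CACHE_BYTES) is applied.
With, say, NR_CPUS = 1024 the cpumask alone is 128 bytes, the sum exceeds
the line size, and the expression degenerates to __aligned(1), i.e. no
extra padding.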

Comments

Jan Beulich Feb. 24, 2021, 9:29 a.m. UTC | #1
On 09.02.2021 13:57, Jan Beulich wrote:
> To reduce latency on time_calibration_tsc_rendezvous()'s last loop
> iteration, separate the fields written on the last iteration far enough
> from the crucial field read by all CPUs on that iteration that they end
> up in distinct cache lines. Prefetch this field on an earlier iteration.

I've now measured the effects of this, at least to some degree. On my
single-socket 18-core Skylake system this reduces the average loop
exit time on CPU0 (measured from the TSC write on the last iteration
until after the main loop) from around 32k cycles to around 28k (albeit
the values measured on separate runs vary quite significantly).

About the same effect (maybe a little less, but within error bounds)
can be had without any re-arrangement of the struct's layout, by
simply reading r->master_tsc_stamp into a local variable at the end
of each loop iteration. I'll make v4 use this less convoluted model
instead.
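
In sketch form (illustrative only, not necessarily the exact v4 code;
the local variable is made up, and the count-down loop bounds are assumed
from the existing function):

    uint64_t tsc = 0;

    for ( i = 4; i >= 0; i-- )
    {
        /* ... existing rendezvous steps for this iteration ... */

        if ( i == 0 )
            write_tsc(tsc); /* value read at the end of the previous pass */

        /*
         * Re-reading at the end of every iteration keeps the cache line
         * holding master_tsc_stamp in a suitable state locally, with no
         * struct re-arrangement and no explicit prefetch.
         */
        tsc = r->master_tsc_stamp;
    }

An ordinary read serves the same line-warming purpose as the explicit
prefetch(), and it additionally lets the final write_tsc() use a local
value rather than re-reading shared memory on the critical iteration.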

Jan

Patch

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1655,10 +1655,20 @@ static void tsc_check_reliability(void)
  * All CPUS snapshot their local TSC and extrapolation of system time.
  */
 struct calibration_rendezvous {
-    cpumask_t cpu_calibration_map;
     atomic_t semaphore;
     s_time_t master_stime;
-    uint64_t master_tsc_stamp, max_tsc_stamp;
+    cpumask_t cpu_calibration_map;
+    /*
+     * All CPUs want to read master_tsc_stamp on the last iteration.  If
+     * cpu_calibration_map isn't large enough to push the field into a cache
+     * line different from the one used by semaphore (written by all CPUs on
+     * every iteration) and master_stime (written by CPU0 on the last
+     * iteration), align the field suitably.
+     */
+    uint64_t __aligned(BITS_TO_LONGS(NR_CPUS) * sizeof(long) +
+                       sizeof(atomic_t) + sizeof(s_time_t) < SMP_CACHE_BYTES
+                       ? SMP_CACHE_BYTES : 1) master_tsc_stamp;
+    uint64_t max_tsc_stamp;
 };
 
 static void
@@ -1709,6 +1719,8 @@ static void time_calibration_tsc_rendezv
 
             if ( i == 0 )
                 write_tsc(r->master_tsc_stamp);
+            else
+                prefetch(&r->master_tsc_stamp);
 
             while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
                 cpu_relax();
@@ -1731,6 +1743,8 @@ static void time_calibration_tsc_rendezv
 
             if ( i == 0 )
                 write_tsc(r->master_tsc_stamp);
+            else
+                prefetch(&r->master_tsc_stamp);
 
             atomic_inc(&r->semaphore);
             while ( atomic_read(&r->semaphore) > total_cpus )