diff mbox series

[v3,3/6] util/bufferiszero: remove AVX512 variant

Message ID 20240206204809.9859-4-amonakov@ispras.ru (mailing list archive)
State New, archived
Headers show
Series Optimize buffer_is_zero | expand

Commit Message

Alexander Monakov Feb. 6, 2024, 8:48 p.m. UTC
Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
routines are invoked much more rarely in normal use when most buffers
are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
frequency and voltage transition periods during which the CPU operates
at reduced performance, as described in
https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html

Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
---
 util/bufferiszero.c | 36 ++----------------------------------
 1 file changed, 2 insertions(+), 34 deletions(-)

Comments

Richard Henderson Feb. 6, 2024, 10:28 p.m. UTC | #1
On 2/7/24 06:48, Alexander Monakov wrote:
> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
> routines are invoked much more rarely in normal use when most buffers
> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
> frequency and voltage transition periods during which the CPU operates
> at reduced performance, as described in
> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html
> 
> Signed-off-by: Mikhail Romanov<mmromanov@ispras.ru>
> Signed-off-by: Alexander Monakov<amonakov@ispras.ru>
> ---
>   util/bufferiszero.c | 36 ++----------------------------------
>   1 file changed, 2 insertions(+), 34 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

Although I think this patch should be ordered second.


r~
Elena Ufimtseva Feb. 6, 2024, 11:56 p.m. UTC | #2
Hello Alexander

On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru>
wrote:

> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
> routines are invoked much more rarely in normal use when most buffers
> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
> frequency and voltage transition periods during which the CPU operates
> at reduced performance, as described in
> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html


I would like to point out that the frequency scaling is not currently an
issue on AMD Zen4 Genoa CPUs, for example.
And microcode architecture description here:
https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf
However, the CPU frequency downscaling mentioned in the above document is
only in relation to floating-point operations.
But from other online discussions I gather that the data path for the
integer registers in Zen4 is also 256 bits, which makes it possible to avoid
frequency downscaling for FP and heavy instructions.
And looking at the optimizations for AVX2 in your other patch, would
unrolling the loop for AVX512 ops benefit from the speedup, given that the
data path has the same width?
If the frequency downscaling is not observed on some of the CPUs, can
AVX512 be maintained and used selectively for some
of the CPUs?

Thank you!


>
>
> Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
> Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  util/bufferiszero.c | 36 ++----------------------------------
>  1 file changed, 2 insertions(+), 34 deletions(-)
>
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 01050694a6..c037d11d04 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len)
>      }
>  }
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) ||
> defined(__SSE2__)
> +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
>  #include <immintrin.h>
>
>  /* Note that each of these vectorized functions require len >= 64.  */
> @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len)
>  }
>  #endif /* CONFIG_AVX2_OPT */
>
> -#ifdef CONFIG_AVX512F_OPT
> -static bool __attribute__((target("avx512f")))
> -buffer_zero_avx512(const void *buf, size_t len)
> -{
> -    /* Begin with an unaligned head of 64 bytes.  */
> -    __m512i t = _mm512_loadu_si512(buf);
> -    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> -    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
> -
> -    /* Loop over 64-byte aligned blocks of 256.  */
> -    while (p <= e) {
> -        __builtin_prefetch(p);
> -        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> -            return false;
> -        }
> -        t = p[-4] | p[-3] | p[-2] | p[-1];
> -        p += 4;
> -    }
> -
> -    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 1 * 64);
> -
> -    return !_mm512_test_epi64_mask(t, t);
> -
> -}
> -#endif /* CONFIG_AVX512F_OPT */
> -
>  static unsigned __attribute__((noinline))
>  select_accel_cpuinfo(unsigned info)
>  {
> @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info)
>          unsigned bit;
>          bool (*fn)(const void *, size_t);
>      } all[] = {
> -#ifdef CONFIG_AVX512F_OPT
> -        { CPUINFO_AVX512F, buffer_zero_avx512 },
> -#endif
>  #ifdef CONFIG_AVX2_OPT
>          { CPUINFO_AVX2,    buffer_zero_avx2 },
>  #endif
> @@ -191,7 +159,7 @@ static unsigned used_accel
>      = 0;
>  #endif
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
> +#if defined(CONFIG_AVX2_OPT)
>  static void __attribute__((constructor)) init_accel(void)
>  {
>      used_accel = select_accel_cpuinfo(cpuinfo_init());
> --
> 2.32.0
>
>
>
Alexander Monakov Feb. 7, 2024, 6:29 a.m. UTC | #3
On Tue, 6 Feb 2024, Elena Ufimtseva wrote:

> Hello Alexander
> 
> On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru>
> wrote:
> 
> > Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
> > routines are invoked much more rarely in normal use when most buffers
> > are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
> > frequency and voltage transition periods during which the CPU operates
> > at reduced performance, as described in
> > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html
> 
> 
> I would like to point out that the frequency scaling is not currently an
> issue on AMD Zen4 Genoa CPUs, for example.
> And microcode architecture description here:
> https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf
> However, the CPU frequency downscaling mentioned in the above document is
> only in relation to floating-point operations.
> But from other online discussions I gather that the data path for the
> integer registers in Zen4 is also 256 bits, which makes it possible to avoid
> frequency downscaling for FP and heavy instructions.

Yes, that's correct: in particular, on Zen 4 512-bit vector loads occupy load
ports for two consecutive cycles, so from load throughput perspective there's
no difference between 256-bit vectors and 512-bit vectors. Generally AVX-512
still has benefits on Zen 4 since it's a richer instruction set (it also reduces
pressure in the CPU front-end and is more power-efficient), but as the new AVX2
buffer_is_zero is saturating load ports I would expect that AVX512 can exceed
its performance only by a small margin if at all, not anywhere close to 2x.

> And looking at the optimizations for AVX2 in your other patch, would
> unrolling the loop for AVX512 ops benefit from the speedup, given that the
> data path has the same width?

No, 256-bit datapath on Zen 4 means that it's easier to saturate it with
512-bit loads than with 256-bit loads, so an AVX512 loop is roughly comparable
to a similar AVX-256 loop unrolled twice.

Aside: AVX512 variant needs a little more thought to use VPTERNLOG properly.

> If the frequency downscaling is not observed on some of the CPUs, can
> AVX512 be maintained and used selectively for some
> of the CPUs?

Please note that a properly optimized buffer_is_zero is limited by load
throughput, not ALUs. On Zen 4 AVX2 is sufficient to saturate L1 cache load
bandwidth in buffer_is_zero. For data outside of L1 cache, the benefits
of AVX-512 diminish more and more.

I don't have Zen 4 based machines at hand to see if AVX-512 is beneficial
there for buffer_is_zero for reasons like reaching higher turbo clocks or
higher memory parallelism.

Finally, let's consider a somewhat broader perspective. Let's suppose
buffer_is_zero takes 50% of overall application runtime, and 9 out of
10 buffers are found to be non-zero in the inline wrapper that samples
three bytes. Then the vectorized routine takes about 5% of application
time, and speeding it up even by 20% only shaves off 1% from overall
execution time.

Alexander
Joao Martins Feb. 7, 2024, 10:38 a.m. UTC | #4
On 07/02/2024 06:29, Alexander Monakov wrote:
> On Tue, 6 Feb 2024, Elena Ufimtseva wrote:
>> Hello Alexander
>>
>> On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru>
>> wrote:
>>
>>> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
>>> routines are invoked much more rarely in normal use when most buffers
>>> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
>>> frequency and voltage transition periods during which the CPU operates
>>> at reduced performance, as described in
>>> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html
>>
>> I would like to point out that the frequency scaling is not currently an
>> issue on AMD Zen4 Genoa CPUs, for example.
>> And microcode architecture description here:
>> https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf
>> However, the CPU frequency downscaling mentioned in the above document is
>> only in relation to floating-point operations.
>> But from other online discussions I gather that the data path for the
>> integer registers in Zen4 is also 256 bits, which makes it possible to avoid
>> frequency downscaling for FP and heavy instructions.
> 
> Yes, that's correct: in particular, on Zen 4 512-bit vector loads occupy load
> ports for two consecutive cycles, so from load throughput perspective there's
> no difference between 256-bit vectors and 512-bit vectors. Generally AVX-512
> still has benefits on Zen 4 since it's a richer instruction set (it also reduces
> pressure in the CPU front-end and is more power-efficient), but as the new AVX2
> buffer_is_zero is saturating load ports I would expect that AVX512 can exceed
> its performance only by a small margin if at all, not anywhere close to 2x.
> 
>> And looking at the optimizations for AVX2 in your other patch, would
>> unrolling the loop for AVX512 ops benefit from the speedup, given that the
>> data path has the same width?
> 
> No, 256-bit datapath on Zen 4 means that it's easier to saturate it with
> 512-bit loads than with 256-bit loads, so an AVX512 loop is roughly comparable
> to a similar AVX-256 loop unrolled twice.
> 
> Aside: AVX512 variant needs a little more thought to use VPTERNLOG properly.
> 
>> If the frequency downscaling is not observed on some of the CPUs, can
>> AVX512 be maintained and used selectively for some
>> of the CPUs?
> 
> Please note that a properly optimized buffer_is_zero is limited by load
> throughput, not ALUs. On Zen 4 AVX2 is sufficient to saturate L1 cache load
> bandwidth in buffer_is_zero. For data outside of L1 cache, the benefits
> of AVX-512 diminish more and more.
> 
> I don't have Zen 4 based machines at hand to see if AVX-512 is beneficial
> there for buffer_is_zero for reasons like reaching higher turbo clocks or
> higher memory parallelism.
> 

FWIW, this frequency downscaling problem that was more prominent in Skylake is
/supposedly/ no longer observed in Intel Sapphire Rapids either:

https://www.phoronix.com/review/intel-sapphirerapids-avx512/8
diff mbox series

Patch

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 01050694a6..c037d11d04 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -64,7 +64,7 @@  buffer_is_zero_len_4_plus(const void *buf, size_t len)
     }
 }
 
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
+#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
 #include <immintrin.h>
 
 /* Note that each of these vectorized functions require len >= 64.  */
@@ -128,35 +128,6 @@  buffer_zero_avx2(const void *buf, size_t len)
 }
 #endif /* CONFIG_AVX2_OPT */
 
-#ifdef CONFIG_AVX512F_OPT
-static bool __attribute__((target("avx512f")))
-buffer_zero_avx512(const void *buf, size_t len)
-{
-    /* Begin with an unaligned head of 64 bytes.  */
-    __m512i t = _mm512_loadu_si512(buf);
-    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
-    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
-
-    /* Loop over 64-byte aligned blocks of 256.  */
-    while (p <= e) {
-        __builtin_prefetch(p);
-        if (unlikely(_mm512_test_epi64_mask(t, t))) {
-            return false;
-        }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
-
-    t |= _mm512_loadu_si512(buf + len - 4 * 64);
-    t |= _mm512_loadu_si512(buf + len - 3 * 64);
-    t |= _mm512_loadu_si512(buf + len - 2 * 64);
-    t |= _mm512_loadu_si512(buf + len - 1 * 64);
-
-    return !_mm512_test_epi64_mask(t, t);
-
-}
-#endif /* CONFIG_AVX512F_OPT */
-
 static unsigned __attribute__((noinline))
 select_accel_cpuinfo(unsigned info)
 {
@@ -165,9 +136,6 @@  select_accel_cpuinfo(unsigned info)
         unsigned bit;
         bool (*fn)(const void *, size_t);
     } all[] = {
-#ifdef CONFIG_AVX512F_OPT
-        { CPUINFO_AVX512F, buffer_zero_avx512 },
-#endif
 #ifdef CONFIG_AVX2_OPT
         { CPUINFO_AVX2,    buffer_zero_avx2 },
 #endif
@@ -191,7 +159,7 @@  static unsigned used_accel
     = 0;
 #endif
 
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#if defined(CONFIG_AVX2_OPT)
 static void __attribute__((constructor)) init_accel(void)
 {
     used_accel = select_accel_cpuinfo(cpuinfo_init());