Message ID | 20240206204809.9859-4-amonakov@ispras.ru (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Optimize buffer_is_zero | expand |
On 2/7/24 06:48, Alexander Monakov wrote: > Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD > routines are invoked much more rarely in normal use when most buffers > are non-zero. This makes use of AVX512 unprofitable, as it incurs extra > frequency and voltage transition periods during which the CPU operates > at reduced performance, as described in > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html > > Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> > Signed-off-by: Alexander Monakov <amonakov@ispras.ru> > --- > util/bufferiszero.c | 36 ++---------------------------------- > 1 file changed, 2 insertions(+), 34 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Although I think this patch should be ordered second. r~
Hello Alexander On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru> wrote: > Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD > routines are invoked much more rarely in normal use when most buffers > are non-zero. This makes use of AVX512 unprofitable, as it incurs extra > frequency and voltage transition periods during which the CPU operates > at reduced performance, as described in > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html I would like to point out that the frequency scaling is not currently an issue on AMD Zen4 Genoa CPUs, for example. And microcode architecture description here: https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf Although, the cpu frequency downscaling mentioned in the above document is only in relation to floating point operations. But from other online discussions I gather that the data path for the integer registers in Zen4 is also 256 bits and it allows avoiding frequency downscaling for FP and heavy instructions. And looking at the optimizations for AVX2 in your other patch, would unrolling the loop for AVX512 ops benefit from the speedup given that the data path has the same width? If the frequency downscaling is not observed on some of the CPUs, can AVX512 be maintained and used selectively for some of the CPUs? Thank you! 
> > > Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> > Signed-off-by: Alexander Monakov <amonakov@ispras.ru> > --- > util/bufferiszero.c | 36 ++---------------------------------- > 1 file changed, 2 insertions(+), 34 deletions(-) > > diff --git a/util/bufferiszero.c b/util/bufferiszero.c > index 01050694a6..c037d11d04 100644 > --- a/util/bufferiszero.c > +++ b/util/bufferiszero.c > @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len) > } > } > > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || > defined(__SSE2__) > +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) > #include <immintrin.h> > > /* Note that each of these vectorized functions require len >= 64. */ > @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len) > } > #endif /* CONFIG_AVX2_OPT */ > > -#ifdef CONFIG_AVX512F_OPT > -static bool __attribute__((target("avx512f"))) > -buffer_zero_avx512(const void *buf, size_t len) > -{ > - /* Begin with an unaligned head of 64 bytes. */ > - __m512i t = _mm512_loadu_si512(buf); > - __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64); > - __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64); > - > - /* Loop over 64-byte aligned blocks of 256. 
*/ > - while (p <= e) { > - __builtin_prefetch(p); > - if (unlikely(_mm512_test_epi64_mask(t, t))) { > - return false; > - } > - t = p[-4] | p[-3] | p[-2] | p[-1]; > - p += 4; > - } > - > - t |= _mm512_loadu_si512(buf + len - 4 * 64); > - t |= _mm512_loadu_si512(buf + len - 3 * 64); > - t |= _mm512_loadu_si512(buf + len - 2 * 64); > - t |= _mm512_loadu_si512(buf + len - 1 * 64); > - > - return !_mm512_test_epi64_mask(t, t); > - > -} > -#endif /* CONFIG_AVX512F_OPT */ > - > static unsigned __attribute__((noinline)) > select_accel_cpuinfo(unsigned info) > { > @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info) > unsigned bit; > bool (*fn)(const void *, size_t); > } all[] = { > -#ifdef CONFIG_AVX512F_OPT > - { CPUINFO_AVX512F, buffer_zero_avx512 }, > -#endif > #ifdef CONFIG_AVX2_OPT > { CPUINFO_AVX2, buffer_zero_avx2 }, > #endif > @@ -191,7 +159,7 @@ static unsigned used_accel > = 0; > #endif > > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) > +#if defined(CONFIG_AVX2_OPT) > static void __attribute__((constructor)) init_accel(void) > { > used_accel = select_accel_cpuinfo(cpuinfo_init()); > -- > 2.32.0 > > >
On Tue, 6 Feb 2024, Elena Ufimtseva wrote: > Hello Alexander > > On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru> > wrote: > > > Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD > > routines are invoked much more rarely in normal use when most buffers > > are non-zero. This makes use of AVX512 unprofitable, as it incurs extra > > frequency and voltage transition periods during which the CPU operates > > at reduced performance, as described in > > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html > > > I would like to point out that the frequency scaling is not currently an > issue on AMD Zen4 Genoa CPUs, for example. > And microcode architecture description here: > https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf > Although, the cpu frequency downscaling mentioned in the above document is > only in relation to floating point operations. > But from other online discussions I gather that the data path for the > integer registers in Zen4 is also 256 bits and it allows to avoid > frequency downscaling for FP and heavy instructions. Yes, that's correct: in particular, on Zen 4 512-bit vector loads occupy load ports for two consecutive cycles, so from load throughput perspective there's no difference between 256-bit vectors and 512-bit vectors. Generally AVX-512 still has benefits on Zen 4 since it's a richer instruction set (it also reduces pressure in the CPU front-end and is more power-efficient), but as the new AVX2 buffer_is_zero is saturating load ports I would expect that AVX512 can exceed its performance only by a small margin if at all, not anywhere close to 2x. > And looking at the optimizations for AVX2 in your other patch, would > unrolling the loop for AVX512 ops benefit from the speedup taken that the > data path has the same width? 
No, 256-bit datapath on Zen 4 means that it's easier to saturate it with 512-bit loads than with 256-bit loads, so an AVX512 loop is roughly comparable to a similar AVX-256 loop unrolled twice. Aside: AVX512 variant needs a little more thought to use VPTERNLOG properly. > If the frequency downscaling is not observed on some of the CPUs, can > AVX512 be maintained and used selectively for some > of the CPUs? Please note that a properly optimized buffer_is_zero is limited by load throughput, not ALUs. On Zen 4 AVX2 is sufficient to saturate L1 cache load bandwidth in buffer_is_zero. For data outside of L1 cache, the benefits of AVX-512 diminish more and more. I don't have Zen 4 based machines at hand to see if AVX-512 is beneficial there for buffer_is_zero for reasons like reaching higher turbo clocks or higher memory parallelism. Finally, let's consider a somewhat broader perspective. Let's suppose buffer_is_zero takes 50% of overall application runtime, and 9 out of 10 buffers are found out to be non-zero in the inline wrapper that samples three bytes. Then the vectorized routine takes about 5% of application time, and speeding it up even by 20% only shaves off 1% from overall execution time. Alexander
On 07/02/2024 06:29, Alexander Monakov wrote: > On Tue, 6 Feb 2024, Elena Ufimtseva wrote: >> Hello Alexander >> >> On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amonakov@ispras.ru> >> wrote: >> >>> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD >>> routines are invoked much more rarely in normal use when most buffers >>> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra >>> frequency and voltage transition periods during which the CPU operates >>> at reduced performance, as described in >>> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html >> >> I would like to point out that the frequency scaling is not currently an >> issue on AMD Zen4 Genoa CPUs, for example. >> And microcode architecture description here: >> https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf >> Although, the cpu frequency downscaling mentioned in the above document is >> only in relation to floating point operations. >> But from other online discussions I gather that the data path for the >> integer registers in Zen4 is also 256 bits and it allows to avoid >> frequency downscaling for FP and heavy instructions. > > Yes, that's correct: in particular, on Zen 4 512-bit vector loads occupy load > ports for two consecutive cycles, so from load throughput perspective there's > no difference between 256-bit vectors and 512-bit vectors. Generally AVX-512 > still has benefits on Zen 4 since it's a richer instruction set (it also reduces > pressure in the CPU front-end and is more power-efficient), but as the new AVX2 > buffer_is_zero is saturating load ports I would expect that AVX512 can exceed > its performance only by a small margin if at all, not anywhere close to 2x. > >> And looking at the optimizations for AVX2 in your other patch, would >> unrolling the loop for AVX512 ops benefit from the speedup taken that the >> data path has the same width? 
> > No, 256-bit datapath on Zen 4 means that it's easier to saturate it with > 512-bit loads than with 256-bit loads, so an AVX512 loop is roughly comparable > to a similar AVX-256 loop unrolled twice. > > Aside: AVX512 variant needs a little more thought to use VPTERNLOG properly. > >> If the frequency downscaling is not observed on some of the CPUs, can >> AVX512 be maintained and used selectively for some >> of the CPUs? > > Please note that a properly optimized buffer_is_zero is limited by load > throughput, not ALUs. On Zen 4 AVX2 is sufficient to saturate L1 cache load > bandwidth in buffer_is_zero. For data outside of L1 cache, the benefits > of AVX-512 diminish more and more. > > I don't have Zen 4 based machines at hand to see if AVX-512 is beneficial > there for buffer_is_zero for reasons like reaching higher turbo clocks or > higher memory parallelism. > FWIW, this frequency downscaling problem that was more prominent in Skylake is /supposedly/ no longer observed in Intel Sapphire Rapids either: https://www.phoronix.com/review/intel-sapphirerapids-avx512/8
diff --git a/util/bufferiszero.c b/util/bufferiszero.c index 01050694a6..c037d11d04 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len) } } -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) #include <immintrin.h> /* Note that each of these vectorized functions require len >= 64. */ @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len) } #endif /* CONFIG_AVX2_OPT */ -#ifdef CONFIG_AVX512F_OPT -static bool __attribute__((target("avx512f"))) -buffer_zero_avx512(const void *buf, size_t len) -{ - /* Begin with an unaligned head of 64 bytes. */ - __m512i t = _mm512_loadu_si512(buf); - __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64); - __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64); - - /* Loop over 64-byte aligned blocks of 256. */ - while (p <= e) { - __builtin_prefetch(p); - if (unlikely(_mm512_test_epi64_mask(t, t))) { - return false; - } - t = p[-4] | p[-3] | p[-2] | p[-1]; - p += 4; - } - - t |= _mm512_loadu_si512(buf + len - 4 * 64); - t |= _mm512_loadu_si512(buf + len - 3 * 64); - t |= _mm512_loadu_si512(buf + len - 2 * 64); - t |= _mm512_loadu_si512(buf + len - 1 * 64); - - return !_mm512_test_epi64_mask(t, t); - -} -#endif /* CONFIG_AVX512F_OPT */ - static unsigned __attribute__((noinline)) select_accel_cpuinfo(unsigned info) { @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info) unsigned bit; bool (*fn)(const void *, size_t); } all[] = { -#ifdef CONFIG_AVX512F_OPT - { CPUINFO_AVX512F, buffer_zero_avx512 }, -#endif #ifdef CONFIG_AVX2_OPT { CPUINFO_AVX2, buffer_zero_avx2 }, #endif @@ -191,7 +159,7 @@ static unsigned used_accel = 0; #endif -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) +#if defined(CONFIG_AVX2_OPT) static void __attribute__((constructor)) init_accel(void) { used_accel = select_accel_cpuinfo(cpuinfo_init());