Message ID | 1477288523-10819-4-git-send-email-vijay.kilari@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 24/10/2016 07:55, vijay.kilari@gmail.com wrote: > From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > > Thunderx pass2 chip requires explicit prefetch > instruction to give prefetch hint. > > To speed up live migration on Thunderx platform, > prefetch instruction is added in zero buffer check > function.The below results show live migration time improvement > with prefetch instruction. VM with 4 VCPUs, 8GB RAM is migrated. > > Without prefetch total migration time is ~13 seconds > adding prefetch total migration time is 9.5 seconds > > Code for decoding cache size is taken from Richard's > patch > > Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > --- > util/bufferiszero.c | 37 ++++++++++++++++++++++++++++++++++++- > 1 file changed, 36 insertions(+), 1 deletion(-) > > diff --git a/util/bufferiszero.c b/util/bufferiszero.c > index 421d945..f50b8df 100644 > --- a/util/bufferiszero.c > +++ b/util/bufferiszero.c > @@ -25,6 +25,10 @@ > #include "qemu-common.h" > #include "qemu/cutils.h" > #include "qemu/bswap.h" > +#include <math.h> > + > +static uint32_t cache_line_factor = 1; Let's express this in bytes, with a default value of 64 (so rename cache_line_factor->cache_line_size). > +static uint32_t prefetch_line_dist = 1; > > static bool > buffer_zero_int(const void *buf, size_t len) > @@ -49,7 +53,8 @@ buffer_zero_int(const void *buf, size_t len) > const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); > > for (; p + 8 <= e; p += 8) { > - __builtin_prefetch(p + 8, 0, 0); > + __builtin_prefetch(p + > + (8 * cache_line_factor * prefetch_line_dist), 0, 0); You should precompute cache_line_bytes * prefetch_line_dist / sizeof(uint64_t) in a single variable, prefetch_distance. This saves the effort of loading global variables repeatedly. Then you can do __builtin_prefetch(p + prefetch_distance, 0, 0); > if (t) { > return false; > } > @@ -293,6 +298,30 @@ bool test_buffer_is_zero_next_accel(void) > } > #endif > > +#if defined(__aarch64__) > +#include "qemu/aarch64-cpuid.h" > + > +static void __attribute__((constructor)) aarch64_init_cache_size(void) > +{ > + uint64_t t; > + > + /* Use the DZP block size as a proxy for the cacheline size, > + since the later is not available to userspace. This seems > + to work in practice for existing implementations. */ > + asm("mrs %0, dczid_el0" : "=r"(t)); > + if (pow(2, (t & 0xf)) * 4 >= 128) { > + cache_line_factor = 2; > + } else { > + cache_line_factor = 1; > + } > + > + get_aarch64_cpu_id(); > + if (is_thunderx_pass2_cpu()) { > + prefetch_line_dist = 3; > + } > +} > +#endif > + > /* > * Checks if a buffer is all zeroes > */ > @@ -305,6 +334,12 @@ bool buffer_is_zero(const void *buf, size_t len) > /* Fetch the beginning of the buffer while we select the accelerator. */ > __builtin_prefetch(buf, 0, 0); > > +#if defined(__aarch64__) > + if (is_thunderx_pass2_cpu()) { > + __builtin_prefetch(buf + 16, 0, 0); > + __builtin_prefetch(buf + 32, 0, 0); This should not be ThunderX or aarch64 specific; it should be a loop like prefetch_distance_bytes = prefetch_line_dist * cache_line_size; for (i = 0; i < prefetch_distance_bytes; i += cache_line_size) __builtin_prefetch(buf + i, 0, 0); In the default case, cache_line_bytes == prefetch_distance_bytes (both are 64) and you will get the same behavior as the existing __builtin_prefetch(buf, 0, 0); Thanks, Paolo > + } > +#endif > /* Use an optimized zero check if possible. Note that this also > includes a check for an unrolled loop over 64-bit integers. */ > return select_accel_fn(buf, len); >
On 10/24/2016 04:25 AM, Paolo Bonzini wrote: >> > for (; p + 8 <= e; p += 8) { >> > - __builtin_prefetch(p + 8, 0, 0); >> > + __builtin_prefetch(p + >> > + (8 * cache_line_factor * prefetch_line_dist), 0, 0); > You should precompute cache_line_bytes * prefetch_line_dist / > sizeof(uint64_t) in a single variable, prefetch_distance. This saves > the effort of loading global variables repeatedly. Then you can do > > __builtin_prefetch(p + prefetch_distance, 0, 0); > Let's not complicate things by dividing by sizeof(uint64_t). It's less complicated to avoid both that and the implied multiply. __builtin_prefetch((char *)p + prefetch_distance, 0, 0) r~
diff --git a/util/bufferiszero.c b/util/bufferiszero.c index 421d945..f50b8df 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -25,6 +25,10 @@ #include "qemu-common.h" #include "qemu/cutils.h" #include "qemu/bswap.h" +#include <math.h> + +static uint32_t cache_line_factor = 1; +static uint32_t prefetch_line_dist = 1; static bool buffer_zero_int(const void *buf, size_t len) @@ -49,7 +53,8 @@ buffer_zero_int(const void *buf, size_t len) const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); for (; p + 8 <= e; p += 8) { - __builtin_prefetch(p + 8, 0, 0); + __builtin_prefetch(p + + (8 * cache_line_factor * prefetch_line_dist), 0, 0); if (t) { return false; } @@ -293,6 +298,30 @@ bool test_buffer_is_zero_next_accel(void) } #endif +#if defined(__aarch64__) +#include "qemu/aarch64-cpuid.h" + +static void __attribute__((constructor)) aarch64_init_cache_size(void) +{ + uint64_t t; + + /* Use the DZP block size as a proxy for the cacheline size, + since the later is not available to userspace. This seems + to work in practice for existing implementations. */ + asm("mrs %0, dczid_el0" : "=r"(t)); + if (pow(2, (t & 0xf)) * 4 >= 128) { + cache_line_factor = 2; + } else { + cache_line_factor = 1; + } + + get_aarch64_cpu_id(); + if (is_thunderx_pass2_cpu()) { + prefetch_line_dist = 3; + } +} +#endif + /* * Checks if a buffer is all zeroes */ @@ -305,6 +334,12 @@ bool buffer_is_zero(const void *buf, size_t len) /* Fetch the beginning of the buffer while we select the accelerator. */ __builtin_prefetch(buf, 0, 0); +#if defined(__aarch64__) + if (is_thunderx_pass2_cpu()) { + __builtin_prefetch(buf + 16, 0, 0); + __builtin_prefetch(buf + 32, 0, 0); + } +#endif /* Use an optimized zero check if possible. Note that this also includes a check for an unrolled loop over 64-bit integers. */ return select_accel_fn(buf, len);