diff mbox series

[v3,6/6] util/bufferiszero: improve scalar variant

Message ID 20240206204809.9859-7-amonakov@ispras.ru (mailing list archive)
State New, archived
Headers show
Series Optimize buffer_is_zero | expand

Commit Message

Alexander Monakov Feb. 6, 2024, 8:48 p.m. UTC
Take into account that the inline wrapper ensures len >= 4.

Use __attribute__((may_alias)) for accesses via non-char pointers.

Avoid using out-of-bounds pointers in loop boundary conditions by
reformulating the 'for' loop as 'if (...) do { ... } while (...)'.

Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
---
 util/bufferiszero.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

Comments

Richard Henderson Feb. 6, 2024, 10:34 p.m. UTC | #1
On 2/7/24 06:48, Alexander Monakov wrote:
> -        /* Otherwise, use the unaligned memory access functions to
> -           handle the beginning and end of the buffer, with a couple
> +        /* Use unaligned memory access functions to handle
> +           the beginning and end of the buffer, with a couple
>              of loops handling the middle aligned section.  */
> -        uint64_t t = ldq_he_p(buf);
> -        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
> -        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
> +        uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
> +        typedef uint64_t uint64_a __attribute__((may_alias));
> +        const uint64_a *p = (void *)(((uintptr_t)buf + 8) & -8);
> +        const uint64_a *e = (void *)(((uintptr_t)buf + len - 1) & -8);

You appear to be optimizing this routine for x86, which is not the primary consumer.

This is going to perform very poorly on hosts that do not support unaligned accesses (e.g. 
Sparc and some RISC-V).


r~
Richard Henderson Feb. 6, 2024, 10:46 p.m. UTC | #2
On 2/7/24 08:34, Richard Henderson wrote:
> On 2/7/24 06:48, Alexander Monakov wrote:
>> -        /* Otherwise, use the unaligned memory access functions to
>> -           handle the beginning and end of the buffer, with a couple
>> +        /* Use unaligned memory access functions to handle
>> +           the beginning and end of the buffer, with a couple
>>              of loops handling the middle aligned section.  */
>> -        uint64_t t = ldq_he_p(buf);
>> -        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
>> -        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
>> +        uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
>> +        typedef uint64_t uint64_a __attribute__((may_alias));
>> +        const uint64_a *p = (void *)(((uintptr_t)buf + 8) & -8);
>> +        const uint64_a *e = (void *)(((uintptr_t)buf + len - 1) & -8);
> 
> You appear to be optimizing this routine for x86, which is not the primary consumer.
> 
> This is going to perform very poorly on hosts that do not support unaligned accesses (e.g. 
> Sparc and some RISC-V).

I beg your pardon, I mis-read this.  You're only replacing the byte loops, which will be 
more-or-less identical, modulo unrolling, when unaligned access is not supported.  But 
will be much improved if some unaligned access support is available (e.g. MIPS LWL+LWR).

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
diff mbox series

Patch

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index d752edd8cc..1f4cbfaea4 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -29,35 +29,27 @@ 
 bool
 buffer_is_zero_len_4_plus(const void *buf, size_t len)
 {
-    if (unlikely(len < 8)) {
-        /* For a very small buffer, simply accumulate all the bytes.  */
-        const unsigned char *p = buf;
-        const unsigned char *e = buf + len;
-        unsigned char t = 0;
-
-        do {
-            t |= *p++;
-        } while (p < e);
-
-        return t == 0;
+    if (unlikely(len <= 8)) {
+        /* Our caller ensures len >= 4.  */
+        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
     } else {
-        /* Otherwise, use the unaligned memory access functions to
-           handle the beginning and end of the buffer, with a couple
+        /* Use unaligned memory access functions to handle
+           the beginning and end of the buffer, with a couple
            of loops handling the middle aligned section.  */
-        uint64_t t = ldq_he_p(buf);
-        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
-        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
+        uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
+        typedef uint64_t uint64_a __attribute__((may_alias));
+        const uint64_a *p = (void *)(((uintptr_t)buf + 8) & -8);
+        const uint64_a *e = (void *)(((uintptr_t)buf + len - 1) & -8);
 
-        for (; p + 8 <= e; p += 8) {
+        if (e - p >= 8) do {
             if (t) {
                 return false;
             }
             t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
-        }
+        } while ((p += 8) <= e - 8);
         while (p < e) {
             t |= *p++;
         }
-        t |= ldq_he_p(buf + len - 8);
 
         return t == 0;
     }