
[bpf-next] bpf: optimize hashmap lookups when key_size is divisible by 4

Message ID 20230401101050.358342-1-aspsk@isovalent.com (mailing list archive)
State Superseded
Delegated to: BPF
Series [bpf-next] bpf: optimize hashmap lookups when key_size is divisible by 4

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 30 this patch: 30
netdev/cc_maintainers warning 6 maintainers not CCed: song@kernel.org sdf@google.com haoluo@google.com yhs@fb.com kpsingh@kernel.org jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 18 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 30 this patch: 30
netdev/checkpatch warning WARNING: line length of 90 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline warning Was 1 now: 1
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix

Commit Message

Anton Protopopov April 1, 2023, 10:10 a.m. UTC
The BPF hashmap uses the jhash() hash function. There is an optimized version
of this hash function, jhash2(), which can be used when the key size is a
multiple of 4. Apply this optimization to the hashmap in the same way as it is
done in the bloom filter map.
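
The two helpers in include/linux/jhash.h differ in how they consume the key:
jhash2() reads it as whole u32 words and skips the byte-wise tail handling,
which is why the key size must be a multiple of 4. Signatures for reference
(descriptions paraphrased):

    /* Hash an arbitrary sequence of bytes.
     * @key: the key, @length: its size in bytes, @initval: seed
     */
    static inline u32 jhash(const void *key, u32 length, u32 initval);

    /* Hash an array of u32 words.
     * @k: the key, @length: its size in u32 words, @initval: seed
     */
    static inline u32 jhash2(const u32 *k, u32 length, u32 initval);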

In practice the optimization is only noticeable for smaller key sizes, which
is nevertheless sufficient for many applications. Example measurements are
listed in the following table (a hashmap with 65536 elements was used):

    --------------------------------------------------------------------
    | key_size | fullness | lookups /sec | lookups (opt) /sec |   gain |
    --------------------------------------------------------------------
    |        4 |      25% |      42.990M |            46.000M |   7.0% |
    |        4 |      50% |      37.910M |            39.094M |   3.1% |
    |        4 |      75% |      34.486M |            36.124M |   4.7% |
    |        4 |     100% |      31.760M |            32.719M |   3.0% |
    --------------------------------------------------------------------
    |        8 |      25% |      43.855M |            49.626M |  13.2% |
    |        8 |      50% |      38.328M |            42.152M |  10.0% |
    |        8 |      75% |      34.483M |            38.088M |  10.5% |
    |        8 |     100% |      31.306M |            34.686M |  10.8% |
    --------------------------------------------------------------------
    |       12 |      25% |      38.398M |            43.770M |  14.0% |
    |       12 |      50% |      33.336M |            37.712M |  13.1% |
    |       12 |      75% |      29.917M |            34.440M |  15.1% |
    |       12 |     100% |      27.322M |            30.480M |  11.6% |
    --------------------------------------------------------------------
    |       16 |      25% |      41.491M |            41.921M |   1.0% |
    |       16 |      50% |      36.206M |            36.474M |   0.7% |
    |       16 |      75% |      32.529M |            33.027M |   1.5% |
    |       16 |     100% |      29.581M |            30.325M |   2.5% |
    --------------------------------------------------------------------
    |       20 |      25% |      34.240M |            36.787M |   7.4% |
    |       20 |      50% |      30.328M |            32.663M |   7.7% |
    |       20 |      75% |      27.536M |            29.354M |   6.6% |
    |       20 |     100% |      24.847M |            26.505M |   6.7% |
    --------------------------------------------------------------------
    |       24 |      25% |      36.329M |            40.608M |  11.8% |
    |       24 |      50% |      31.444M |            35.059M |  11.5% |
    |       24 |      75% |      28.426M |            31.452M |  10.6% |
    |       24 |     100% |      26.278M |            28.741M |   9.4% |
    --------------------------------------------------------------------
    |       28 |      25% |      31.540M |            31.944M |   1.3% |
    |       28 |      50% |      27.739M |            28.063M |   1.2% |
    |       28 |      75% |      24.993M |            25.814M |   3.3% |
    |       28 |     100% |      23.513M |            23.500M |  -0.1% |
    --------------------------------------------------------------------
    |       32 |      25% |      32.116M |            33.953M |   5.7% |
    |       32 |      50% |      28.879M |            29.859M |   3.4% |
    |       32 |      75% |      26.227M |            26.948M |   2.7% |
    |       32 |     100% |      23.829M |            24.613M |   3.3% |
    --------------------------------------------------------------------
    |       64 |      25% |      22.535M |            22.554M |   0.1% |
    |       64 |      50% |      20.471M |            20.675M |   1.0% |
    |       64 |      75% |      19.077M |            19.146M |   0.4% |
    |       64 |     100% |      17.710M |            18.131M |   2.4% |
    --------------------------------------------------------------------

The following script was used to gather the results (SMT & frequency off):

    cd tools/testing/selftests/bpf
    for key_size in 4 8 12 16 20 24 28 32 64; do
            for nr_entries in `seq 16384 16384 65536`; do
                    fullness=$(printf '%3s' $((nr_entries*100/65536)))
                    echo -n "key_size=$key_size: $fullness% full: "
                    sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=$key_size --nr_entries=$nr_entries --max_entries=65536 --nr_loops=2000000 --map_flags=0x40 | grep cpu
            done
            echo
    done

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 kernel/bpf/hashtab.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

Comments

Alexei Starovoitov April 1, 2023, 4:20 p.m. UTC | #1
On Sat, Apr 01, 2023 at 10:10:50AM +0000, Anton Protopopov wrote:
> The BPF hashmap uses the jhash() hash function. There is an optimized version
> of this hash function which may be used if hash size is a multiple of 4. Apply
> this optimization to the hashmap in a similar way as it is done in the bloom
> filter map.
> 
> On practice the optimization is only noticeable for smaller key sizes, which,
> however, is sufficient for many applications. An example is listed in the
> following table of measurements (a hashmap of 65536 elements was used):
> 
> [...]
> 
> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> ---
>  kernel/bpf/hashtab.c | 29 ++++++++++++++++++-----------
>  1 file changed, 18 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 96b645bba3a4..eb804815f7c3 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -103,6 +103,7 @@ struct bpf_htab {
>  	u32 n_buckets;	/* number of hash buckets */
>  	u32 elem_size;	/* size of each element in bytes */
>  	u32 hashrnd;
> +	u32 key_size_u32;
>  	struct lock_class_key lockdep_key;
>  	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
>  };
> @@ -510,6 +511,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	else
>  		htab->elem_size += round_up(htab->map.value_size, 8);
>  
> +	/* optimize hash computations if key_size is divisible by 4 */
> +	if ((attr->key_size & (sizeof(u32) - 1)) == 0)
> +		htab->key_size_u32 = attr->key_size / sizeof(u32);

Please use & 3 and / 4.
sizeof(u32) is not going to change.

> +
>  	err = -E2BIG;
>  	/* prevent zero size kmalloc and check for u32 overflow */
>  	if (htab->n_buckets == 0 ||
> @@ -605,9 +610,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	return ERR_PTR(err);
>  }
>  
> -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
>  {
> -	return jhash(key, key_len, hashrnd);
> +	if (likely(htab->key_size_u32))
> +		return jhash2(key, htab->key_size_u32, htab->hashrnd);
> +	return jhash(key, key_len, htab->hashrnd);

Could you measure the speed when the &3 and /4 are done in the hot path?
I would expect the performance to be the same or faster,
since the extra load is gone.
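
A sketch of the variant being asked about (hypothetical code, not taken from
this patch or from the later v2): derive the check and the word count from
key_len on each call instead of loading a cached htab->key_size_u32:

    static inline u32 htab_map_hash(const struct bpf_htab *htab,
                                    const void *key, u32 key_len)
    {
            /* key_len is unsigned, so the compiler lowers the check to a
             * mask and the division to a shift; no extra field load.
             */
            if (likely((key_len & 3) == 0))
                    return jhash2(key, key_len / 4, htab->hashrnd);
            return jhash(key, key_len, htab->hashrnd);
    }
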
Anton Protopopov April 1, 2023, 7:25 p.m. UTC | #2
On Sat, Apr 01, 2023 at 09:20:03AM -0700, Alexei Starovoitov wrote:
> On Sat, Apr 01, 2023 at 10:10:50AM +0000, Anton Protopopov wrote:
> > The BPF hashmap uses the jhash() hash function. There is an optimized version
> > of this hash function which may be used if hash size is a multiple of 4. Apply
> > this optimization to the hashmap in a similar way as it is done in the bloom
> > filter map.
> > 
> > On practice the optimization is only noticeable for smaller key sizes, which,
> > however, is sufficient for many applications. An example is listed in the
> > following table of measurements (a hashmap of 65536 elements was used):
> > 
> > [...]
> > 
> > Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> > ---
> >  kernel/bpf/hashtab.c | 29 ++++++++++++++++++-----------
> >  1 file changed, 18 insertions(+), 11 deletions(-)
> > 
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 96b645bba3a4..eb804815f7c3 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -103,6 +103,7 @@ struct bpf_htab {
> >  	u32 n_buckets;	/* number of hash buckets */
> >  	u32 elem_size;	/* size of each element in bytes */
> >  	u32 hashrnd;
> > +	u32 key_size_u32;
> >  	struct lock_class_key lockdep_key;
> >  	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> >  };
> > @@ -510,6 +511,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  	else
> >  		htab->elem_size += round_up(htab->map.value_size, 8);
> >  
> > +	/* optimize hash computations if key_size is divisible by 4 */
> > +	if ((attr->key_size & (sizeof(u32) - 1)) == 0)
> > +		htab->key_size_u32 = attr->key_size / sizeof(u32);
> 
> Please use & 3 and / 4.
> sizeof(u32) is not going to change.

Yes, sure

> > +
> >  	err = -E2BIG;
> >  	/* prevent zero size kmalloc and check for u32 overflow */
> >  	if (htab->n_buckets == 0 ||
> > @@ -605,9 +610,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  	return ERR_PTR(err);
> >  }
> >  
> > -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> > +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
> >  {
> > -	return jhash(key, key_len, hashrnd);
> > +	if (likely(htab->key_size_u32))
> > +		return jhash2(key, htab->key_size_u32, htab->hashrnd);
> > +	return jhash(key, key_len, htab->hashrnd);
> 
> Could you measure the speed when &3 and /4 is done in the hot path ?
> I would expect the performance to be the same or faster,
> since extra load is gone.

I don't see any visible difference (I've tested the "&3 and /4" and "%4==0 and /4"
variants). Do you still prefer division over using htab->key_size_u32?
Alexei Starovoitov April 1, 2023, 7:35 p.m. UTC | #3
On Sat, Apr 1, 2023 at 12:24 PM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> > > +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
> > >  {
> > > -   return jhash(key, key_len, hashrnd);
> > > +   if (likely(htab->key_size_u32))
> > > +           return jhash2(key, htab->key_size_u32, htab->hashrnd);
> > > +   return jhash(key, key_len, htab->hashrnd);
> >
> > Could you measure the speed when &3 and /4 is done in the hot path ?
> > I would expect the performance to be the same or faster,
> > since extra load is gone.
>
> I don't see any visible difference (I've tested "&3 and /4" and "%4==0 and /4"
> variants).

I bet the compiler generates the same code for these two variants.
% is optimized into &.
/ is optimized into >>.

> Do you still prefer division in favor of using htab->key_size_u32?

yes, because it's a shift.
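
A minimal illustration of the point (hypothetical helper names, assuming an
optimizing compiler):

    /* With an unsigned operand no divide instruction is emitted: the
     * modulo becomes a bitwise AND and the division a right shift.
     */
    static inline bool key_is_word_aligned(u32 key_len)
    {
            return (key_len % 4) == 0;      /* lowered to key_len & 3 */
    }

    static inline u32 key_len_in_words(u32 key_len)
    {
            return key_len / 4;             /* lowered to key_len >> 2 */
    }
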
Anton Protopopov April 1, 2023, 7:53 p.m. UTC | #4
On Sat, Apr 01, 2023 at 12:35:55PM -0700, Alexei Starovoitov wrote:
> On Sat, Apr 1, 2023 at 12:24 PM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > > -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> > > > +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
> > > >  {
> > > > -   return jhash(key, key_len, hashrnd);
> > > > +   if (likely(htab->key_size_u32))
> > > > +           return jhash2(key, htab->key_size_u32, htab->hashrnd);
> > > > +   return jhash(key, key_len, htab->hashrnd);
> > >
> > > Could you measure the speed when &3 and /4 is done in the hot path ?
> > > I would expect the performance to be the same or faster,
> > > since extra load is gone.
> >
> > I don't see any visible difference (I've tested "&3 and /4" and "%4==0 and /4"
> > variants).
> 
> I bet compiler generates the same code for these two variants.
> % is optimized into &.
> / is optimized into >>.

Yes, it does: the compiler emits "test $0x3,%dl" for the %4==0 check.

> > Do you still prefer division in favor of using htab->key_size_u32?
> 
> yes, because it's a shift.

ok, thanks, I will send v2 with division

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 96b645bba3a4..eb804815f7c3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -103,6 +103,7 @@  struct bpf_htab {
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 	u32 hashrnd;
+	u32 key_size_u32;
 	struct lock_class_key lockdep_key;
 	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
 };
@@ -510,6 +511,10 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->elem_size += round_up(htab->map.value_size, 8);
 
+	/* optimize hash computations if key_size is divisible by 4 */
+	if ((attr->key_size & (sizeof(u32) - 1)) == 0)
+		htab->key_size_u32 = attr->key_size / sizeof(u32);
+
 	err = -E2BIG;
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -605,9 +610,11 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
+static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
 {
-	return jhash(key, key_len, hashrnd);
+	if (likely(htab->key_size_u32))
+		return jhash2(key, htab->key_size_u32, htab->hashrnd);
+	return jhash(key, key_len, htab->hashrnd);
 }
 
 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
@@ -673,7 +680,7 @@  static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	head = select_bucket(htab, hash);
 
@@ -832,7 +839,7 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	if (!key)
 		goto find_first_elem;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	head = select_bucket(htab, hash);
 
@@ -1093,7 +1100,7 @@  static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	b = __select_bucket(htab, hash);
 	head = &b->head;
@@ -1195,7 +1202,7 @@  static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	b = __select_bucket(htab, hash);
 	head = &b->head;
@@ -1263,7 +1270,7 @@  static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	b = __select_bucket(htab, hash);
 	head = &b->head;
@@ -1318,7 +1325,7 @@  static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 
 	b = __select_bucket(htab, hash);
 	head = &b->head;
@@ -1393,7 +1400,7 @@  static long htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
@@ -1429,7 +1436,7 @@  static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
@@ -1572,7 +1579,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 
 	key_size = map->key_size;
 
-	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	hash = htab_map_hash(htab, key, key_size);
 	b = __select_bucket(htab, hash);
 	head = &b->head;