diff mbox series

[v4,1/2] bpf: avoid grabbing spin_locks of all cpus when no free elems

Message ID 20220601084149.13097-2-zhoufeng.zf@bytedance.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Optimize performance of update hash-map when free is zero | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 13 this patch: 13
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 13 this patch: 13
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Feng Zhou June 1, 2022, 8:41 a.m. UTC
From: Feng Zhou <zhoufeng.zf@bytedance.com>

This patch add is_empty in pcpu_freelist_head to check freelist
having free or not. If having, grab spin_lock, or check next cpu's
freelist.

Before patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 975345 events per sec
4:hash_map_perf pre-alloc 855367 events per sec
12:hash_map_perf pre-alloc 860862 events per sec
8:hash_map_perf pre-alloc 849561 events per sec
3:hash_map_perf pre-alloc 849074 events per sec
6:hash_map_perf pre-alloc 847120 events per sec
10:hash_map_perf pre-alloc 845047 events per sec
5:hash_map_perf pre-alloc 841266 events per sec
14:hash_map_perf pre-alloc 849740 events per sec
2:hash_map_perf pre-alloc 839598 events per sec
9:hash_map_perf pre-alloc 838695 events per sec
11:hash_map_perf pre-alloc 845390 events per sec
7:hash_map_perf pre-alloc 834865 events per sec
13:hash_map_perf pre-alloc 842619 events per sec
1:hash_map_perf pre-alloc 804231 events per sec
15:hash_map_perf pre-alloc 795314 events per sec

hash_map the worst: no free
./map_perf_test 2048
6:worse hash_map_perf pre-alloc 28628 events per sec
5:worse hash_map_perf pre-alloc 28553 events per sec
11:worse hash_map_perf pre-alloc 28543 events per sec
3:worse hash_map_perf pre-alloc 28444 events per sec
1:worse hash_map_perf pre-alloc 28418 events per sec
7:worse hash_map_perf pre-alloc 28427 events per sec
13:worse hash_map_perf pre-alloc 28330 events per sec
14:worse hash_map_perf pre-alloc 28263 events per sec
9:worse hash_map_perf pre-alloc 28211 events per sec
15:worse hash_map_perf pre-alloc 28193 events per sec
12:worse hash_map_perf pre-alloc 28190 events per sec
10:worse hash_map_perf pre-alloc 28129 events per sec
8:worse hash_map_perf pre-alloc 28116 events per sec
4:worse hash_map_perf pre-alloc 27906 events per sec
2:worse hash_map_perf pre-alloc 27801 events per sec
0:worse hash_map_perf pre-alloc 27416 events per sec
3:worse hash_map_perf pre-alloc 28188 events per sec

ftrace trace

0)               |  htab_map_update_elem() {
0)   0.198 us    |    migrate_disable();
0)               |    _raw_spin_lock_irqsave() {
0)   0.157 us    |      preempt_count_add();
0)   0.538 us    |    }
0)   0.260 us    |    lookup_elem_raw();
0)               |    alloc_htab_elem() {
0)               |      __pcpu_freelist_pop() {
0)               |        _raw_spin_lock() {
0)   0.152 us    |          preempt_count_add();
0)   0.352 us    |          native_queued_spin_lock_slowpath();
0)   1.065 us    |        }
		 |	  ...
0)               |        _raw_spin_unlock() {
0)   0.254 us    |          preempt_count_sub();
0)   0.555 us    |        }
0) + 25.188 us   |      }
0) + 25.486 us   |    }
0)               |    _raw_spin_unlock_irqrestore() {
0)   0.155 us    |      preempt_count_sub();
0)   0.454 us    |    }
0)   0.148 us    |    migrate_enable();
0) + 28.439 us   |  }

The test machine is 16C, trying to get spin_lock 17 times, in addition
to 16c, there is an extralist.

after patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 969348 events per sec
10:hash_map_perf pre-alloc 906526 events per sec
11:hash_map_perf pre-alloc 904557 events per sec
9:hash_map_perf pre-alloc 902384 events per sec
15:hash_map_perf pre-alloc 912287 events per sec
14:hash_map_perf pre-alloc 905689 events per sec
12:hash_map_perf pre-alloc 903680 events per sec
13:hash_map_perf pre-alloc 902631 events per sec
8:hash_map_perf pre-alloc 875369 events per sec
4:hash_map_perf pre-alloc 862808 events per sec
1:hash_map_perf pre-alloc 857218 events per sec
2:hash_map_perf pre-alloc 852875 events per sec
5:hash_map_perf pre-alloc 846497 events per sec
6:hash_map_perf pre-alloc 828467 events per sec
3:hash_map_perf pre-alloc 812542 events per sec
7:hash_map_perf pre-alloc 805336 events per sec

hash_map worst: no free
./map_perf_test 2048
7:worse hash_map_perf pre-alloc 391104 events per sec
4:worse hash_map_perf pre-alloc 388073 events per sec
5:worse hash_map_perf pre-alloc 387038 events per sec
1:worse hash_map_perf pre-alloc 386546 events per sec
0:worse hash_map_perf pre-alloc 384590 events per sec
11:worse hash_map_perf pre-alloc 379378 events per sec
10:worse hash_map_perf pre-alloc 375480 events per sec
12:worse hash_map_perf pre-alloc 372394 events per sec
6:worse hash_map_perf pre-alloc 367692 events per sec
3:worse hash_map_perf pre-alloc 363970 events per sec
9:worse hash_map_perf pre-alloc 364008 events per sec
8:worse hash_map_perf pre-alloc 363759 events per sec
2:worse hash_map_perf pre-alloc 360743 events per sec
14:worse hash_map_perf pre-alloc 361195 events per sec
13:worse hash_map_perf pre-alloc 360276 events per sec
15:worse hash_map_perf pre-alloc 360057 events per sec
0:worse hash_map_perf pre-alloc 378177 events per sec

ftrace trace
0)               |  htab_map_update_elem() {
0)   0.317 us    |    migrate_disable();
0)               |    _raw_spin_lock_irqsave() {
0)   0.260 us    |      preempt_count_add();
0)   1.803 us    |    }
0)   0.276 us    |    lookup_elem_raw();
0)               |    alloc_htab_elem() {
0)   0.586 us    |      __pcpu_freelist_pop();
0)   0.945 us    |    }
0)               |    _raw_spin_unlock_irqrestore() {
0)   0.160 us    |      preempt_count_sub();
0)   0.972 us    |    }
0)   0.657 us    |    migrate_enable();
0)   8.669 us    |  }

It can be seen that after adding this patch, the map performance is
almost not degraded, and when free=0, first check is_empty instead of
directly acquiring spin_lock.

As for why to add is_empty instead of directly judging head->first, my
understanding is this, head->first is frequently modified during updating
map, which will lead to invalid other cpus's cache, and is_empty is after
freelist having no free elems will be changed, the performance will be better.

Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
---
 kernel/bpf/percpu_freelist.c | 28 +++++++++++++++++++++++++---
 kernel/bpf/percpu_freelist.h |  1 +
 2 files changed, 26 insertions(+), 3 deletions(-)

Comments

Alexei Starovoitov June 1, 2022, 9:50 a.m. UTC | #1
On Wed, Jun 1, 2022 at 10:42 AM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
>
>  static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
> @@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
>         orig_cpu = cpu = raw_smp_processor_id();
>         while (1) {
>                 head = per_cpu_ptr(s->freelist, cpu);
> +               if (READ_ONCE(head->is_empty))
> +                       goto next_cpu;
>                 raw_spin_lock(&head->lock);
>                 node = head->first;
>                 if (node) {

extra bool is unnecessary.
just READ_ONCE(head->first)
Feng Zhou June 1, 2022, 11:10 a.m. UTC | #2
在 2022/6/1 下午5:50, Alexei Starovoitov 写道:
> On Wed, Jun 1, 2022 at 10:42 AM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
>>   static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
>> @@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
>>          orig_cpu = cpu = raw_smp_processor_id();
>>          while (1) {
>>                  head = per_cpu_ptr(s->freelist, cpu);
>> +               if (READ_ONCE(head->is_empty))
>> +                       goto next_cpu;
>>                  raw_spin_lock(&head->lock);
>>                  node = head->first;
>>                  if (node) {
> extra bool is unnecessary.
> just READ_ONCE(head->first)

As for why to add is_empty instead of directly judging head->first, my
understanding is this, head->first is frequently modified during updating
map, which will lead to invalid other cpus's cache, and is_empty is after
freelist having no free elems will be changed, the performance will be 
better.
If I'm thinking wrong, please tell me why.
Alexei Starovoitov June 1, 2022, 11:35 a.m. UTC | #3
On Wed, Jun 1, 2022 at 1:11 PM Feng Zhou <zhoufeng.zf@bytedance.com> wrote:
>
> 在 2022/6/1 下午5:50, Alexei Starovoitov 写道:
> > On Wed, Jun 1, 2022 at 10:42 AM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
> >>   static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
> >> @@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
> >>          orig_cpu = cpu = raw_smp_processor_id();
> >>          while (1) {
> >>                  head = per_cpu_ptr(s->freelist, cpu);
> >> +               if (READ_ONCE(head->is_empty))
> >> +                       goto next_cpu;
> >>                  raw_spin_lock(&head->lock);
> >>                  node = head->first;
> >>                  if (node) {
> > extra bool is unnecessary.
> > just READ_ONCE(head->first)
>
> As for why to add is_empty instead of directly judging head->first, my
> understanding is this, head->first is frequently modified during updating
> map, which will lead to invalid other cpus's cache, and is_empty is after
> freelist having no free elems will be changed, the performance will be
> better.

maybe. pls benchmark it.
imo wasting a bool for the corner case is not a good trade off.
Feng Zhou June 2, 2022, 3:27 a.m. UTC | #4
在 2022/6/1 下午7:35, Alexei Starovoitov 写道:
> On Wed, Jun 1, 2022 at 1:11 PM Feng Zhou <zhoufeng.zf@bytedance.com> wrote:
>> 在 2022/6/1 下午5:50, Alexei Starovoitov 写道:
>>> On Wed, Jun 1, 2022 at 10:42 AM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
>>>>    static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
>>>> @@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
>>>>           orig_cpu = cpu = raw_smp_processor_id();
>>>>           while (1) {
>>>>                   head = per_cpu_ptr(s->freelist, cpu);
>>>> +               if (READ_ONCE(head->is_empty))
>>>> +                       goto next_cpu;
>>>>                   raw_spin_lock(&head->lock);
>>>>                   node = head->first;
>>>>                   if (node) {
>>> extra bool is unnecessary.
>>> just READ_ONCE(head->first)
>> As for why to add is_empty instead of directly judging head->first, my
>> understanding is this, head->first is frequently modified during updating
>> map, which will lead to invalid other cpus's cache, and is_empty is after
>> freelist having no free elems will be changed, the performance will be
>> better.
> maybe. pls benchmark it.
> imo wasting a bool for the corner case is not a good trade off.

Yes, I will do and post the results as soon as possible, Thanks.
Feng Zhou June 6, 2022, 6:12 a.m. UTC | #5
在 2022/6/1 下午7:35, Alexei Starovoitov 写道:
> On Wed, Jun 1, 2022 at 1:11 PM Feng Zhou <zhoufeng.zf@bytedance.com> wrote:
>> 在 2022/6/1 下午5:50, Alexei Starovoitov 写道:
>>> On Wed, Jun 1, 2022 at 10:42 AM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
>>>>    static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
>>>> @@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
>>>>           orig_cpu = cpu = raw_smp_processor_id();
>>>>           while (1) {
>>>>                   head = per_cpu_ptr(s->freelist, cpu);
>>>> +               if (READ_ONCE(head->is_empty))
>>>> +                       goto next_cpu;
>>>>                   raw_spin_lock(&head->lock);
>>>>                   node = head->first;
>>>>                   if (node) {
>>> extra bool is unnecessary.
>>> just READ_ONCE(head->first)
>> As for why to add is_empty instead of directly judging head->first, my
>> understanding is this, head->first is frequently modified during updating
>> map, which will lead to invalid other cpus's cache, and is_empty is after
>> freelist having no free elems will be changed, the performance will be
>> better.
> maybe. pls benchmark it.
> imo wasting a bool for the corner case is not a good trade off.

before patch
./map_perf_test 1
35:hash_map_perf pre-alloc 1224983 events per sec
38:hash_map_perf pre-alloc 1113232 events per sec
27:hash_map_perf pre-alloc 1097989 events per sec
19:hash_map_perf pre-alloc 1092061 events per sec
21:hash_map_perf pre-alloc 1084639 events per sec
29:hash_map_perf pre-alloc 1077162 events per sec
4:hash_map_perf pre-alloc 1067511 events per sec
9:hash_map_perf pre-alloc 1063166 events per sec
33:hash_map_perf pre-alloc 1064487 events per sec
8:hash_map_perf pre-alloc 1059271 events per sec
32:hash_map_perf pre-alloc 1061351 events per sec
1:hash_map_perf pre-alloc 1055527 events per sec
15:hash_map_perf pre-alloc 1056587 events per sec
2:hash_map_perf pre-alloc 1054106 events per sec
13:hash_map_perf pre-alloc 1053028 events per sec
25:hash_map_perf pre-alloc 1053575 events per sec
26:hash_map_perf pre-alloc 1052503 events per sec
7:hash_map_perf pre-alloc 1049950 events per sec
39:hash_map_perf pre-alloc 1054421 events per sec
28:hash_map_perf pre-alloc 1050109 events per sec
6:hash_map_perf pre-alloc 1046496 events per sec
44:hash_map_perf pre-alloc 1054757 events per sec
34:hash_map_perf pre-alloc 1048549 events per sec
31:hash_map_perf pre-alloc 1047911 events per sec
18:hash_map_perf pre-alloc 1046435 events per sec
41:hash_map_perf pre-alloc 1051626 events per sec
0:hash_map_perf pre-alloc 1043397 events per sec
10:hash_map_perf pre-alloc 1043903 events per sec
20:hash_map_perf pre-alloc 1044380 events per sec
24:hash_map_perf pre-alloc 1042957 events per sec
47:hash_map_perf pre-alloc 1049337 events per sec
17:hash_map_perf pre-alloc 1038108 events per sec
42:hash_map_perf pre-alloc 1044159 events per sec
45:hash_map_perf pre-alloc 1044698 events per sec
37:hash_map_perf pre-alloc 1038156 events per sec
46:hash_map_perf pre-alloc 1039755 events per sec
22:hash_map_perf pre-alloc 1032287 events per sec
14:hash_map_perf pre-alloc 1019353 events per sec
30:hash_map_perf pre-alloc 1019358 events per sec
43:hash_map_perf pre-alloc 1015956 events per sec
36:hash_map_perf pre-alloc 997864 events per sec
40:hash_map_perf pre-alloc 972771 events per sec
12:hash_map_perf pre-alloc 891118 events per sec
16:hash_map_perf pre-alloc 882166 events per sec
23:hash_map_perf pre-alloc 882177 events per sec
11:hash_map_perf pre-alloc 880153 events per sec
3:hash_map_perf pre-alloc 843192 events per sec
5:hash_map_perf pre-alloc 826944 events per sec

./run_bench_bpf_hashmap_full_update.sh
Setting up benchmark 'bpf-hashmap-ful-update'...
Benchmark 'bpf-hashmap-ful-update' started.
1:hash_map_full_perf 15687 events per sec
2:hash_map_full_perf 15760 events per sec
3:hash_map_full_perf 15699 events per sec
4:hash_map_full_perf 15732 events per sec
5:hash_map_full_perf 15633 events per sec
6:hash_map_full_perf 15623 events per sec
7:hash_map_full_perf 15678 events per sec
8:hash_map_full_perf 15661 events per sec
9:hash_map_full_perf 15659 events per sec
10:hash_map_full_perf 15653 events per sec
11:hash_map_full_perf 15632 events per sec
12:hash_map_full_perf 16059 events per sec
13:hash_map_full_perf 16055 events per sec
14:hash_map_full_perf 16093 events per sec
15:hash_map_full_perf 16053 events per sec
16:hash_map_full_perf 16096 events per sec
17:hash_map_full_perf 15977 events per sec
18:hash_map_full_perf 15986 events per sec
19:hash_map_full_perf 16109 events per sec
20:hash_map_full_perf 16025 events per sec
21:hash_map_full_perf 16052 events per sec
22:hash_map_full_perf 16023 events per sec
23:hash_map_full_perf 16008 events per sec
24:hash_map_full_perf 16484 events per sec
25:hash_map_full_perf 15684 events per sec
26:hash_map_full_perf 15749 events per sec
27:hash_map_full_perf 15677 events per sec
28:hash_map_full_perf 15699 events per sec
29:hash_map_full_perf 15630 events per sec
30:hash_map_full_perf 15603 events per sec
31:hash_map_full_perf 15664 events per sec
32:hash_map_full_perf 15645 events per sec
33:hash_map_full_perf 15682 events per sec
34:hash_map_full_perf 15636 events per sec
35:hash_map_full_perf 15628 events per sec
36:hash_map_full_perf 16068 events per sec
37:hash_map_full_perf 16056 events per sec
38:hash_map_full_perf 16105 events per sec
39:hash_map_full_perf 16077 events per sec
40:hash_map_full_perf 16060 events per sec
41:hash_map_full_perf 15986 events per sec
42:hash_map_full_perf 15962 events per sec
43:hash_map_full_perf 16074 events per sec
44:hash_map_full_perf 16040 events per sec
45:hash_map_full_perf 16035 events per sec
46:hash_map_full_perf 16017 events per sec
47:hash_map_full_perf 15957 events per sec

after patch, use head->is_empty
./map_perf_test 1
6:hash_map_perf pre-alloc 1126051 events per sec
34:hash_map_perf pre-alloc 1122413 events per sec
42:hash_map_perf pre-alloc 1088827 events per sec
39:hash_map_perf pre-alloc 1089041 events per sec
2:hash_map_perf pre-alloc 1062943 events per sec
33:hash_map_perf pre-alloc 1065414 events per sec
4:hash_map_perf pre-alloc 1057170 events per sec
3:hash_map_perf pre-alloc 1056752 events per sec
7:hash_map_perf pre-alloc 1055573 events per sec
1:hash_map_perf pre-alloc 1054998 events per sec
27:hash_map_perf pre-alloc 1056539 events per sec
28:hash_map_perf pre-alloc 1055846 events per sec
14:hash_map_perf pre-alloc 1053706 events per sec
25:hash_map_perf pre-alloc 1054690 events per sec
31:hash_map_perf pre-alloc 1055151 events per sec
13:hash_map_perf pre-alloc 1050262 events per sec
38:hash_map_perf pre-alloc 1051390 events per sec
37:hash_map_perf pre-alloc 1050348 events per sec
44:hash_map_perf pre-alloc 1049442 events per sec
45:hash_map_perf pre-alloc 1049346 events per sec
5:hash_map_perf pre-alloc 1041591 events per sec
16:hash_map_perf pre-alloc 1041668 events per sec
22:hash_map_perf pre-alloc 1041963 events per sec
23:hash_map_perf pre-alloc 1040848 events per sec
11:hash_map_perf pre-alloc 1038474 events per sec
0:hash_map_perf pre-alloc 1037474 events per sec
29:hash_map_perf pre-alloc 1040162 events per sec
12:hash_map_perf pre-alloc 1038138 events per sec
24:hash_map_perf pre-alloc 1036339 events per sec
36:hash_map_perf pre-alloc 1036703 events per sec
35:hash_map_perf pre-alloc 1035780 events per sec
46:hash_map_perf pre-alloc 1035651 events per sec
47:hash_map_perf pre-alloc 1031633 events per sec
26:hash_map_perf pre-alloc 1022568 events per sec
9:hash_map_perf pre-alloc 1020232 events per sec
21:hash_map_perf pre-alloc 1012416 events per sec
20:hash_map_perf pre-alloc 1010835 events per sec
15:hash_map_perf pre-alloc 998342 events per sec
17:hash_map_perf pre-alloc 994979 events per sec
43:hash_map_perf pre-alloc 995927 events per sec
30:hash_map_perf pre-alloc 890710 events per sec
10:hash_map_perf pre-alloc 886156 events per sec
40:hash_map_perf pre-alloc 835611 events per sec
18:hash_map_perf pre-alloc 826670 events per sec
8:hash_map_perf pre-alloc 784346 events per sec
41:hash_map_perf pre-alloc 781841 events per sec
32:hash_map_perf pre-alloc 775770 events per sec
19:hash_map_perf pre-alloc 774079 events per sec

./run_bench_bpf_hashmap_full_update.sh
Setting up benchmark 'bpf-hashmap-ful-update'...
Benchmark 'bpf-hashmap-ful-update' started.
1:hash_map_full_perf 607964 events per sec
2:hash_map_full_perf 580060 events per sec
3:hash_map_full_perf 617285 events per sec
4:hash_map_full_perf 647106 events per sec
5:hash_map_full_perf 578899 events per sec
6:hash_map_full_perf 620514 events per sec
7:hash_map_full_perf 601275 events per sec
8:hash_map_full_perf 638629 events per sec
9:hash_map_full_perf 587900 events per sec
10:hash_map_full_perf 574542 events per sec
11:hash_map_full_perf 575143 events per sec
12:hash_map_full_perf 594191 events per sec
13:hash_map_full_perf 587638 events per sec
14:hash_map_full_perf 543425 events per sec
15:hash_map_full_perf 566564 events per sec
16:hash_map_full_perf 603950 events per sec
17:hash_map_full_perf 567153 events per sec
18:hash_map_full_perf 604260 events per sec
19:hash_map_full_perf 581898 events per sec
20:hash_map_full_perf 569864 events per sec
21:hash_map_full_perf 307428 events per sec
22:hash_map_full_perf 621568 events per sec
23:hash_map_full_perf 568043 events per sec
24:hash_map_full_perf 714765 events per sec
25:hash_map_full_perf 613165 events per sec
26:hash_map_full_perf 647286 events per sec
27:hash_map_full_perf 610911 events per sec
28:hash_map_full_perf 590805 events per sec
29:hash_map_full_perf 621013 events per sec
30:hash_map_full_perf 614053 events per sec
31:hash_map_full_perf 618858 events per sec
32:hash_map_full_perf 593847 events per sec
33:hash_map_full_perf 648223 events per sec
34:hash_map_full_perf 649868 events per sec
35:hash_map_full_perf 657349 events per sec
36:hash_map_full_perf 595112 events per sec
37:hash_map_full_perf 595443 events per sec
38:hash_map_full_perf 557591 events per sec
39:hash_map_full_perf 591079 events per sec
40:hash_map_full_perf 558251 events per sec
41:hash_map_full_perf 572870 events per sec
42:hash_map_full_perf 567184 events per sec
43:hash_map_full_perf 604783 events per sec
44:hash_map_full_perf 632444 events per sec
45:hash_map_full_perf 307268 events per sec
46:hash_map_full_perf 566827 events per sec
47:hash_map_full_perf 626162 events per sec

after patch, use head->first
./map_perf_test 1
45:hash_map_perf pre-alloc 1263804 events per sec
4:hash_map_perf pre-alloc 1234841 events per sec
6:hash_map_perf pre-alloc 1231915 events per sec
11:hash_map_perf pre-alloc 1206927 events per sec
20:hash_map_perf pre-alloc 1179066 events per sec
32:hash_map_perf pre-alloc 1177190 events per sec
23:hash_map_perf pre-alloc 1170498 events per sec
12:hash_map_perf pre-alloc 1140194 events per sec
37:hash_map_perf pre-alloc 1136824 events per sec
9:hash_map_perf pre-alloc 1118735 events per sec
39:hash_map_perf pre-alloc 1113166 events per sec
3:hash_map_perf pre-alloc 1096464 events per sec
19:hash_map_perf pre-alloc 1084696 events per sec
43:hash_map_perf pre-alloc 1087715 events per sec
14:hash_map_perf pre-alloc 1074943 events per sec
38:hash_map_perf pre-alloc 1073905 events per sec
2:hash_map_perf pre-alloc 1067794 events per sec
17:hash_map_perf pre-alloc 1067320 events per sec
26:hash_map_perf pre-alloc 1067185 events per sec
41:hash_map_perf pre-alloc 1066780 events per sec
15:hash_map_perf pre-alloc 1057620 events per sec
0:hash_map_perf pre-alloc 1053298 events per sec
10:hash_map_perf pre-alloc 1053699 events per sec
24:hash_map_perf pre-alloc 1053075 events per sec
34:hash_map_perf pre-alloc 1053347 events per sec
18:hash_map_perf pre-alloc 1050559 events per sec
42:hash_map_perf pre-alloc 1050033 events per sec
33:hash_map_perf pre-alloc 1025317 events per sec
29:hash_map_perf pre-alloc 1000465 events per sec
28:hash_map_perf pre-alloc 975533 events per sec
35:hash_map_perf pre-alloc 974307 events per sec
44:hash_map_perf pre-alloc 966598 events per sec
27:hash_map_perf pre-alloc 962746 events per sec
36:hash_map_perf pre-alloc 945986 events per sec
22:hash_map_perf pre-alloc 914717 events per sec
13:hash_map_perf pre-alloc 901797 events per sec
30:hash_map_perf pre-alloc 849463 events per sec
5:hash_map_perf pre-alloc 842851 events per sec
16:hash_map_perf pre-alloc 814149 events per sec
1:hash_map_perf pre-alloc 798610 events per sec
46:hash_map_perf pre-alloc 793487 events per sec
40:hash_map_perf pre-alloc 772092 events per sec
7:hash_map_perf pre-alloc 742190 events per sec
21:hash_map_perf pre-alloc 714585 events per sec
8:hash_map_perf pre-alloc 702297 events per sec
31:hash_map_perf pre-alloc 700180 events per sec
47:hash_map_perf pre-alloc 686786 events per sec
25:hash_map_perf pre-alloc 655706 events per sec

./run_bench_bpf_hashmap_full_update.sh
Setting up benchmark 'bpf-hashmap-ful-update'...
Benchmark 'bpf-hashmap-ful-update' started.
1:hash_map_full_perf 555830 events per sec
2:hash_map_full_perf 591009 events per sec
3:hash_map_full_perf 585934 events per sec
4:hash_map_full_perf 573066 events per sec
5:hash_map_full_perf 586800 events per sec
6:hash_map_full_perf 587997 events per sec
7:hash_map_full_perf 610463 events per sec
8:hash_map_full_perf 560239 events per sec
9:hash_map_full_perf 612683 events per sec
10:hash_map_full_perf 617636 events per sec
11:hash_map_full_perf 558120 events per sec
12:hash_map_full_perf 505507 events per sec
13:hash_map_full_perf 509096 events per sec
14:hash_map_full_perf 500372 events per sec
15:hash_map_full_perf 495395 events per sec
16:hash_map_full_perf 510147 events per sec
17:hash_map_full_perf 511348 events per sec
18:hash_map_full_perf 523750 events per sec
19:hash_map_full_perf 508013 events per sec
20:hash_map_full_perf 528064 events per sec
21:hash_map_full_perf 543195 events per sec
22:hash_map_full_perf 541737 events per sec
23:hash_map_full_perf 528646 events per sec
24:hash_map_full_perf 683963 events per sec
25:hash_map_full_perf 598496 events per sec
26:hash_map_full_perf 528436 events per sec
27:hash_map_full_perf 576641 events per sec
28:hash_map_full_perf 599424 events per sec
29:hash_map_full_perf 575479 events per sec
30:hash_map_full_perf 580070 events per sec
31:hash_map_full_perf 563594 events per sec
32:hash_map_full_perf 601996 events per sec
33:hash_map_full_perf 548413 events per sec
34:hash_map_full_perf 551068 events per sec
35:hash_map_full_perf 605726 events per sec
36:hash_map_full_perf 505460 events per sec
37:hash_map_full_perf 519113 events per sec
38:hash_map_full_perf 547602 events per sec
39:hash_map_full_perf 547053 events per sec
40:hash_map_full_perf 516993 events per sec
41:hash_map_full_perf 506970 events per sec
42:hash_map_full_perf 500630 events per sec
43:hash_map_full_perf 553099 events per sec
44:hash_map_full_perf 528657 events per sec
45:hash_map_full_perf 517173 events per sec
46:hash_map_full_perf 503649 events per sec
47:hash_map_full_perf 527035 events per sec

 From the point of view of normal performance test, using head->first 
and head->is_empty, compared
with before patch, there is not much performance drop. In the worst 
case, the comparison between
head->first and head->is_empty is not much different as a whole.
diff mbox series

Patch

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..4d55a81ba896 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -16,9 +16,11 @@  int pcpu_freelist_init(struct pcpu_freelist *s)
 
 		raw_spin_lock_init(&head->lock);
 		head->first = NULL;
+		head->is_empty = true;
 	}
 	raw_spin_lock_init(&s->extralist.lock);
 	s->extralist.first = NULL;
+	s->extralist.is_empty = true;
 	return 0;
 }
 
@@ -32,6 +34,8 @@  static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
 {
 	node->next = head->first;
 	head->first = node;
+	if (head->is_empty)
+		WRITE_ONCE(head->is_empty, false);
 }
 
 static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +134,19 @@  static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
+		if (READ_ONCE(head->is_empty))
+			goto next_cpu;
 		raw_spin_lock(&head->lock);
 		node = head->first;
 		if (node) {
 			head->first = node->next;
+			if (!head->first)
+				WRITE_ONCE(head->is_empty, true);
 			raw_spin_unlock(&head->lock);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
+next_cpu:
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
@@ -146,10 +155,15 @@  static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	}
 
 	/* per cpu lists are all empty, try extralist */
+	if (READ_ONCE(s->extralist.is_empty))
+		return NULL;
 	raw_spin_lock(&s->extralist.lock);
 	node = s->extralist.first;
-	if (node)
+	if (node) {
 		s->extralist.first = node->next;
+		if (!s->extralist.first)
+			WRITE_ONCE(s->extralist.is_empty, true);
+	}
 	raw_spin_unlock(&s->extralist.lock);
 	return node;
 }
@@ -164,15 +178,20 @@  ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
+		if (READ_ONCE(head->is_empty))
+			goto next_cpu;
 		if (raw_spin_trylock(&head->lock)) {
 			node = head->first;
 			if (node) {
 				head->first = node->next;
+				if (!head->first)
+					WRITE_ONCE(head->is_empty, true);
 				raw_spin_unlock(&head->lock);
 				return node;
 			}
 			raw_spin_unlock(&head->lock);
 		}
+next_cpu:
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
@@ -181,11 +200,14 @@  ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	}
 
 	/* cannot pop from per cpu lists, try extralist */
-	if (!raw_spin_trylock(&s->extralist.lock))
+	if (READ_ONCE(s->extralist.is_empty) || !raw_spin_trylock(&s->extralist.lock))
 		return NULL;
 	node = s->extralist.first;
-	if (node)
+	if (node) {
 		s->extralist.first = node->next;
+		if (!s->extralist.first)
+			WRITE_ONCE(s->extralist.is_empty, true);
+	}
 	raw_spin_unlock(&s->extralist.lock);
 	return node;
 }
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..9e4545631ed5 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -9,6 +9,7 @@ 
 struct pcpu_freelist_head {
 	struct pcpu_freelist_node *first;
 	raw_spinlock_t lock;
+	bool is_empty;
 };
 
 struct pcpu_freelist {