Message ID | 20230308005050.255859-2-kuifeng@meta.com
---|---
State | Superseded
Delegated to: | BPF
Series | Transit between BPF TCP congestion controls.
On 3/7/23 4:50 PM, Kui-Feng Lee wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 6792a7940e1e..50cfc2388cbc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -78,6 +78,7 @@ struct bpf_map_ops {
>  	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
>  	void (*map_release)(struct bpf_map *map, struct file *map_file);
>  	void (*map_free)(struct bpf_map *map);
> +	void (*map_free_rcu)(struct bpf_map *map);

This is no longer needed...

>  	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
>  	void (*map_release_uref)(struct bpf_map *map);
>  	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
> @@ -1934,6 +1935,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
>  struct bpf_map *__bpf_map_get(struct fd f);
>  void bpf_map_inc(struct bpf_map *map);
>  void bpf_map_inc_with_uref(struct bpf_map *map);
> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
>  struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
>  void bpf_map_put_with_uref(struct bpf_map *map);
>  void bpf_map_put(struct bpf_map *map);
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 38903fb52f98..9e097fcc9cf4 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
>  	struct bpf_struct_ops_value kvalue;
>  };
>
> +static DEFINE_MUTEX(update_mutex);

Please address or reply to the earlier review comments. Stan had mentioned
in v3 that this mutex is unused in this patch.

This is only used in patch 5 of this set. Please move it there.

> +
>  #define VALUE_PREFIX "bpf_struct_ops_"
>  #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
>
> @@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
>  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
>  	struct bpf_struct_ops_value *uvalue, *kvalue;
>  	enum bpf_struct_ops_state state;
> +	s64 refcnt;
>
>  	if (unlikely(*(u32 *)key != 0))
>  		return -ENOENT;
> @@ -267,7 +270,9 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
>  	uvalue = value;
>  	memcpy(uvalue, st_map->uvalue, map->value_size);
>  	uvalue->state = state;
> -	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
> +
> +	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
> +	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));

Please explain in a few words why it will work well enough and does not
need to be very accurate (eg. it is for introspection purposes, to give an
idea of how many refcnts are held by a subsystem, eg tcp_sock ...). This
was also a comment given in v3.

>
>  	return 0;
>  }
> @@ -491,7 +496,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>  		*(unsigned long *)(udata + moff) = prog->aux->id;
>  	}
>
> -	refcount_set(&kvalue->refcnt, 1);
>  	bpf_map_inc(map);
>
>  	set_memory_rox((long)st_map->image, 1);
> @@ -536,8 +540,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>  	switch (prev_state) {
>  	case BPF_STRUCT_OPS_STATE_INUSE:
>  		st_map->st_ops->unreg(&st_map->kvalue.data);
> -		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
> -			bpf_map_put(map);
> +		bpf_map_put(map);
>  		return 0;
>  	case BPF_STRUCT_OPS_STATE_TOBEFREE:
>  		return -EINPROGRESS;
> @@ -574,6 +577,19 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>  {
>  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
>
> +	/* The struct_ops's function may switch to another struct_ops.
> +	 *
> +	 * For example, bpf_tcp_cc_x->init() may switch to
> +	 * another tcp_cc_y by calling
> +	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> +	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> +	 * and its refcount may reach 0 which then free its
> +	 * trampoline image while tcp_cc_x is still running.
> +	 *
> +	 * Thus, a rcu grace period is needed here.
> +	 */
> +	synchronize_rcu();
> +
>  	if (st_map->links)
>  		bpf_struct_ops_map_put_progs(st_map);
>  	bpf_map_area_free(st_map->links);
> @@ -676,41 +692,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
>  bool bpf_struct_ops_get(const void *kdata)
>  {
>  	struct bpf_struct_ops_value *kvalue;
> +	struct bpf_struct_ops_map *st_map;
> +	struct bpf_map *map;
>
>  	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
> +	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
>
> -	return refcount_inc_not_zero(&kvalue->refcnt);
> -}
> -
> -static void bpf_struct_ops_put_rcu(struct rcu_head *head)
> -{
> -	struct bpf_struct_ops_map *st_map;
> -
> -	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
> -	bpf_map_put(&st_map->map);
> +	map = __bpf_map_inc_not_zero(&st_map->map, false);
> +	return !IS_ERR(map);
>  }
>
>  void bpf_struct_ops_put(const void *kdata)
>  {
>  	struct bpf_struct_ops_value *kvalue;
> +	struct bpf_struct_ops_map *st_map;
>
>  	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
> -	if (refcount_dec_and_test(&kvalue->refcnt)) {
> -		struct bpf_struct_ops_map *st_map;
> -
> -		st_map = container_of(kvalue, struct bpf_struct_ops_map,
> -				      kvalue);
> -		/* The struct_ops's function may switch to another struct_ops.
> -		 *
> -		 * For example, bpf_tcp_cc_x->init() may switch to
> -		 * another tcp_cc_y by calling
> -		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> -		 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> -		 * and its map->refcnt may reach 0 which then free its
> -		 * trampoline image while tcp_cc_x is still running.
> -		 *
> -		 * Thus, a rcu grace period is needed here.
> -		 */
> -		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
> -	}
> +	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
> +
> +	bpf_map_put(&st_map->map);
>  }
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index f406dfa13792..03273cddd6bd 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1288,7 +1288,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
>  }
>
>  /* map_idr_lock should have been held */

This comment needs to change, eg.
map_idr_lock should have been held or the map is protected by rcu gp....

> -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
>  {
>  	int refold;
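Why an approximate count is good enough here: a user reference (an open fd or
a pin) bumps both map->refcnt and map->usercnt, while a kernel-side hold taken
via bpf_struct_ops_get() bumps only map->refcnt, so refcnt - usercnt roughly
counts the kernel-side holders (e.g. tcp_socks using a BPF congestion
control). The two counters are read without a common lock, so the snapshot can
be transiently off and is clamped at zero. A toy user-space model of that
snapshot-and-clamp step follows; all counter values are hypothetical stand-ins
for the atomic64_read() snapshots:

```c
#include <stdio.h>

/* Toy model of the snapshot arithmetic added to
 * bpf_struct_ops_map_sys_lookup_elem(). */
int main(void)
{
	long long refcnt = 3;	/* e.g. 1 open fd + 2 tcp_sock holders */
	long long usercnt = 1;	/* the open fd only */
	long long held = refcnt - usercnt;

	/* The two reads are not atomic as a pair, so the difference can
	 * be transiently negative; clamp it like max_t(s64, held, 0). */
	if (held < 0)
		held = 0;

	printf("approx kernel-side refcnt = %lld\n", held); /* prints 2 */
	return 0;
}
```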
On 3/8/23 10:30, Martin KaFai Lau wrote:
> On 3/7/23 4:50 PM, Kui-Feng Lee wrote:
>> +	void (*map_free_rcu)(struct bpf_map *map);
>
> This is no longer needed...
>
> [...]
>
>> +static DEFINE_MUTEX(update_mutex);
>
> Please address or reply to the earlier review comments. Stan had
> mentioned in v3 that this mutex is unused in this patch.
>
> This is only used in patch 5 of this set. Please move it there.

I will address this.

> [...]
>
>> +	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
>> +	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
>
> Please explain in a few words why it will work well enough and does
> not need to be very accurate (eg. it is for introspection purposes,
> to give an idea of how many refcnts are held by a subsystem, eg
> tcp_sock ...). This was also a comment given in v3.

Sure.

> [...]
>
>>  /* map_idr_lock should have been held */
>
> This comment needs to change, eg.
> map_idr_lock should have been held or the map is protected by rcu gp....

Sure.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..50cfc2388cbc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -78,6 +78,7 @@ struct bpf_map_ops {
 	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
 	void (*map_release)(struct bpf_map *map, struct file *map_file);
 	void (*map_free)(struct bpf_map *map);
+	void (*map_free_rcu)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
 	void (*map_release_uref)(struct bpf_map *map);
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1934,6 +1935,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 38903fb52f98..9e097fcc9cf4 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
 	struct bpf_struct_ops_value kvalue;
 };
 
+static DEFINE_MUTEX(update_mutex);
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	enum bpf_struct_ops_state state;
+	s64 refcnt;
 
 	if (unlikely(*(u32 *)key != 0))
 		return -ENOENT;
@@ -267,7 +270,9 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
 	uvalue->state = state;
-	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
 
 	return 0;
 }
@@ -491,7 +496,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	refcount_set(&kvalue->refcnt, 1);
 	bpf_map_inc(map);
 
 	set_memory_rox((long)st_map->image, 1);
@@ -536,8 +540,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
-		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
-			bpf_map_put(map);
+		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
 		return -EINPROGRESS;
@@ -574,6 +577,19 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * Thus, a rcu grace period is needed here.
+	 */
+	synchronize_rcu();
+
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
@@ -676,41 +692,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 bool bpf_struct_ops_get(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
 
-	return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
-	struct bpf_struct_ops_map *st_map;
-
-	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
-	bpf_map_put(&st_map->map);
+	map = __bpf_map_inc_not_zero(&st_map->map, false);
+	return !IS_ERR(map);
 }
 
 void bpf_struct_ops_put(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
-	if (refcount_dec_and_test(&kvalue->refcnt)) {
-		struct bpf_struct_ops_map *st_map;
-
-		st_map = container_of(kvalue, struct bpf_struct_ops_map,
-				      kvalue);
-		/* The struct_ops's function may switch to another struct_ops.
-		 *
-		 * For example, bpf_tcp_cc_x->init() may switch to
-		 * another tcp_cc_y by calling
-		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-		 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
-		 * and its map->refcnt may reach 0 which then free its
-		 * trampoline image while tcp_cc_x is still running.
-		 *
-		 * Thus, a rcu grace period is needed here.
-		 */
-		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
-	}
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	bpf_map_put(&st_map->map);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f406dfa13792..03273cddd6bd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1288,7 +1288,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 }
 
 /* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
 {
 	int refold;
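For completeness, here is how the reported value might be consumed from user
space. This is a hypothetical sketch, not part of the patch: it assumes a
libbpf build environment, a struct_ops map fd and value size obtained
elsewhere (e.g. via bpf_obj_get() and bpf_obj_get_info_by_fd()), and that the
map value begins with the refcnt and state fields of bpf_struct_ops_value.
The count is an introspection snapshot, not an exact figure.

```c
#include <stdio.h>
#include <stdlib.h>
#include <bpf/bpf.h>

/* Mirrors the common header of bpf_struct_ops_value (an assumption
 * about layout; refcount_t and the state enum are both 32-bit here). */
struct struct_ops_common_value {
	unsigned int refcnt;
	unsigned int state;
};

static void dump_struct_ops_refcnt(int map_fd, unsigned int value_size)
{
	unsigned int key = 0; /* struct_ops maps hold a single element */
	struct struct_ops_common_value *common;
	void *value = calloc(1, value_size);

	if (!value)
		return;
	if (!bpf_map_lookup_elem(map_fd, &key, value)) {
		common = value;
		printf("state=%u approx refcnt=%u\n",
		       common->state, common->refcnt);
	}
	free(value);
}
```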
We have replaced kvalue->refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintaining kvalue->refcnt was complicated because we had to keep track
of two reference counts at once: kvalue->refcnt itself and the refcount
of the bpf_map. When kvalue->refcnt reached zero, we also had to drop
the reference count on the bpf_map, yet these steps were not performed
atomically and required care to manage correctly. By eliminating
kvalue->refcnt, the refcount of the bpf_map becomes the only one to
maintain, which is much more straightforward.

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period instead.
setsockopt(TCP_CONGESTION, "...") changes a socket's congestion control
algorithm and can result in releasing the old struct_ops
implementation. Moreover, since this function is exposed through
bpf_setsockopt(), it may also be called from BPF programs. To ensure
that a struct_ops trampoline image can be called safely while one of
its methods is in use, struct_ops callers are protected with
rcu_read_lock(). This prevents the image from being destroyed before
the trampoline returns, which is why freeing it must wait for an RCU
grace period.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
 include/linux/bpf.h         |  2 ++
 kernel/bpf/bpf_struct_ops.c | 60 ++++++++++++++++++-------------------
 kernel/bpf/syscall.c        |  2 +-
 3 files changed, 32 insertions(+), 32 deletions(-)
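To make the last paragraph concrete, here is a simplified caller-side sketch.
It is not actual kernel code: attach_bpf_cc()/detach_bpf_cc() and the way ops
is obtained are hypothetical stand-ins, while bpf_struct_ops_get(),
bpf_struct_ops_put() and the RCU rules follow the patch above.

```c
/* Holding rcu_read_lock() across the call means the synchronize_rcu()
 * in bpf_struct_ops_map_free() cannot return, so the trampoline image
 * behind ops->init cannot be freed while it is running. */
static int attach_bpf_cc(struct sock *sk,
			 const struct tcp_congestion_ops *ops)
{
	int err = 0;

	rcu_read_lock();
	if (bpf_struct_ops_get(ops))	/* map->refcnt++, fails if dying */
		ops->init(sk);		/* runs through the trampoline image */
	else
		err = -EBUSY;
	rcu_read_unlock();
	return err;
}

static void detach_bpf_cc(const struct tcp_congestion_ops *ops)
{
	/* Drops the hold taken above; if this was the last map->refcnt,
	 * bpf_struct_ops_map_free() runs, and its synchronize_rcu()
	 * guarantees no CPU is still inside the trampoline. */
	bpf_struct_ops_put(ops);
}
```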