| Message ID | 20210325092733.3058653-3-liuhangbin@gmail.com (mailing list archive) |
|---|---|
| State | Superseded |
| Delegated to: | BPF |
| Series | xdp: extend xdp_redirect_map with broadcast support |
Hangbin Liu <liuhangbin@gmail.com> writes: > This patch add two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to extend > xdp_redirect_map for broadcast support. > > Keep the general data path in net/core/filter.c and the native data > path in kernel/bpf/devmap.c so we can use direct calls to get better > performace. > > Here is the performance result by using xdp_redirect_{map, map_multi} in > sample/bpf and send pkts via pktgen cmd: > ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > There are some drop back as we need to loop the map and get each interface. > > Version | Test | Generic | Native > 5.12 rc2 | redirect_map i40e->i40e | 2.0M | 9.8M > 5.12 rc2 | redirect_map i40e->veth | 1.8M | 12.0M > 5.12 rc2 + patch | redirect_map i40e->i40e | 2.0M | 9.6M > 5.12 rc2 + patch | redirect_map i40e->veth | 1.7M | 12.0M > 5.12 rc2 + patch | redirect_map multi i40e->i40e | 1.6M | 7.8M > 5.12 rc2 + patch | redirect_map multi i40e->veth | 1.4M | 9.3M > 5.12 rc2 + patch | redirect_map multi i40e->mlx4+veth | 1.0M | 3.4M > > Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> > --- > v3: > a) Rebase the code on Björn's "bpf, xdp: Restructure redirect actions". > - Add struct bpf_map *map back to struct bpf_redirect_info as we need > it for multicast. > - Add bpf_clear_redirect_map() back for devmap.c > - Add devmap_lookup_elem() as we need it in general path. > b) remove tmp_key in devmap_get_next_obj() > > v2: Fix flag renaming issue in v1 > --- > include/linux/bpf.h | 22 ++++++ > include/linux/filter.h | 14 +++- > include/net/xdp.h | 1 + > include/uapi/linux/bpf.h | 17 ++++- > kernel/bpf/devmap.c | 127 +++++++++++++++++++++++++++++++++ > net/core/filter.c | 92 +++++++++++++++++++++++- > net/core/xdp.c | 29 ++++++++ > tools/include/uapi/linux/bpf.h | 17 ++++- > 8 files changed, 310 insertions(+), 9 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index a25730eaa148..5dacb1a45a03 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1456,11 +1456,15 @@ struct sk_buff; > struct bpf_dtab_netdev; > struct bpf_cpu_map_entry; > > +struct bpf_dtab_netdev *devmap_lookup_elem(struct bpf_map *map, u32 key); > void __dev_flush(void); > int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > struct net_device *dev_rx); > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > struct net_device *dev_rx); > +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex); > +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, > + struct bpf_map *map, bool exclude_ingress); > int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, > struct bpf_prog *xdp_prog); > bool dev_map_can_have_prog(struct bpf_map *map); > @@ -1595,6 +1599,11 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) > return -EOPNOTSUPP; > } > > +static inline struct net_device *devmap_lookup_elem(struct bpf_map *map, u32 key) > +{ > + return NULL; > +} > + > static inline bool dev_map_can_have_prog(struct bpf_map *map) > { > return false; > @@ -1622,6 +1631,19 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > return 0; > } > > +static inline > +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex) > +{ > + return false; > +} > + > +static inline > +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, > + struct bpf_map *map, bool exclude_ingress) > +{ > + return 0; > +} > + > struct sk_buff; > > static inline 
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, > diff --git a/include/linux/filter.h b/include/linux/filter.h > index b2b85b2cad8e..434170aafd0d 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -646,6 +646,7 @@ struct bpf_redirect_info { > u32 flags; > u32 tgt_index; > void *tgt_value; > + struct bpf_map *map; > u32 map_id; > enum bpf_map_type map_type; > u32 kern_flags; > @@ -1479,11 +1480,11 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > > /* Lower bits of the flags are used as return code on lookup failure */ > - if (unlikely(flags > XDP_TX)) > + if (unlikely(flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK))) > return XDP_ABORTED; > > ri->tgt_value = lookup_elem(map, ifindex); > - if (unlikely(!ri->tgt_value)) { > + if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { > /* If the lookup fails we want to clear out the state in the > * redirect_info struct completely, so that if an eBPF program > * performs multiple lookups, the last one always takes > @@ -1491,13 +1492,20 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind > */ > ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ > ri->map_type = BPF_MAP_TYPE_UNSPEC; > - return flags; > + return flags & BPF_F_ACTION_MASK; > } > > ri->tgt_index = ifindex; > ri->map_id = map->id; > ri->map_type = map->map_type; > > + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || > + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && > + (flags & BPF_F_BROADCAST)) { > + ri->flags = flags; This, combined with this: [...] > + if (ri->flags & BPF_F_BROADCAST) { > + map = READ_ONCE(ri->map); > + WRITE_ONCE(ri->map, NULL); > + } > + > switch (map_type) { > case BPF_MAP_TYPE_DEVMAP: > fallthrough; > case BPF_MAP_TYPE_DEVMAP_HASH: > - err = dev_map_enqueue(fwd, xdp, dev); > + if (ri->flags & BPF_F_BROADCAST) > + err = dev_map_enqueue_multi(xdp, dev, map, > + ri->flags & BPF_F_EXCLUDE_INGRESS); > + else > + err = dev_map_enqueue(fwd, xdp, dev); Means that (since the flags value is never cleared again) once someone has done a broadcast redirect, that's all they'll ever get until the next reboot ;) Also, the bpf_clear_redirect_map() call has no effect since the call to dev_map_enqueue_multi() only checks the flags and not the value of the map pointer before deciding which enqueue function to call. To fix both of these, how about changing the logic so that: - __bpf_xdp_redirect_map() sets the map pointer if the broadcast flag is set (and clears it if the flag isn't set!) - xdp_do_redirect() does the READ_ONCE/WRITE_ONCE on ri->map unconditionally and then dispatches to dev_map_enqueue_multi() if the read resulted in a non-NULL pointer Also, it should be invalid to set the broadcast flag with a map type other than a devmap; __bpf_xdp_redirect_map() should check that. And finally, with the changes above, you no longer need the broadcast flag in do_redirect() at all, so you could just have a bool ri->exclude_ingress that is set in the helper and pass that directly to dev_map_enqueue_multi(). A selftest that validates that the above works as it's supposed to might be nice as well (i.e., one that broadcasts and does a regular redirect after one another) -Toke
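Spelled out, the control flow Toke is suggesting would look roughly like the sketch below. This is only a sketch of the review feedback, not code from the patch: the ri->exclude_ingress bool is the hypothetical replacement for the saved flags field that Toke mentions, and the other map-type cases are elided.

```c
/*
 * Sketch of the suggested logic (not the actual patch).
 *
 * 1) In __bpf_xdp_redirect_map(): keep ri->map in sync with the broadcast
 *    flag, so that bpf_clear_redirect_map() can invalidate a stale pointer
 *    and a non-broadcast redirect clears any previously stored map.
 */
	ri->tgt_index = ifindex;
	ri->map_id = map->id;
	ri->map_type = map->map_type;

	if (flags & BPF_F_BROADCAST) {
		ri->exclude_ingress = !!(flags & BPF_F_EXCLUDE_INGRESS);
		WRITE_ONCE(ri->map, map);
	} else {
		WRITE_ONCE(ri->map, NULL);
	}

	return XDP_REDIRECT;

/*
 * 2) In xdp_do_redirect(): read-and-clear ri->map unconditionally and
 *    dispatch on whether the read returned a non-NULL pointer.
 */
	map = READ_ONCE(ri->map);
	WRITE_ONCE(ri->map, NULL);

	switch (map_type) {
	case BPF_MAP_TYPE_DEVMAP:
		fallthrough;
	case BPF_MAP_TYPE_DEVMAP_HASH:
		if (map)
			err = dev_map_enqueue_multi(xdp, dev, map,
						    ri->exclude_ingress);
		else
			err = dev_map_enqueue(fwd, xdp, dev);
		break;
	/* ... remaining cases unchanged ... */
	}
```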
On Wed, Mar 31, 2021 at 03:41:17PM +0200, Toke Høiland-Jørgensen wrote: > > @@ -1491,13 +1492,20 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind > > */ > > ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ > > ri->map_type = BPF_MAP_TYPE_UNSPEC; > > - return flags; > > + return flags & BPF_F_ACTION_MASK; > > } > > > > ri->tgt_index = ifindex; > > ri->map_id = map->id; > > ri->map_type = map->map_type; > > > > + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || > > + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && > > + (flags & BPF_F_BROADCAST)) { > > + ri->flags = flags; > > This, combined with this: > > [...] > > > + if (ri->flags & BPF_F_BROADCAST) { > > + map = READ_ONCE(ri->map); > > + WRITE_ONCE(ri->map, NULL); > > + } > > + > > switch (map_type) { > > case BPF_MAP_TYPE_DEVMAP: > > fallthrough; > > case BPF_MAP_TYPE_DEVMAP_HASH: > > - err = dev_map_enqueue(fwd, xdp, dev); > > + if (ri->flags & BPF_F_BROADCAST) > > + err = dev_map_enqueue_multi(xdp, dev, map, > > + ri->flags & BPF_F_EXCLUDE_INGRESS); > > + else > > + err = dev_map_enqueue(fwd, xdp, dev); > > Means that (since the flags value is never cleared again) once someone > has done a broadcast redirect, that's all they'll ever get until the > next reboot ;) How about just get the ri->flags first and clean it directly. e.g. flags = ri->flags; ri->flags = 0; With this we don't need to add an extra field ri->exclude_ingress as you mentioned later. People may also need the flags field in future. > > Also, the bpf_clear_redirect_map() call has no effect since the call to > dev_map_enqueue_multi() only checks the flags and not the value of the > map pointer before deciding which enqueue function to call. > > To fix both of these, how about changing the logic so that: > > - __bpf_xdp_redirect_map() sets the map pointer if the broadcast flag is > set (and clears it if the flag isn't set!) OK > > - xdp_do_redirect() does the READ_ONCE/WRITE_ONCE on ri->map > unconditionally and then dispatches to dev_map_enqueue_multi() if the > read resulted in a non-NULL pointer > > Also, it should be invalid to set the broadcast flag with a map type > other than a devmap; __bpf_xdp_redirect_map() should check that. The current code only do if (unlikely(flags > XDP_TX)) and didn't check the map type. I also only set the map when there has devmap + broadcast flag. Do you mean we need a more strict check? like if (unlikely((flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK)) || (map->map_type != BPF_MAP_TYPE_DEVMAP && map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && flags & BPF_F_REDIR_MASK))) Thanks Hangbin > > And finally, with the changes above, you no longer need the broadcast > flag in do_redirect() at all, so you could just have a bool > ri->exclude_ingress that is set in the helper and pass that directly to > dev_map_enqueue_multi(). > > A selftest that validates that the above works as it's supposed to might > be nice as well (i.e., one that broadcasts and does a regular redirect > after one another) > > -Toke >
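Reformatted for readability, the stricter check Hangbin is asking about would be something like the sketch below; it mirrors the condition quoted in the mail, and whether the map-type test is needed at all is exactly what the follow-up discusses.

```c
/* Sketch of the stricter validation in __bpf_xdp_redirect_map():
 * reject unknown flag bits, and reject the broadcast/exclude-ingress
 * bits when the map is not a devmap.
 */
	if (unlikely((flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK)) ||
		     ((flags & BPF_F_REDIR_MASK) &&
		      map->map_type != BPF_MAP_TYPE_DEVMAP &&
		      map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)))
		return XDP_ABORTED;
```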
Hangbin Liu <liuhangbin@gmail.com> writes: > On Wed, Mar 31, 2021 at 03:41:17PM +0200, Toke Høiland-Jørgensen wrote: >> > @@ -1491,13 +1492,20 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind >> > */ >> > ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ >> > ri->map_type = BPF_MAP_TYPE_UNSPEC; >> > - return flags; >> > + return flags & BPF_F_ACTION_MASK; >> > } >> > >> > ri->tgt_index = ifindex; >> > ri->map_id = map->id; >> > ri->map_type = map->map_type; >> > >> > + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || >> > + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && >> > + (flags & BPF_F_BROADCAST)) { >> > + ri->flags = flags; >> >> This, combined with this: >> >> [...] >> >> > + if (ri->flags & BPF_F_BROADCAST) { >> > + map = READ_ONCE(ri->map); >> > + WRITE_ONCE(ri->map, NULL); >> > + } >> > + >> > switch (map_type) { >> > case BPF_MAP_TYPE_DEVMAP: >> > fallthrough; >> > case BPF_MAP_TYPE_DEVMAP_HASH: >> > - err = dev_map_enqueue(fwd, xdp, dev); >> > + if (ri->flags & BPF_F_BROADCAST) >> > + err = dev_map_enqueue_multi(xdp, dev, map, >> > + ri->flags & BPF_F_EXCLUDE_INGRESS); >> > + else >> > + err = dev_map_enqueue(fwd, xdp, dev); >> >> Means that (since the flags value is never cleared again) once someone >> has done a broadcast redirect, that's all they'll ever get until the >> next reboot ;) > > How about just get the ri->flags first and clean it directly. e.g. > > flags = ri->flags; > ri->flags = 0; That would fix the "until next reboot" issue, but would still render bpf_clear_redirect_map() useless. So you still need to check ri->map and if you do that you don't actually need to clear the flag field as long as it is set correctly whenever the map pointer is. > With this we don't need to add an extra field ri->exclude_ingress as you > mentioned later. The ri->exclude_ingress would be *instead* of the flags field. You could of course also just keep the flags field, but turning it into a bool makes it obvious that only one of the bits is actually used (and thus easier to see that it's correct to not clear it). > People may also need the flags field in future. In which case they can add it back at that time :) >> Also, the bpf_clear_redirect_map() call has no effect since the call to >> dev_map_enqueue_multi() only checks the flags and not the value of the >> map pointer before deciding which enqueue function to call. >> >> To fix both of these, how about changing the logic so that: >> >> - __bpf_xdp_redirect_map() sets the map pointer if the broadcast flag is >> set (and clears it if the flag isn't set!) > > OK >> >> - xdp_do_redirect() does the READ_ONCE/WRITE_ONCE on ri->map >> unconditionally and then dispatches to dev_map_enqueue_multi() if the >> read resulted in a non-NULL pointer >> >> Also, it should be invalid to set the broadcast flag with a map type >> other than a devmap; __bpf_xdp_redirect_map() should check that. > > The current code only do if (unlikely(flags > XDP_TX)) and didn't check the > map type. I also only set the map when there has devmap + broadcast flag. > Do you mean we need a more strict check? like > > if (unlikely((flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK)) || > (map->map_type != BPF_MAP_TYPE_DEVMAP && > map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && > flags & BPF_F_REDIR_MASK))) Yeah, that's what I meant, but when you type it out that does seem like a bit too many checks. 
However, I think we can do something different: Since Björn has helpfully split out the helper functions for the different map types, we can add another argument to __bpf_xdp_redirect_map() which is the mask of valid flag values. With this, dev_{hash_,}map_redirect() can include BPF_F_REDIR_MASK in the valid flags, and {xsk,cpu}_map_redirect() can leave them out. That makes the code do the right thing without actually adding any more checks in the fast path :) (You'd still need to AND the return value with BPF_F_ACTION_MASK when returning, of course). -Toke
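Made concrete, the mask-argument idea might look like the sketch below. The extra flag_mask parameter is hypothetical; the wrapper names follow the dev_map_redirect()/cpu_map_redirect() helpers Toke refers to, and the lookup callbacks are assumed to be the existing __dev_map_lookup_elem()/__cpu_map_lookup_elem() functions.

```c
/* Sketch: each map type passes the extended flag bits it accepts, so
 * invalid combinations are rejected without extra fast-path checks.
 */
static __always_inline int
__bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
		       u64 flag_mask,
		       void *lookup_elem(struct bpf_map *map, u32 key))
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

	/* The lower bits are still the return code on lookup failure (and
	 * must be ANDed with BPF_F_ACTION_MASK on that path); any other bit
	 * outside this map type's mask is invalid.
	 */
	if (unlikely(flags & ~(BPF_F_ACTION_MASK | flag_mask)))
		return XDP_ABORTED;

	/* ... lookup, ri-> setup and broadcast handling as in the patch ... */

	return XDP_REDIRECT;
}

/* devmaps accept the broadcast bits ... */
static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_REDIR_MASK,
				      __dev_map_lookup_elem);
}

/* ... while e.g. cpumaps do not, so BPF_F_BROADCAST is rejected there. */
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
				      __cpu_map_lookup_elem);
}
```

In effect this moves the per-map-type knowledge into the wrappers that already exist after the redirect-actions restructuring, so the shared helper stays branch-free on the packet path.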
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a25730eaa148..5dacb1a45a03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1456,11 +1456,15 @@ struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; +struct bpf_dtab_netdev *devmap_lookup_elem(struct bpf_map *map, u32 key); void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex); +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); bool dev_map_can_have_prog(struct bpf_map *map); @@ -1595,6 +1599,11 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline struct net_device *devmap_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} + static inline bool dev_map_can_have_prog(struct bpf_map *map) { return false; @@ -1622,6 +1631,19 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return 0; } +static inline +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex) +{ + return false; +} + +static inline +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) +{ + return 0; +} + struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, diff --git a/include/linux/filter.h b/include/linux/filter.h index b2b85b2cad8e..434170aafd0d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,6 +646,7 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; + struct bpf_map *map; u32 map_id; enum bpf_map_type map_type; u32 kern_flags; @@ -1479,11 +1480,11 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) + if (unlikely(flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { + if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes @@ -1491,13 +1492,20 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; - return flags; + return flags & BPF_F_ACTION_MASK; } ri->tgt_index = ifindex; ri->map_id = map->id; ri->map_type = map->map_type; + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + (flags & BPF_F_BROADCAST)) { + ri->flags = flags; + WRITE_ONCE(ri->map, map); + } + return XDP_REDIRECT; } diff --git a/include/net/xdp.h b/include/net/xdp.h index a5bc214a49d9..5533f0ab2afc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t 
gfp); +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2d3036e292a9..5982ceb217dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2508,8 +2508,12 @@ union bpf_attr { * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be * one of the XDP program return codes up to **XDP_TX**, as chosen - * by the caller. Any higher bits in the *flags* argument must be - * unset. + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. * * See also **bpf_redirect**\ (), which only supports redirecting * to an ifindex, but doesn't require a map to do so. @@ -5004,6 +5008,15 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX) +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) + #define __bpf_md_ptr(type, name) \ union { \ type name; \ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2add12a289c3..3e999d6c9283 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -451,6 +452,24 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return obj; } +struct bpf_dtab_netdev *devmap_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab_netdev *obj = NULL; + + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + obj = __dev_map_lookup_elem(map, key); + break; + case BPF_MAP_TYPE_DEVMAP_HASH: + obj = __dev_map_hash_lookup_elem(map, key); + break; + default: + break; + } + + return obj; +} + /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -515,6 +534,114 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } +/* Use direct call in fast path instead of map->ops->map_get_next_key() */ +static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + return dev_map_get_next_key(map, key, next_key); + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_hash_get_next_key(map, key, next_key); + default: + break; + } + + return -ENOENT; +} + +bool dst_dev_is_ingress(struct bpf_dtab_netdev *dst, int ifindex) +{ + return dst->dev->ifindex == ifindex; +} + +static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp, + struct bpf_map *map, + u32 *key, u32 *next_key, + int ex_ifindex) +{ + struct bpf_dtab_netdev *obj; + struct net_device *dev; + u32 index; + int err; + + err = devmap_get_next_key(map, key, next_key); + if (err) + return NULL; + + /* When using dev map hash, we could restart the hashtab traversal + * in case the key has been updated/removed in the mean time. 
+ * So we may end up potentially looping due to traversal restarts + * from first elem. + * + * Let's use map's max_entries to limit the loop number. + */ + for (index = 0; index < map->max_entries; index++) { + obj = devmap_lookup_elem(map, *next_key); + if (!obj || dst_dev_is_ingress(obj, ex_ifindex)) + goto find_next; + + dev = obj->dev; + + if (!dev->netdev_ops->ndo_xdp_xmit) + goto find_next; + + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + goto find_next; + + return obj; + +find_next: + key = next_key; + err = devmap_get_next_key(map, key, next_key); + if (err) + break; + } + + return NULL; +} + +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) +{ + struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL; + struct xdp_frame *xdpf, *nxdpf; + u32 key, next_key; + int ex_ifindex; + + ex_ifindex = exclude_ingress ? dev_rx->ifindex : 0; + + /* Find first available obj */ + obj = devmap_get_next_obj(xdp, map, NULL, &key, ex_ifindex); + if (!obj) + return -ENOENT; + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + for (;;) { + /* Check if we still have one more available obj */ + next_obj = devmap_get_next_obj(xdp, map, &key, &next_key, ex_ifindex); + if (!next_obj) { + bq_enqueue(obj->dev, xdpf, dev_rx, obj->xdp_prog); + return 0; + } + + nxdpf = xdpf_clone(xdpf); + if (unlikely(!nxdpf)) { + xdp_return_frame_rx_napi(xdpf); + return -ENOMEM; + } + + bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); + + /* Deal with next obj */ + obj = next_obj; + key = next_key; + } +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { diff --git a/net/core/filter.c b/net/core/filter.c index 10dac9dd5086..6d4d037d70ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3926,6 +3926,23 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); +void bpf_clear_redirect_map(struct bpf_map *map) +{ + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. 
+ */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -3933,16 +3950,26 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (ri->flags & BPF_F_BROADCAST) { + map = READ_ONCE(ri->map); + WRITE_ONCE(ri->map, NULL); + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_enqueue(fwd, xdp, dev); + if (ri->flags & BPF_F_BROADCAST) + err = dev_map_enqueue_multi(xdp, dev, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + else + err = dev_map_enqueue(fwd, xdp, dev); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdp, dev); @@ -3976,6 +4003,57 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, } EXPORT_SYMBOL_GPL(xdp_do_redirect); +static int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress) +{ + struct bpf_dtab_netdev *dst; + u32 key, next_key, index; + struct sk_buff *nskb; + void *fwd; + int err; + + err = map->ops->map_get_next_key(map, NULL, &key); + if (err) + return err; + + /* When using dev map hash, we could restart the hashtab traversal + * in case the key has been updated/removed in the mean time. + * So we may end up potentially looping due to traversal restarts + * from first elem. + * + * Let's use map's max_entries to limit the loop number. + */ + + for (index = 0; index < map->max_entries; index++) { + fwd = devmap_lookup_elem(map, key); + if (fwd) { + dst = (struct bpf_dtab_netdev *)fwd; + if (dst_dev_is_ingress(dst, exclude_ingress ? dev->ifindex : 0)) + goto find_next; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + /* Try forword next one no mater the current forward + * succeed or not. 
+ */ + dev_map_generic_redirect(dst, nskb, xdp_prog); + } + +find_next: + err = map->ops->map_get_next_key(map, &key, &next_key); + if (err) + break; + + key = next_key; + } + + consume_skb(skb); + return 0; +} + static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, @@ -3984,13 +4062,23 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map; int err; + if (ri->flags & BPF_F_BROADCAST) { + map = READ_ONCE(ri->map); + WRITE_ONCE(ri->map, NULL); + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_generic_redirect(fwd, skb, xdp_prog); + if (ri->flags & BPF_F_BROADCAST) + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + else + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; break; diff --git a/net/core/xdp.c b/net/core/xdp.c index 05354976c1fc..aba84d04642b 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -583,3 +583,32 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + nxdpf->mem.id = 0; + + return nxdpf; +} +EXPORT_SYMBOL_GPL(xdpf_clone); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d3036e292a9..5982ceb217dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2508,8 +2508,12 @@ union bpf_attr { * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be * one of the XDP program return codes up to **XDP_TX**, as chosen - * by the caller. Any higher bits in the *flags* argument must be - * unset. + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. * * See also **bpf_redirect**\ (), which only supports redirecting * to an ifindex, but doesn't require a map to do so. @@ -5004,6 +5008,15 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX) +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) + #define __bpf_md_ptr(type, name) \ union { \ type name; \
This patch adds two flags, BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS, to extend
xdp_redirect_map for broadcast support.

Keep the generic data path in net/core/filter.c and the native data path in
kernel/bpf/devmap.c so we can use direct calls to get better performance.

Here are the performance results obtained with xdp_redirect_{map, map_multi}
from samples/bpf, sending packets via pktgen:

./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64

There is some performance drop for the multicast case, as we need to loop over
the map and enqueue the frame to each interface.

Version          | Test                                | Generic | Native
5.12 rc2         | redirect_map i40e->i40e             | 2.0M    | 9.8M
5.12 rc2         | redirect_map i40e->veth             | 1.8M    | 12.0M
5.12 rc2 + patch | redirect_map i40e->i40e             | 2.0M    | 9.6M
5.12 rc2 + patch | redirect_map i40e->veth             | 1.7M    | 12.0M
5.12 rc2 + patch | redirect_map multi i40e->i40e       | 1.6M    | 7.8M
5.12 rc2 + patch | redirect_map multi i40e->veth       | 1.4M    | 9.3M
5.12 rc2 + patch | redirect_map multi i40e->mlx4+veth  | 1.0M    | 3.4M

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
v3:
a) Rebase the code on Björn's "bpf, xdp: Restructure redirect actions".
   - Add struct bpf_map *map back to struct bpf_redirect_info as we need
     it for multicast.
   - Add bpf_clear_redirect_map() back for devmap.c.
   - Add devmap_lookup_elem() as we need it in the generic path.
b) Remove tmp_key in devmap_get_next_obj().

v2: Fix the flag renaming issue in v1.
---
 include/linux/bpf.h            |  22 ++++++
 include/linux/filter.h         |  14 +++-
 include/net/xdp.h              |   1 +
 include/uapi/linux/bpf.h       |  17 ++++-
 kernel/bpf/devmap.c            | 127 +++++++++++++++++++++++++++++++++
 net/core/filter.c              |  92 +++++++++++++++++++++++-
 net/core/xdp.c                 |  29 ++++++++
 tools/include/uapi/linux/bpf.h |  17 ++++-
 8 files changed, 310 insertions(+), 9 deletions(-)
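As a usage illustration (not part of the patch), a minimal XDP program exercising the new flags could look like the sketch below. The map name, sizes, and program name are made up, and it assumes a linux/bpf.h that already carries the BPF_F_BROADCAST / BPF_F_EXCLUDE_INGRESS definitions added above; the xdp_redirect_map_multi sample used for the numbers in the table is the real example from the series.

```c
// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: broadcast every incoming frame to all interfaces in
 * a devmap, excluding the one it arrived on.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical devmap holding the egress interfaces to broadcast to. */
struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 32);
} forward_map SEC(".maps");

SEC("xdp")
int xdp_broadcast_prog(struct xdp_md *ctx)
{
	/* With BPF_F_BROADCAST the key is effectively ignored and the frame
	 * is cloned to every interface in forward_map; BPF_F_EXCLUDE_INGRESS
	 * additionally skips the ingress interface.
	 */
	return bpf_redirect_map(&forward_map, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";
```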