@@ -1438,6 +1438,11 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
+bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
+ int exclude_ifindex);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, struct bpf_map *ex_map,
+ u32 flags);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog);
bool dev_map_can_have_prog(struct bpf_map *map);
@@ -1606,6 +1611,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return 0;
}
+/* Stub variant of dev_in_exclude_map(): never reports a device as excluded.
+ * NOTE(review): presumably the fallback used when devmap support is compiled
+ * out - confirm against the enclosing #ifdef, which is outside this hunk.
+ */
+static inline
+bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
+ int exclude_ifindex)
+{
+ return false;
+}
+
+/* Stub variant of dev_map_enqueue_multi(): does nothing and returns 0.
+ * NOTE(review): presumably the fallback used when devmap support is compiled
+ * out - confirm against the enclosing #ifdef, which is outside this hunk.
+ */
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, struct bpf_map *ex_map,
+ u32 flags)
+{
+ return 0;
+}
+
struct sk_buff;
static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -647,6 +647,7 @@ struct bpf_redirect_info {
u32 tgt_index;
void *tgt_value;
struct bpf_map *map;
+ struct bpf_map *ex_map;
u32 kern_flags;
struct bpf_nh_params nh;
};
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
@@ -3910,6 +3910,27 @@ union bpf_attr {
* * **BPF_MTU_CHK_RET_FRAG_NEEDED**
* * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
*
+ * long bpf_redirect_map_multi(struct bpf_map *map, struct bpf_map *ex_map, u64 flags)
+ * Description
+ * This is a multicast implementation for XDP redirect. It will
+ * redirect the packet to ALL the interfaces in *map*, but
+ * exclude the interfaces in *ex_map*.
+ *
+ * The forwarding *map* could be either BPF_MAP_TYPE_DEVMAP or
+ * BPF_MAP_TYPE_DEVMAP_HASH. To get better performance, the
+ * *ex_map* is limited to BPF_MAP_TYPE_DEVMAP_HASH and must be
+ * keyed by ifindex for the helper to work.
+ *
+ * Currently the *flags* only supports *BPF_F_EXCLUDE_INGRESS*,
+ * which additionally excludes the current ingress device.
+ *
+ * See also bpf_redirect_map() as a unicast implementation,
+ * which supports redirecting packet to a specific ifindex
+ * in the map. As both helpers use struct bpf_redirect_info
+ * to store the redirect info, we will use a NULL tgt_value
+ * to distinguish multicast and unicast redirecting.
+ * Return
+ * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -4076,6 +4097,7 @@ union bpf_attr {
FN(ima_inode_hash), \
FN(sock_from_file), \
FN(check_mtu), \
+ FN(redirect_map_multi), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4252,6 +4274,11 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
+/* BPF_FUNC_redirect_map_multi flags. */
+enum {
+ /* Additionally exclude the current ingress device from the targets. */
+ BPF_F_EXCLUDE_INGRESS = (1ULL << 0),
+};
+
#define __bpf_md_ptr(type, name) \
union { \
type name; \
@@ -519,6 +519,133 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
}
+/* Fast-path key iteration: dispatch on the map type and call the devmap
+ * implementation directly rather than going through
+ * map->ops->map_get_next_key(). Any other map type yields -ENOENT.
+ */
+static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP)
+		return dev_map_get_next_key(map, key, next_key);
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+		return dev_map_hash_get_next_key(map, key, next_key);
+
+	return -ENOENT;
+}
+
+/* Tell whether the device behind @obj must be skipped.
+ *
+ * A device is excluded when its ifindex equals @exclude_ifindex, or when
+ * @map is non-NULL and a hash lookup of that ifindex in @map finds an entry.
+ */
+bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
+			int exclude_ifindex)
+{
+	int ifindex = obj->dev->ifindex;
+
+	if (ifindex == exclude_ifindex)
+		return true;
+
+	/* Without an exclude map only the ifindex comparison applies */
+	return map && __dev_map_hash_lookup_elem(map, ifindex) != NULL;
+}
+
+/* Walk @map starting after @key and return the next entry that may receive
+ * the frame, or NULL once the walk is over.
+ *
+ * @key may be NULL to start from the first element. When an entry is
+ * returned, *@next_key holds its key, so the caller can pass that value back
+ * as @key to continue the walk.
+ *
+ * An entry is skipped when the lookup finds nothing, when
+ * dev_in_exclude_map() reports it as excluded (@ex_map / @ex_ifindex), when
+ * its device has no ndo_xdp_xmit, or when xdp_ok_fwd_dev() rejects the frame
+ * length for that device.
+ */
+static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp, struct bpf_map *map,
+						   struct bpf_map *ex_map, u32 *key,
+						   u32 *next_key, int ex_ifindex)
+{
+	struct bpf_dtab_netdev *obj;
+	struct net_device *dev;
+	u32 index;
+	int err;
+
+	err = devmap_get_next_key(map, key, next_key);
+	if (err)
+		return NULL;
+
+	/* When using dev map hash, we could restart the hashtab traversal
+	 * in case the key has been updated/removed in the mean time.
+	 * So we may end up potentially looping due to traversal restarts
+	 * from first elem.
+	 *
+	 * Let's use map's max_entries to limit the loop number.
+	 */
+	for (index = 0; index < map->max_entries; index++) {
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_DEVMAP:
+			obj = __dev_map_lookup_elem(map, *next_key);
+			break;
+		case BPF_MAP_TYPE_DEVMAP_HASH:
+			obj = __dev_map_hash_lookup_elem(map, *next_key);
+			break;
+		default:
+			/* Never read obj uninitialized: treat an unexpected
+			 * map type as "no entry here".
+			 */
+			obj = NULL;
+			break;
+		}
+
+		if (!obj || dev_in_exclude_map(obj, ex_map, ex_ifindex))
+			goto find_next;
+
+		dev = obj->dev;
+
+		if (!dev->netdev_ops->ndo_xdp_xmit)
+			goto find_next;
+
+		err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+		if (unlikely(err))
+			goto find_next;
+
+		return obj;
+
+find_next:
+		/* Advance in place: next_key is both the cursor and the result */
+		err = devmap_get_next_key(map, next_key, next_key);
+		if (err)
+			break;
+	}
+
+	return NULL;
+}
+
+/* Multicast enqueue: queue the frame in @xdp to every eligible device in
+ * @map.
+ *
+ * Devices are picked with devmap_get_next_obj(), which skips entries found
+ * in @ex_map and, when BPF_F_EXCLUDE_INGRESS is set in @flags, the device
+ * @dev_rx the frame arrived on. Every target but the last receives a clone
+ * made by xdpf_clone(); the last target receives the original frame.
+ *
+ * Return: 0 once the frame has been consumed, -EOVERFLOW if the buffer
+ * cannot be converted to an xdp_frame, -ENOMEM if cloning fails.
+ *
+ * NOTE(review): the ownership handling below assumes the xdp_do_redirect()
+ * contract that the caller still owns (and releases) the buffer whenever an
+ * error is returned, and gives it up when 0 is returned - confirm against
+ * the driver callers.
+ */
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, struct bpf_map *ex_map,
+			  u32 flags)
+{
+	struct bpf_dtab_netdev *obj, *next_obj;
+	struct xdp_frame *xdpf, *nxdpf;
+	u32 key, next_key;
+	int ex_ifindex;
+
+	ex_ifindex = flags & BPF_F_EXCLUDE_INGRESS ? dev_rx->ifindex : 0;
+
+	/* Find first available obj */
+	obj = devmap_get_next_obj(xdp, map, ex_map, NULL, &key, ex_ifindex);
+
+	xdpf = xdp_convert_buff_to_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	if (!obj) {
+		/* No eligible target: we report success, so the frame must
+		 * be released here or nobody will ever free it.
+		 */
+		xdp_return_frame_rx_napi(xdpf);
+		return 0;
+	}
+
+	for (;;) {
+		/* Check if we still have one more available obj */
+		next_obj = devmap_get_next_obj(xdp, map, ex_map, &key,
+					       &next_key, ex_ifindex);
+		if (!next_obj) {
+			/* Last target consumes the original frame */
+			bq_enqueue(obj->dev, xdpf, dev_rx, obj->xdp_prog);
+			return 0;
+		}
+
+		nxdpf = xdpf_clone(xdpf);
+		if (unlikely(!nxdpf)) {
+			/* Do not free xdpf here: returning an error hands
+			 * the original buffer back to the caller, which
+			 * releases it. Freeing it as well would be a double
+			 * free.
+			 */
+			return -ENOMEM;
+		}
+
+		bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+		/* Deal with next obj */
+		obj = next_obj;
+		key = next_key;
+	}
+}
+
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
@@ -4884,6 +4884,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_redirect_map_multi &&
func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
@@ -4988,6 +4989,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
+ case BPF_FUNC_redirect_map_multi:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)
+ goto error;
+ break;
case BPF_FUNC_sk_redirect_map:
case BPF_FUNC_msg_redirect_map:
case BPF_FUNC_sock_map_update:
@@ -3920,12 +3920,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
};
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map, struct xdp_buff *xdp)
+ struct bpf_map *map, struct xdp_buff *xdp,
+ struct bpf_map *ex_map, u32 flags)
{
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
- return dev_map_enqueue(fwd, xdp, dev_rx);
+ /* We use a NULL fwd value to distinguish multicast
+ * and unicast forwarding
+ */
+ if (fwd)
+ return dev_map_enqueue(fwd, xdp, dev_rx);
+ else
+ return dev_map_enqueue_multi(xdp, dev_rx, map, ex_map, flags);
case BPF_MAP_TYPE_CPUMAP:
return cpu_map_enqueue(fwd, xdp, dev_rx);
case BPF_MAP_TYPE_XSKMAP:
@@ -3982,12 +3989,19 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
+ struct bpf_map *ex_map = ri->ex_map;
u32 index = ri->tgt_index;
void *fwd = ri->tgt_value;
int err;
+ /* The READ/WRITE_ONCE() is not needed for ex_map because the field
+ * is only read from or written to by the CPU owning the per-cpu
+ * pointer. Whereas the 'map' field is manipulated by remote CPUs
+ * in bpf_clear_redirect_map().
+ */
ri->tgt_index = 0;
ri->tgt_value = NULL;
+ ri->ex_map = NULL;
WRITE_ONCE(ri->map, NULL);
if (unlikely(!map)) {
@@ -3999,7 +4013,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
err = dev_xdp_enqueue(fwd, xdp, dev);
} else {
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ err = __bpf_tx_xdp_map(dev, fwd, map, xdp, ex_map, ri->flags);
}
if (unlikely(err))
@@ -4013,6 +4027,63 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
+/* Generic (skb based) multicast redirect: hand a clone of @skb to every
+ * device in @map that is not excluded.
+ *
+ * A device is excluded when dev_in_exclude_map() says so, i.e. it is listed
+ * in @ex_map or, with BPF_F_EXCLUDE_INGRESS set in @flags, it is the ingress
+ * device @dev.
+ *
+ * Return: 0 after the walk, in which case the original @skb has been
+ * consumed; the error from map_get_next_key() if the first key cannot be
+ * fetched; -ENOMEM if skb_clone() fails. On error @skb is left untouched.
+ */
+static int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+				  struct bpf_prog *xdp_prog,
+				  struct bpf_map *map, struct bpf_map *ex_map,
+				  u32 flags)
+
+{
+	struct bpf_dtab_netdev *dst;
+	struct sk_buff *nskb;
+	u32 key, next_key, index;
+	int ex_ifindex;
+	int err;
+
+	/* Get first key from forward map */
+	err = map->ops->map_get_next_key(map, NULL, &key);
+	if (err)
+		return err;
+
+	ex_ifindex = flags & BPF_F_EXCLUDE_INGRESS ? dev->ifindex : 0;
+
+	/* When using dev map hash, we could restart the hashtab traversal
+	 * in case the key has been updated/removed in the mean time.
+	 * So we may end up potentially looping due to traversal restarts
+	 * from first elem.
+	 *
+	 * Let's use map's max_entries to limit the loop number.
+	 */
+	for (index = 0; index < map->max_entries; index++) {
+		dst = __xdp_map_lookup_elem(map, key);
+		if (!dst || dev_in_exclude_map(dst, ex_map, ex_ifindex))
+			goto find_next;
+
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb)
+			return -ENOMEM;
+
+		/* Try to forward to the next device no matter whether the
+		 * current forward succeeds or not, but do not leak the clone
+		 * when it fails.
+		 * NOTE(review): assumes dev_map_generic_redirect() leaves the
+		 * skb with its caller on error, as the unicast path relies
+		 * on - confirm.
+		 */
+		if (unlikely(dev_map_generic_redirect(dst, nskb, xdp_prog)))
+			kfree_skb(nskb);
+
+find_next:
+		err = map->ops->map_get_next_key(map, &key, &next_key);
+		if (err)
+			break;
+
+		key = next_key;
+	}
+
+	/* Every target got its own clone; the original is no longer needed */
+	consume_skb(skb);
+	return 0;
+}
+
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
@@ -4020,19 +4091,30 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct bpf_map *map)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *ex_map = ri->ex_map;
u32 index = ri->tgt_index;
void *fwd = ri->tgt_value;
int err = 0;
ri->tgt_index = 0;
ri->tgt_value = NULL;
+ ri->ex_map = NULL;
WRITE_ONCE(ri->map, NULL);
if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- struct bpf_dtab_netdev *dst = fwd;
+ /* We use a NULL fwd value to distinguish multicast
+ * and unicast forwarding
+ */
+ if (fwd) {
+ struct bpf_dtab_netdev *dst = fwd;
+
+ err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ } else {
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ ex_map, ri->flags);
+ }
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
if (unlikely(err))
goto err;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
@@ -4146,6 +4228,36 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.arg3_type = ARG_ANYTHING,
};
+/* bpf_redirect_map_multi() helper for XDP programs.
+ *
+ * Records a multicast redirect request in this CPU's bpf_redirect_info:
+ * the forwarding @map, the optional exclude map @ex_map and @flags.
+ * Returns XDP_ABORTED when @flags carries anything other than
+ * BPF_F_EXCLUDE_INGRESS or when @ex_map is given but is not a
+ * BPF_MAP_TYPE_DEVMAP_HASH; XDP_REDIRECT otherwise.
+ */
+BPF_CALL_3(bpf_xdp_redirect_map_multi, struct bpf_map *, map,
+	   struct bpf_map *, ex_map, u64, flags)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	/* Only BPF_F_EXCLUDE_INGRESS is understood */
+	if (unlikely(flags & ~BPF_F_EXCLUDE_INGRESS))
+		return XDP_ABORTED;
+
+	/* Limit ex_map type to DEVMAP_HASH to get better performance */
+	if (unlikely(ex_map && ex_map->map_type != BPF_MAP_TYPE_DEVMAP_HASH))
+		return XDP_ABORTED;
+
+	ri->flags = flags;
+	ri->ex_map = ex_map;
+	ri->tgt_index = 0;
+	/* A NULL tgt_value is what tells this request apart from one made
+	 * by bpf_xdp_redirect_map
+	 */
+	ri->tgt_value = NULL;
+
+	WRITE_ONCE(ri->map, map);
+
+	return XDP_REDIRECT;
+}
+
+/* Helper prototype for bpf_xdp_redirect_map_multi(): returns an integer,
+ * takes the forwarding map (arg1), an exclude map that may be NULL (arg2)
+ * and a flags word that is only checked by the helper itself (arg3).
+ */
+static const struct bpf_func_proto bpf_xdp_redirect_map_multi_proto = {
+ .func = bpf_xdp_redirect_map_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_CONST_MAP_PTR_OR_NULL,
+ .arg3_type = ARG_ANYTHING,
+};
+
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
unsigned long off, unsigned long len)
{
@@ -7397,6 +7509,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
return &bpf_xdp_redirect_map_proto;
+ case BPF_FUNC_redirect_map_multi:
+ return &bpf_xdp_redirect_map_multi_proto;
case BPF_FUNC_xdp_adjust_tail:
return &bpf_xdp_adjust_tail_proto;
case BPF_FUNC_fib_lookup:
@@ -583,3 +583,32 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+/* Duplicate @xdpf into a freshly allocated page.
+ *
+ * Copies sizeof(*xdpf) + headroom + len bytes starting at @xdpf, then fixes
+ * up the copy so that ->data points into the new page, ->frame_sz is
+ * PAGE_SIZE and the memory model is MEM_TYPE_PAGE_ORDER0 with id 0.
+ * NOTE(review): this presumes the frame header, headroom and packet data
+ * are laid out contiguously from @xdpf onwards - confirm.
+ *
+ * Returns the clone, or NULL if the span does not fit in one page or no
+ * page could be allocated.
+ */
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+	unsigned int data_off = xdpf->headroom + sizeof(*xdpf);
+	unsigned int copy_len = data_off + xdpf->len;
+	struct xdp_frame *clone;
+	struct page *page;
+
+	if (unlikely(copy_len > PAGE_SIZE))
+		return NULL;
+
+	page = dev_alloc_page();
+	if (!page)
+		return NULL;
+
+	clone = page_to_virt(page);
+	memcpy(clone, xdpf, copy_len);
+
+	/* Re-anchor the copied header on the new page */
+	clone->data = (void *)clone + data_off;
+	clone->frame_sz = PAGE_SIZE;
+	clone->mem.type = MEM_TYPE_PAGE_ORDER0;
+	clone->mem.id = 0;
+
+	return clone;
+}
+EXPORT_SYMBOL_GPL(xdpf_clone);
@@ -3910,6 +3910,27 @@ union bpf_attr {
* * **BPF_MTU_CHK_RET_FRAG_NEEDED**
* * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
*
+ * long bpf_redirect_map_multi(struct bpf_map *map, struct bpf_map *ex_map, u64 flags)
+ * Description
+ * This is a multicast implementation for XDP redirect. It will
+ * redirect the packet to ALL the interfaces in *map*, but
+ * exclude the interfaces in *ex_map*.
+ *
+ * The forwarding *map* could be either BPF_MAP_TYPE_DEVMAP or
+ * BPF_MAP_TYPE_DEVMAP_HASH. To get better performance, the
+ * *ex_map* is limited to BPF_MAP_TYPE_DEVMAP_HASH and must be
+ * keyed by ifindex for the helper to work.
+ *
+ * Currently the *flags* only supports *BPF_F_EXCLUDE_INGRESS*,
+ * which additionally excludes the current ingress device.
+ *
+ * See also bpf_redirect_map() as a unicast implementation,
+ * which supports redirecting packet to a specific ifindex
+ * in the map. As both helpers use struct bpf_redirect_info
+ * to store the redirect info, we will use a NULL tgt_value
+ * to distinguish multicast and unicast redirecting.
+ * Return
+ * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -4076,6 +4097,7 @@ union bpf_attr {
FN(ima_inode_hash), \
FN(sock_from_file), \
FN(check_mtu), \
+ FN(redirect_map_multi), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4252,6 +4274,11 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
+/* BPF_FUNC_redirect_map_multi flags. */
+enum {
+ /* Additionally exclude the current ingress device from the targets. */
+ BPF_F_EXCLUDE_INGRESS = (1ULL << 0),
+};
+
#define __bpf_md_ptr(type, name) \
union { \
type name; \