| Message ID | 20231121122800.13521-8-fw@strlen.de (mailing list archive) |
|---|---|
| State | Awaiting Upstream |
| Delegated to | Netdev Maintainers |
| Series | netfilter: make nf_flowtable lifetime differ from container struct |
> This adds a small internal mapping table so that a new bpf (xdp) kfunc
> can perform lookups in a flowtable.
>
> As-is, an xdp program has access to the device pointer, but no way to do
> a lookup in a flowtable -- there is no way to obtain the needed struct
> without questionable stunts.
>
> This allows obtaining an nf_flowtable pointer given a net_device
> structure.
>
> A device cannot be added to multiple flowtables, the mapping needs
> to be unique. This is enforced when a flowtable with the
> NF_FLOWTABLE_XDP_OFFLOAD flag was added.
>
> Exposure of this NF_FLOWTABLE_XDP_OFFLOAD in UAPI could be avoided,
> iff the 'net_device maps to 0 or 1 flowtable' paradigm is enforced
> regardless of offload-or-not flag.
>
> HOWEVER, that does break existing behaviour.
>
> An alternative would be to repurpose the hw offload flag by allowing
> XDP fallback when hw offload cannot be done due to lack of ndo
> callbacks.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> [...]

Tested-by: Lorenzo Bianconi <lorenzo@kernel.org>
Florian Westphal <fw@strlen.de> writes:

> A device cannot be added to multiple flowtables, the mapping needs
> to be unique. This is enforced when a flowtable with the
> NF_FLOWTABLE_XDP_OFFLOAD flag was added.
>
> Exposure of this NF_FLOWTABLE_XDP_OFFLOAD in UAPI could be avoided,
> iff the 'net_device maps to 0 or 1 flowtable' paradigm is enforced
> regardless of offload-or-not flag.
>
> HOWEVER, that does break existing behaviour.

I am not a huge fan of this flag, especially not as UAPI. Using the XDP
offload functionality is already an explicit opt-in by userspace (you
need to load the XDP program). So adding a second UAPI flag that you
have to set for the flowtable to be compatible with XDP seems to just
constrain things needlessly (and is bound to lead to bugs)?

If we can't change the behaviour, we could change the lookup mechanism?
BPF is pretty flexible, nothing says it has to use an ifindex as the
lookup key? The neatest thing would be to have some way for userspace to
directly populate a reference to the flowtable struct in a map, but a
simpler solution would be to just introduce an opaque ID for each
flowtable instance and use that as the lookup key (userspace could
trivially put that into a map for the BPF program to find)?

-Toke
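A minimal BPF-side sketch of the opaque-ID lookup suggested above (illustration only: the `ft_id_map` layout, the notion of a numeric flowtable ID, and any kfunc that would consume it are assumptions, not existing kernel API):

```c
/* Sketch of the opaque-ID scheme: userspace resolves each flowtable's ID
 * (e.g. via netlink) and stores it in a map keyed by ifindex before
 * attaching the program.  No such ID or kfunc exists in the tree; this
 * only illustrates what the lookup key could look like on the BPF side.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);   /* ifindex */
	__type(value, __u32); /* opaque flowtable ID, populated by userspace */
} ft_id_map SEC(".maps");

SEC("xdp")
int xdp_flowtable_fwd(struct xdp_md *ctx)
{
	__u32 ifindex = ctx->ingress_ifindex;
	__u32 *ft_id;

	ft_id = bpf_map_lookup_elem(&ft_id_map, &ifindex);
	if (!ft_id)
		return XDP_PASS;

	/* A (hypothetical) kfunc would take *ft_id instead of a netdev
	 * pointer and perform the flowtable lookup from here.
	 */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
```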
Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> I am not a huge fan of this flag, especially not as UAPI. Using the XDP
> offload functionality is already an explicit opt-in by userspace (you
> need to load the XDP program). So adding a second UAPI flag that you
> have to set for the flowtable to be compatible with XDP seems to just
> constrain things needlessly (and is bound to lead to bugs)?

I can remove it. But it leads to issues, for example one flowtable
can shadow another one.

I'd prefer to handle this from the control plane and reject such a config.
The alternative is to ignore this and handle it as "self sabotage, don't
care" combined with "do not do that, then".

> If we can't change the behaviour, we could change the lookup mechanism?
> BPF is pretty flexible, nothing says it has to use an ifindex as the
> lookup key? The neatest thing would be to have some way for userspace to
> directly populate a reference to the flowtable struct in a map, but a
> simpler solution would be to just introduce an opaque ID for each
> flowtable instance and use that as the lookup key (userspace could
> trivially put that into a map for the BPF program to find)?

Won't that complicate things? Userspace will have to use netlink
events to discover when a flowtable is removed, no?
Florian Westphal <fw@strlen.de> writes:

> Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> I am not a huge fan of this flag, especially not as UAPI. Using the XDP
>> offload functionality is already an explicit opt-in by userspace (you
>> need to load the XDP program). So adding a second UAPI flag that you
>> have to set for the flowtable to be compatible with XDP seems to just
>> constrain things needlessly (and is bound to lead to bugs)?
>
> I can remove it. But it leads to issues, for example one flowtable
> can shadow another one.
>
> I'd prefer to handle this from the control plane and reject such a config.
> The alternative is to ignore this and handle it as "self sabotage, don't
> care" combined with "do not do that, then".

I do see your point about avoiding invalid configurations, but, well,
XDP is already very much a "use it right or it will break on you" kind
of thing, so I think that bit is kinda unavoidable. As in, upon loading
the XDP program that does the lookup, you can validate the configuration
and reject loading if it's set up in a way that your program can't
support.

Whereas if you have to set a flag on the flowtable itself, that means
you have to make changes to the nft ruleset itself to be compatible with
XDP acceleration (right?), you can't just go "accelerate my existing
ruleset".

>> If we can't change the behaviour, we could change the lookup mechanism?
>> BPF is pretty flexible, nothing says it has to use an ifindex as the
>> lookup key? The neatest thing would be to have some way for userspace to
>> directly populate a reference to the flowtable struct in a map, but a
>> simpler solution would be to just introduce an opaque ID for each
>> flowtable instance and use that as the lookup key (userspace could
>> trivially put that into a map for the BPF program to find)?
>
> Won't that complicate things? Userspace will have to use netlink
> events to discover when a flowtable is removed, no?

Well, I am kinda assuming that userspace is the entity doing the
removing, in which case it should already know this, right? I must admit
to being a little fuzzy on the details of when a flowtable object is
replaced, though. For instance, does reloading an nft ruleset always
replace the flowtable with a new one (even if there's no change to the
flowtable config itself)?

-Toke
This adds a small internal mapping table so that a new bpf (xdp) kfunc
can perform lookups in a flowtable.

As-is, an xdp program has access to the device pointer, but no way to do
a lookup in a flowtable -- there is no way to obtain the needed struct
without questionable stunts.

This allows obtaining an nf_flowtable pointer given a net_device
structure.

A device cannot be added to multiple flowtables, the mapping needs
to be unique. This is enforced when a flowtable with the
NF_FLOWTABLE_XDP_OFFLOAD flag was added.

Exposure of this NF_FLOWTABLE_XDP_OFFLOAD in UAPI could be avoided,
iff the 'net_device maps to 0 or 1 flowtable' paradigm is enforced
regardless of offload-or-not flag.

HOWEVER, that does break existing behaviour.

An alternative would be to repurpose the hw offload flag by allowing
XDP fallback when hw offload cannot be done due to lack of ndo
callbacks.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_flow_table.h |   7 ++
 net/netfilter/nf_flow_table_offload.c | 131 +++++++++++++++++++++++++-
 net/netfilter/nf_tables_api.c         |   3 +-
 3 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 11985d9b8370..b8b7fcb98732 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -93,6 +93,11 @@ static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable)
 	return flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD;
 }
 
+static inline bool nf_flowtable_xdp_offload(struct nf_flowtable *flowtable)
+{
+	return flowtable->flags & NF_FLOWTABLE_XDP_OFFLOAD;
+}
+
 enum flow_offload_tuple_dir {
 	FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
 	FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
@@ -299,6 +304,8 @@ struct flow_ports {
 	__be16 source, dest;
 };
 
+struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev);
+
 unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 				     const struct nf_hook_state *state);
 unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index a010b25076ca..9ec7aa4ad2e5 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -17,6 +17,92 @@ static struct workqueue_struct *nf_flow_offload_add_wq;
 static struct workqueue_struct *nf_flow_offload_del_wq;
 static struct workqueue_struct *nf_flow_offload_stats_wq;
 
+struct flow_offload_xdp {
+	struct hlist_node hnode;
+
+	unsigned long net_device_addr;
+	struct nf_flowtable *ft;
+
+	struct rcu_head rcuhead;
+};
+
+#define NF_XDP_HT_BITS	4
+static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS);
+static DEFINE_MUTEX(nf_xdp_hashtable_lock);
+
+/* caller must hold rcu read lock */
+struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev)
+{
+	unsigned long key = (unsigned long)dev;
+	const struct flow_offload_xdp *cur;
+
+	hash_for_each_possible_rcu(nf_xdp_hashtable, cur, hnode, key) {
+		if (key == cur->net_device_addr)
+			return cur->ft;
+	}
+
+	return NULL;
+}
+
+static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft,
+				      const struct net_device *dev)
+{
+	unsigned long key = (unsigned long)dev;
+	struct flow_offload_xdp *cur;
+	int err = 0;
+
+	mutex_lock(&nf_xdp_hashtable_lock);
+	hash_for_each_possible(nf_xdp_hashtable, cur, hnode, key) {
+		if (key != cur->net_device_addr)
+			continue;
+		err = -EEXIST;
+		break;
+	}
+
+	if (err == 0) {
+		struct flow_offload_xdp *new;
+
+		new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+		if (new) {
+			new->net_device_addr = key;
+			new->ft = ft;
+
+			hash_add_rcu(nf_xdp_hashtable, &new->hnode, key);
+		} else {
+			err = -ENOMEM;
+		}
+	}
+
+	mutex_unlock(&nf_xdp_hashtable_lock);
+
+	DEBUG_NET_WARN_ON_ONCE(err == 0 && nf_flowtable_by_dev(dev) != ft);
+
+	return err;
+}
+
+static void nf_flowtable_by_dev_remove(const struct net_device *dev)
+{
+	unsigned long key = (unsigned long)dev;
+	struct flow_offload_xdp *cur;
+	bool found = false;
+
+	mutex_lock(&nf_xdp_hashtable_lock);
+
+	hash_for_each_possible(nf_xdp_hashtable, cur, hnode, key) {
+		if (key != cur->net_device_addr)
+			continue;
+
+		hash_del_rcu(&cur->hnode);
+		kfree_rcu(cur, rcuhead);
+		found = true;
+		break;
+	}
+
+	mutex_unlock(&nf_xdp_hashtable_lock);
+
+	WARN_ON_ONCE(!found);
+}
+
 struct flow_offload_work {
 	struct list_head	list;
 	enum flow_cls_command	cmd;
@@ -1183,6 +1269,44 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
 	return 0;
 }
 
+static int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable,
+				     struct net_device *dev,
+				     enum flow_block_command cmd)
+{
+	if (!nf_flowtable_xdp_offload(flowtable))
+		return 0;
+
+	switch (cmd) {
+	case FLOW_BLOCK_BIND:
+		return nf_flowtable_by_dev_insert(flowtable, dev);
+	case FLOW_BLOCK_UNBIND:
+		nf_flowtable_by_dev_remove(dev);
+		return 0;
+	}
+
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static void nf_flow_offload_xdp_cancel(struct nf_flowtable *flowtable,
+				       struct net_device *dev,
+				       enum flow_block_command cmd)
+{
+	if (!nf_flowtable_xdp_offload(flowtable))
+		return;
+
+	switch (cmd) {
+	case FLOW_BLOCK_BIND:
+		nf_flowtable_by_dev_remove(dev);
+		return;
+	case FLOW_BLOCK_UNBIND:
+		/* We do not re-bind in case hw offload would report error
+		 * on *unregister*.
+		 */
+		break;
+	}
+}
+
 int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
 				struct net_device *dev,
 				enum flow_block_command cmd)
@@ -1191,6 +1315,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
 	struct flow_block_offload bo;
 	int err;
 
+	if (nf_flow_offload_xdp_setup(flowtable, dev, cmd))
+		return -EBUSY;
+
 	if (!nf_flowtable_hw_offload(flowtable))
 		return 0;
 
@@ -1200,8 +1327,10 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
 	else
 		err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd,
 						     &extack);
-	if (err < 0)
+	if (err < 0) {
+		nf_flow_offload_xdp_cancel(flowtable, dev, cmd);
 		return err;
+	}
 
 	return nf_flow_table_block_setup(flowtable, &bo, cmd);
 }
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4e21311ec768..223ca4d0e2a5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8198,7 +8198,8 @@ static bool nft_flowtable_offload_clash(struct net *net,
 	const struct nft_table *table;
 
 	/* No offload requested, no need to validate */
-	if (!nf_flowtable_hw_offload(flowtable->ft))
+	if (!nf_flowtable_hw_offload(flowtable->ft) &&
+	    !nf_flowtable_xdp_offload(flowtable->ft))
 		return false;
 
 	nft_net = nft_pernet(net);
-- 
2.41.0
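A rough sketch of how a kfunc built on top of this mapping might consume it (illustration only: the function name and tuple handling are assumptions; only nf_flowtable_by_dev() from this patch and the existing flow_offload_lookup() are real):

```c
#include <net/netfilter/nf_flow_table.h>

/* Not part of this patch.  Shows the intended usage pattern for
 * nf_flowtable_by_dev(): XDP programs run under the RCU read lock,
 * which is exactly what the lookup helper requires.
 */
static struct flow_offload_tuple_rhash *
xdp_flow_lookup_sketch(const struct net_device *dev,
		       struct flow_offload_tuple *tuple)
{
	struct nf_flowtable *ft;

	ft = nf_flowtable_by_dev(dev);	/* mapping added by this patch */
	if (!ft)
		return NULL;

	return flow_offload_lookup(ft, tuple);	/* existing flowtable lookup */
}
```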