diff mbox series

[RFC,2/2] net: hotdata optimization for netns ptypes

Message ID 0f44b47dd83152000e35355e4f9096a72ead7b87.1741957452.git.pabeni@redhat.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series net: introduce per netns packet type chains | expand

Checks

Context Check Description
netdev/series_format warning Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 38 this patch: 38
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 1 maintainers not CCed: linux-doc@vger.kernel.org
netdev/build_clang success Errors and warnings before: 67 this patch: 67
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4096 this patch: 4096
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 85 this patch: 85
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni March 14, 2025, 1:05 p.m. UTC
Per netns ptype usage is/should be an exception, but the current code
unconditionally touches the related lists for each RX and TX packet.

Add a per device flag in the hot data net_device section to cache the
'netns ptype required' information, and update it according to the
relevant netns status. The new fields are placed in existing holes,
moved slightly to fit the relevant cacheline groups.

Be sure to keep such flag up to date when new devices are created and/or
devices are moved across namespaces, by initializing it in
list_netdevice().

In the fast path we can skip per-netns list processing when such flag
is clear.

This avoids touching in the fastpath the additional cacheline needed by
the previous patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
This is a little cumbersome for possibly little gain. An alternative
could be to also cache the per-device list status in similar
flags. Both RX and TX could then use a single conditional to completely
skip all the per dev/netns lists. Potentially even moving the per
device lists out of hotdata.

Side note: despite being unconditionally touched in fastpath on both
RX and TX, currently dev->ptype_all is not placed in any cacheline
group hotdata.
---
 .../networking/net_cachelines/net_device.rst  |  2 +
 include/linux/netdevice.h                     |  9 ++-
 net/core/dev.c                                | 58 ++++++++++++++-----
 3 files changed, 53 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 6327e689e8a84..206f4afded60c 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -22,7 +22,9 @@  struct list_head                    napi_list
 struct list_head                    unreg_list
 struct list_head                    close_list
 struct list_head                    ptype_all                   read_mostly                             dev_nit_active(tx)
+bool                                ptype_all_ns                read_mostly                             dev_nit_active(tx)
 struct list_head                    ptype_specific                                  read_mostly         deliver_ptype_list_skb/__netif_receive_skb_core(rx)
+bool                                ptype_specific_ns                               read_mostly         deliver_ptype_list_skb/__netif_receive_skb_core(rx)
 struct                              adj_list
 unsigned_int                        flags                       read_mostly         read_mostly         __dev_queue_xmit,__dev_xmit_skb,ip6_output,__ip6_finish_output(tx);ip6_rcv_core(rx)
 xdp_features_t                      xdp_features
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0dbfe069a6e38..031e3e42db4a6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1791,6 +1791,9 @@  enum netdev_reg_state {
  *	@close_list:	List entry used when we are closing the device
  *	@ptype_all:     Device-specific packet handlers for all protocols
  *	@ptype_specific: Device-specific, protocol-specific packet handlers
+ *	@ptype_all_ns:	The owning netns has packet handlers for all protocols
+ *	@ptype_specific_ns:	The owning netns has protocol-specific packet
+ *				handlers
  *
  *	@adj_list:	Directly linked devices, like slaves for bonding
  *	@features:	Currently active device features
@@ -2125,14 +2128,16 @@  struct net_device {
 		struct pcpu_dstats __percpu		*dstats;
 	};
 	unsigned long		state;
-	unsigned int		flags;
-	unsigned short		hard_header_len;
 	netdev_features_t	features;
 	struct inet6_dev __rcu	*ip6_ptr;
+	unsigned int		flags;
+	unsigned short		hard_header_len;
+	bool			ptype_all_ns;
 	__cacheline_group_end(net_device_read_txrx);
 
 	/* RX read-mostly hotpath */
 	__cacheline_group_begin(net_device_read_rx);
+	bool			ptype_specific_ns;
 	struct bpf_prog __rcu	*xdp_prog;
 	struct list_head	ptype_specific;
 	int			ifindex;
diff --git a/net/core/dev.c b/net/core/dev.c
index 00bdd8316cb5e..878122757c78b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -405,6 +405,12 @@  static void list_netdevice(struct net_device *dev)
 
 	ASSERT_RTNL();
 
+	/* update ptype flags according to the current netns setting */
+	spin_lock(&ptype_lock);
+	dev->ptype_all_ns = !list_empty(&net->ptype_all);
+	dev->ptype_specific_ns = !list_empty(&net->ptype_specific);
+	spin_unlock(&ptype_lock);
+
 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 	netdev_name_node_add(net, dev->name_node);
 	hlist_add_head_rcu(&dev->index_hlist,
@@ -587,6 +593,20 @@  static inline struct list_head *ptype_head(const struct packet_type *pt)
 				   &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 }
 
+static void net_set_ptype(struct net *net, bool ptype_all, bool val)
+{
+	struct net_device *dev;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		if (ptype_all)
+			WRITE_ONCE(dev->ptype_all_ns, val);
+		else
+			WRITE_ONCE(dev->ptype_specific_ns, val);
+	}
+	rcu_read_unlock();
+}
+
 /**
  *	dev_add_pack - add packet handler
  *	@pt: packet type declaration
@@ -609,6 +629,9 @@  void dev_add_pack(struct packet_type *pt)
 
 	spin_lock(&ptype_lock);
 	list_add_rcu(&pt->list, head);
+	if (pt->af_packet_net && !pt->dev && list_is_singular(head))
+		net_set_ptype(pt->af_packet_net, pt->type == htons(ETH_P_ALL),
+			      true);
 	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
@@ -637,10 +660,13 @@  void __dev_remove_pack(struct packet_type *pt)
 	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
-		if (pt == pt1) {
-			list_del_rcu(&pt->list);
-			goto out;
-		}
+		if (pt != pt1)
+			continue;
+		list_del_rcu(&pt->list);
+		if (pt->af_packet_net && !pt->dev && list_empty(head))
+			net_set_ptype(pt->af_packet_net,
+				      pt->type == htons(ETH_P_ALL), false);
+		goto out;
 	}
 
 	pr_warn("dev_remove_pack: %p not found\n", pt);
@@ -2483,8 +2509,7 @@  static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  */
 bool dev_nit_active(struct net_device *dev)
 {
-	return !list_empty(&dev_net(dev)->ptype_all) ||
-	       !list_empty(&dev->ptype_all);
+	return READ_ONCE(dev->ptype_all_ns) || !list_empty(&dev->ptype_all);
 }
 EXPORT_SYMBOL_GPL(dev_nit_active);
 
@@ -5732,10 +5757,12 @@  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	if (pfmemalloc)
 		goto skip_taps;
 
-	list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
-		if (pt_prev)
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-		pt_prev = ptype;
+	if (READ_ONCE(skb->dev->ptype_all_ns)) {
+		list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
 	}
 
 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
@@ -5844,8 +5871,9 @@  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 				       &ptype_base[ntohs(type) &
 						   PTYPE_HASH_MASK]);
-		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
-				       &dev_net(orig_dev)->ptype_specific);
+		if (READ_ONCE(skb->dev->ptype_specific_ns))
+			deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+					       &dev_net(skb->dev)->ptype_specific);
 	}
 
 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -12563,10 +12591,12 @@  static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ptype_all_ns);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 47);
 
 	/* RX read-mostly hotpath */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific_ns);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
@@ -12581,7 +12611,7 @@  static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 93);
 }
 
 /*