diff mbox series

[v2,1/1] net-next: Improve perf of bond/vlans modification

Message ID 20210818062910.344660-1-gnaaman@drivenets.com (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Headers show
Series [v2,1/1] net-next: Improve perf of bond/vlans modification | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 4824 this patch: 4824
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Comparison to NULL could be written "!ha_node" CHECK: Please don't use multiple blank lines WARNING: 'auxilliary' may be misspelled - perhaps 'auxiliary'? WARNING: else is not generally useful after a break or return WARNING: line length of 116 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 4887 this patch: 4887
netdev/header_inline success Link

Commit Message

Gilad Naaman Aug. 18, 2021, 6:29 a.m. UTC
When a bond have a massive amount of VLANs with IPv6 addresses,
performance of changing link state, attaching a VRF, changing an IPv6
address, etc. go down dramtically.

The source of most of the slow down is the `dev_addr_lists.c` module,
which mainatins a linked list of HW addresses.
When using IPv6, this list grows for each IPv6 address added on a
VLAN, since each IPv6 address has a multicast HW address associated with
it.

When performing any modification to the involved links, this list is
traversed many times, often for nothing, all while holding the RTNL
lock.

Instead, this patch adds an auxilliary rbtree which cuts down
traversal time significantly.

Performance can be seen with the following script:

	#!/bin/bash
	ip netns del test || true 2>/dev/null
	ip netns add test

	echo 1 | ip netns exec test tee /proc/sys/net/ipv6/conf/all/keep_addr_on_down > /dev/null

	set -e

	ip -n test link add foo type veth peer name bar
	ip -n test link add b1 type bond
	ip -n test link add florp type vrf table 10

	ip -n test link set bar master b1
	ip -n test link set foo up
	ip -n test link set bar up
	ip -n test link set b1 up
	ip -n test link set florp up

	VLAN_COUNT=1500
	BASE_DEV=b1

	echo Creating vlans
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link add link $BASE_DEV name foo.\$i type vlan id \$i; done"

	echo Bringing them up
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i up; done"

	echo Assiging IPv6 Addresses
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test address add dev foo.\$i 2000::\$i/64; done"

	echo Attaching to VRF
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i master florp; done"

On an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz machine, the performance
before the patch is (truncated):

	Creating vlans
	real 108.35
	Bringing them up
	real 4.96
	Assiging IPv6 Addresses
	real 19.22
	Attaching to VRF
	real 458.84

After the patch:

	Creating vlans
	real 5.59
	Bringing them up
	real 5.07
	Assiging IPv6 Addresses
	real 5.64
	Attaching to VRF
	real 25.37

Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Lu Wei <luwei32@huawei.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
 include/linux/netdevice.h |   5 ++
 net/core/dev_addr_lists.c | 161 ++++++++++++++++++++++++++++----------
 2 files changed, 124 insertions(+), 42 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaf5bb008aa9..8ae56a25661b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@ 
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <linux/hashtable.h>
+#include <linux/rbtree.h>
 
 struct netpoll_info;
 struct device;
@@ -218,12 +219,16 @@  struct netdev_hw_addr {
 	int			sync_cnt;
 	int			refcount;
 	int			synced;
+	struct rb_node		node;
 	struct rcu_head		rcu_head;
 };
 
 struct netdev_hw_addr_list {
 	struct list_head	list;
 	int			count;
+
+	/* Auxiliary tree for faster lookup when modifying the structure. */
+	struct rb_root		tree_root;
 };
 
 #define netdev_hw_addr_list_count(l) ((l)->count)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 45ae6eeb2964..7fd73b905790 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -12,6 +12,70 @@ 
 #include <linux/export.h>
 #include <linux/list.h>
 
+/* Lookup for an address in the list using the rbtree.
+ * The return value is always a valid pointer.
+ * If the address exists, `*ret` is non-null and the address can be retrieved using
+ *
+ *     container_of(*ret, struct netdev_hw_addr, node)
+ *
+ * Otherwise, `ret` can be used with `parent` as an insertion point
+ * when calling `__hw_addr_insert_address_to_tree`.
+ *
+ * Must only be called when holding the netdevice's spinlock.
+ *
+ * @ignore_zero_addr_type if true and `addr_type` is zero,
+ *                        disregard addr_type when matching;
+ */
+static struct rb_node **__hw_addr_tree_address_lookup(struct netdev_hw_addr_list *list,
+					  const unsigned char *addr,
+					  int addr_len,
+					  unsigned char addr_type,
+					  bool ignore_zero_addr_type,
+					  struct rb_node **parent)
+{
+	struct rb_node **node = &list->tree_root.rb_node, *_parent;
+
+	while (*node) {
+		struct netdev_hw_addr *data = container_of(*node, struct netdev_hw_addr, node);
+		int result;
+
+		result = memcmp(addr, data->addr, addr_len);
+
+		if (!result && (ignore_zero_addr_type && !addr_type))
+			result = memcmp(&addr_type, &data->type, sizeof(addr_type));
+
+		_parent = *node;
+		if (result < 0)
+			node = &(*node)->rb_left;
+		else if (result > 0)
+			node = &(*node)->rb_right;
+		else
+			break;
+	}
+
+	if (parent)
+		*parent = _parent;
+	return node;
+}
+
+
+static int __hw_addr_insert_address_to_tree(struct netdev_hw_addr_list *list,
+				  struct netdev_hw_addr *ha,
+				  int addr_len,
+				  struct rb_node **insertion_point,
+				  struct rb_node *parent)
+{
+	/* Figure out where to put new node */
+	if (!insertion_point || !parent)
+		insertion_point = __hw_addr_tree_address_lookup(list, ha->addr, addr_len, ha->type, false, &parent);
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&ha->node, parent, insertion_point);
+	rb_insert_color(&ha->node, &list->tree_root);
+
+	return true;
+}
+
 /*
  * General list handling functions
  */
@@ -19,7 +83,9 @@ 
 static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
 			       const unsigned char *addr, int addr_len,
 			       unsigned char addr_type, bool global,
-			       bool sync)
+			       bool sync,
+			       struct rb_node **insertion_point,
+			       struct rb_node *parent)
 {
 	struct netdev_hw_addr *ha;
 	int alloc_size;
@@ -36,6 +102,10 @@  static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
 	ha->global_use = global;
 	ha->synced = sync ? 1 : 0;
 	ha->sync_cnt = 0;
+
+	/* Insert node to hash table for quicker lookups during modification */
+	__hw_addr_insert_address_to_tree(list, ha, addr_len, insertion_point, parent);
+
 	list_add_tail_rcu(&ha->list, &list->list);
 	list->count++;
 
@@ -47,34 +117,36 @@  static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 			    unsigned char addr_type, bool global, bool sync,
 			    int sync_count)
 {
+	struct rb_node *insert_parent = NULL;
 	struct netdev_hw_addr *ha;
+	struct rb_node **ha_node;
 
 	if (addr_len > MAX_ADDR_LEN)
 		return -EINVAL;
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (ha->type == addr_type &&
-		    !memcmp(ha->addr, addr, addr_len)) {
-			if (global) {
-				/* check if addr is already used as global */
-				if (ha->global_use)
-					return 0;
-				else
-					ha->global_use = true;
-			}
-			if (sync) {
-				if (ha->synced && sync_count)
-					return -EEXIST;
-				else
-					ha->synced++;
-			}
-			ha->refcount++;
-			return 0;
+	ha_node = __hw_addr_tree_address_lookup(list, addr, addr_len,
+						addr_type, false, &insert_parent);
+	if (*ha_node) {
+		ha = container_of(*ha_node, struct netdev_hw_addr, node);
+		if (global) {
+			/* check if addr is already used as global */
+			if (ha->global_use)
+				return 0;
+			else
+				ha->global_use = true;
 		}
+		if (sync) {
+			if (ha->synced && sync_count)
+				return -EEXIST;
+			else
+				ha->synced++;
+		}
+		ha->refcount++;
+		return 0;
 	}
 
 	return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
-				   sync);
+				   sync, ha_node, insert_parent);
 }
 
 static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -103,6 +175,8 @@  static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
 
 	if (--ha->refcount)
 		return 0;
+
+	rb_erase(&ha->node, &list->tree_root);
 	list_del_rcu(&ha->list);
 	kfree_rcu(ha, rcu_head);
 	list->count--;
@@ -114,13 +188,14 @@  static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
 			    unsigned char addr_type, bool global, bool sync)
 {
 	struct netdev_hw_addr *ha;
+	struct rb_node **ha_node;
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (!memcmp(ha->addr, addr, addr_len) &&
-		    (ha->type == addr_type || !addr_type))
-			return __hw_addr_del_entry(list, ha, global, sync);
-	}
-	return -ENOENT;
+	ha_node = __hw_addr_tree_address_lookup(list, addr, addr_len, addr_type, true, NULL);
+	if (*ha_node == NULL)
+		return -ENOENT;
+
+	ha = container_of(*ha_node, struct netdev_hw_addr, node);
+	return __hw_addr_del_entry(list, ha, global, sync);
 }
 
 static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -418,6 +493,7 @@  void __hw_addr_init(struct netdev_hw_addr_list *list)
 {
 	INIT_LIST_HEAD(&list->list);
 	list->count = 0;
+	list->tree_root = RB_ROOT;
 }
 EXPORT_SYMBOL(__hw_addr_init);
 
@@ -552,19 +628,20 @@  EXPORT_SYMBOL(dev_addr_del);
  */
 int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
+	struct rb_node *insert_parent = NULL;
+	struct rb_node **ha_node = NULL;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->uc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_UNICAST) {
-			err = -EEXIST;
-			goto out;
-		}
+	ha_node = __hw_addr_tree_address_lookup(&dev->uc, addr, dev->addr_len,
+						NETDEV_HW_ADDR_T_UNICAST, false, &insert_parent);
+	if (*ha_node) {
+		err = -EEXIST;
+		goto out;
 	}
+
 	err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_UNICAST, true, false);
+				  NETDEV_HW_ADDR_T_UNICAST, true, false, ha_node, insert_parent);
 	if (!err)
 		__dev_set_rx_mode(dev);
 out:
@@ -745,19 +822,19 @@  EXPORT_SYMBOL(dev_uc_init);
  */
 int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
+	struct rb_node *insert_parent = NULL;
+	struct rb_node **ha_node;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->mc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
-			err = -EEXIST;
-			goto out;
-		}
+	ha_node = __hw_addr_tree_address_lookup(&dev->mc, addr, dev->addr_len,
+						NETDEV_HW_ADDR_T_MULTICAST, false, &insert_parent);
+	if (*ha_node) {
+		err = -EEXIST;
+		goto out;
 	}
 	err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_MULTICAST, true, false);
+				  NETDEV_HW_ADDR_T_MULTICAST, true, false, ha_node, insert_parent);
 	if (!err)
 		__dev_set_rx_mode(dev);
 out: