diff mbox series

[RFC,1/2] net: add sysfs attributes for customized dim profile management

Message ID 1710421773-61277-2-git-send-email-hengqi@linux.alibaba.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series net: provides dim profile fine-tuning channels | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4886 this patch: 4886
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 1062 this patch: 1062
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5170 this patch: 5170
netdev/checkpatch warning CHECK: Please use a blank line after function/struct/union/enum declarations WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Heng Qi March 14, 2024, 1:09 p.m. UTC
The NetDIM library, currently leveraged by an array of NICs, delivers
excellent acceleration benefits. Nevertheless, NICs vary significantly
in their dim profile list prerequisites.

Specifically, virtio-net backends may present diverse sw or hw device
implementations, making a one-size-fits-all parameter list impractical.
On Alibaba Cloud, the virtio DPU's performance under the default DIM
profile falls short of expectations, partly due to a mismatch in
parameter configuration.

I also noticed that ice/idpf/ena and other NICs use customized
profile lists or place restrictions on DIM capabilities.

Motivated by this, I tried adding a new sysfs attribute that provides
per-device control to query and modify a device's interrupt moderation
parameters.

Usage
========
1. Query the currently customized list of the device

$ cat dim_profs
The profiles of (RX, EQE):
{.usec =   1, .pkts = 256, .comps =   0,},
{.usec =   8, .pkts = 256, .comps =   0,},
{.usec =  64, .pkts = 256, .comps =   0,},
{.usec = 128, .pkts = 256, .comps =   0,},
{.usec = 256, .pkts = 256, .comps =   0,}
The profiles of (TX, EQE):
{.usec =   1, .pkts = 256, .comps =   0,},
{.usec =   2, .pkts = 256, .comps =   0,},
{.usec =   3, .pkts = 256, .comps =   0,},
{.usec =   4, .pkts = 256, .comps =   0,},
{.usec =   5, .pkts = 256, .comps =   0,}

2. Tune

$ echo "RX EQE 8,8,0 16,16,0 32,32,0 64,64,0 128,128,0" > dim_profs
$ echo "  TX  EQE 0,2,0   1,3,0 2,4,0   3,5,0  4,6,0   " > dim_profs
$ cat dim_profs
The profiles of (RX, EQE):
{.usec =   8, .pkts =   8, .comps =   0,},
{.usec =  16, .pkts =  16, .comps =   0,},
{.usec =  32, .pkts =  32, .comps =   0,},
{.usec =  64, .pkts =  64, .comps =   0,},
{.usec = 128, .pkts = 128, .comps =   0,}
The profiles of (TX, EQE):
{.usec =   0, .pkts =   2, .comps =   0,},
{.usec =   1, .pkts =   3, .comps =   0,},
{.usec =   2, .pkts =   4, .comps =   0,},
{.usec =   3, .pkts =   5, .comps =   0,},
{.usec =   4, .pkts =   6, .comps =   0,}

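3. Warn
If the device does not implement .ndo_dim_moder_get, reading dim_profs
returns the following notice instead of a profile list:
"profile is default and not customized by the device."
Writing dim_profs fails with an error if .ndo_dim_moder_set is not
implemented.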

Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
---
 Documentation/ABI/testing/sysfs-class-net |  17 +++
 include/linux/dim.h                       |   7 ++
 include/linux/netdevice.h                 |  35 ++++++
 lib/dim/net_dim.c                         |   6 --
 net/core/net-sysfs.c                      | 172 ++++++++++++++++++++++++++++++
 5 files changed, 231 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index ebf21be..1e4faa8 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -352,3 +352,20 @@  Description:
 		0  threaded mode disabled for this dev
 		1  threaded mode enabled for this dev
 		== ==================================
+
+What:		/sys/class/net/<iface>/dim_profs
+Date:		Mar 2024
+KernelVersion:	6.8
+Contact:	netdev@vger.kernel.org
+Description:
+		String value to control the per-device DIM profile lists. Write
+		"<RX|TX> <EQE|CQE>" followed by five "usec,pkts,comps" triples to
+		replace the profile list for that direction and period mode.
+
+		Possible values:
+		================================================ ==========================
+		RX EQE 1,1,0  2,2,0   3,3,0   4,4,0    5,5,0     tune RX + EQE profile list
+		RX CQE 8,8,0  16,16,0 32,32,0 64,64,0  128,128,0 tune RX + CQE profile list
+		TX EQE 16,8,0 2,16,0  16,8,0  32,64,0  128,64,0  tune TX + EQE profile list
+		TX CQE 8,5,0  8,16,0  32,12,0 128,64,0 256,128,0 tune TX + CQE profile list
+		================================================ ==========================
diff --git a/include/linux/dim.h b/include/linux/dim.h
index f343bc9..43398f5 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -10,6 +10,13 @@ 
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+/* Number of DIM profiles and period mode. */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
 /*
  * Number of events between DIM iterations.
  * Causes a moderation of the algorithm run.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6f6ac7..bc2f3ac 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -49,6 +49,7 @@ 
 #include <uapi/linux/netdev.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
+#include <linux/dim.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
@@ -998,6 +999,27 @@  struct netdev_net_notifier {
 	struct notifier_block *nb;
 };
 
+/**
+ * enum dim_direction - direction selecting one of a device's DIM profile lists
+ * @DIM_RX_DIRECTION: the RX profile list
+ * @DIM_TX_DIRECTION: the TX profile list
+ * @DIM_NUM_DIRECTIONS: number of directions; not a valid direction itself
+ */
+enum dim_direction {
+	DIM_RX_DIRECTION = 0x0,
+	DIM_TX_DIRECTION = 0x1,
+	DIM_NUM_DIRECTIONS
+};
+
+/**
+ * struct dim_profs_list - profile list passed between sysfs and a driver
+ * @direction: RX or TX dim information (enum dim_direction)
+ * @mode: CQ period count mode (from CQE/EQE)
+ * @num: number of entries in @profs
+ * @profs: dim profile list
+ *
+ * Argument of the ndo_dim_moder_get() and ndo_dim_moder_set() callbacks.
+ */
+struct dim_profs_list {
+	u8 direction;
+	u8 mode;
+	u8 num;
+	struct dim_cq_moder profs[];
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1351,6 +1373,14 @@  struct netdev_net_notifier {
  *			   struct kernel_hwtstamp_config *kernel_config,
  *			   struct netlink_ext_ack *extack);
  *	Change the hardware timestamping parameters for NIC device.
+ *
+ * int (*ndo_dim_moder_get)(struct net_device *dev,
+ *			    struct dim_profs_list *list);
+ *	Get dim profiles list from the NIC device.
+ *
+ * int (*ndo_dim_moder_set)(struct net_device *dev,
+ *			    struct dim_profs_list *list);
+ *	Configure dim profiles list for the NIC device.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1595,6 +1625,11 @@  struct net_device_ops {
 	int			(*ndo_hwtstamp_set)(struct net_device *dev,
 						    struct kernel_hwtstamp_config *kernel_config,
 						    struct netlink_ext_ack *extack);
+	int			(*ndo_dim_moder_get)(struct net_device *dev,
+						     struct dim_profs_list *list);
+
+	int			(*ndo_dim_moder_set)(struct net_device *dev,
+						     struct dim_profs_list *list);
 };
 
 /**
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
index 4e32f7a..67d5beb 100644
--- a/lib/dim/net_dim.c
+++ b/lib/dim/net_dim.c
@@ -11,12 +11,6 @@ 
  *        There are different set of profiles for RX/TX CQs.
  *        Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
  */
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
-#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
 #define NET_DIM_RX_EQE_PROFILES { \
 	{.usec = 1,   .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
 	{.usec = 8,   .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e3d7a8c..801cb07 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@ 
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <linux/cpu.h>
+#include <linux/dim.h>
 #include <net/netdev_rx_queue.h>
 #include <net/rps.h>
 
@@ -638,6 +639,176 @@  static ssize_t threaded_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(threaded);
 
+/**
+ * parse_dim_profs - parse a dim_profs sysfs write into a profile list
+ * @buf: NUL-terminated text written by the user
+ * @len: number of bytes written; unused, parsing stops at the terminating NUL
+ *
+ * Expected input:
+ *
+ *	<RX|TX> <EQE|CQE> usec,pkts,comps ... (NET_DIM_PARAMS_NUM_PROFILES triples)
+ *
+ * The direction and mode keywords are matched case-insensitively. Only
+ * whitespace may follow the last triple.
+ *
+ * Return: a kzalloc()ed list that the caller must kfree(), or NULL if the
+ * allocation failed or the input is malformed.
+ */
+static struct dim_profs_list *parse_dim_profs(const char *buf, ssize_t len)
+{
+	char direction[3], period_mode[4];
+	struct dim_profs_list *list;
+	int i, consumed = 0;
+
+	list = kzalloc(struct_size(list, profs, NET_DIM_PARAMS_NUM_PROFILES),
+		       GFP_KERNEL);
+	if (!list)
+		return NULL;
+
+	list->num = NET_DIM_PARAMS_NUM_PROFILES;
+
+	/* Field widths keep sscanf() within the two keyword buffers. */
+	if (sscanf(buf, "%2s %3s%n", direction, period_mode, &consumed) != 2)
+		goto err_parse;
+
+	if (!strcasecmp(direction, "RX"))
+		list->direction = DIM_RX_DIRECTION;
+	else if (!strcasecmp(direction, "TX"))
+		list->direction = DIM_TX_DIRECTION;
+	else
+		goto err_parse;
+
+	if (!strcasecmp(period_mode, "EQE"))
+		list->mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	else if (!strcasecmp(period_mode, "CQE"))
+		list->mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+	else
+		goto err_parse;
+
+	buf += consumed;
+
+	for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+		struct dim_cq_moder *prof = &list->profs[i];
+
+		if (sscanf(buf, "%hu,%hu,%hu%n", &prof->usec, &prof->pkts,
+			   &prof->comps, &consumed) != 3)
+			goto err_parse;
+
+		buf += consumed;
+	}
+
+	/* Reject extra triples or other garbage instead of ignoring them. */
+	if (*skip_spaces(buf))
+		goto err_parse;
+
+	return list;
+
+err_parse:
+	kfree(list);
+	return NULL;
+}
+
+/*
+ * Store handler for the dim_profs attribute.
+ *
+ * Parses one profile list from @buf and hands it to the driver through
+ * ndo_dim_moder_set() with RTNL held. The writer needs CAP_NET_ADMIN in the
+ * user namespace of the device's network namespace.
+ *
+ * Returns @len on success, otherwise a negative errno: -EPERM without the
+ * capability, -EINVAL if parse_dim_profs() fails, -ENODEV if dev_isalive()
+ * fails, -EOPNOTSUPP if the driver has no ndo_dim_moder_set(), or the
+ * driver's own error.
+ */
+static ssize_t dim_profs_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	const struct net_device_ops *ops = netdev->netdev_ops;
+	struct dim_profs_list *list;
+	ssize_t ret;
+
+	if (!ns_capable(dev_net(netdev)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	list = parse_dim_profs(buf, len);
+	if (!list)
+		return -EINVAL;
+
+	if (!rtnl_trylock()) {
+		/* The restarted write parses again; do not leak this copy. */
+		kfree(list);
+		return restart_syscall();
+	}
+
+	if (!dev_isalive(netdev)) {
+		/* Never report 0 bytes written for a non-empty write. */
+		ret = -ENODEV;
+	} else if (!ops->ndo_dim_moder_set) {
+		ret = -EOPNOTSUPP;
+	} else {
+		int err = ops->ndo_dim_moder_set(netdev, list);
+
+		ret = err ? err : (ssize_t)len;
+	}
+
+	rtnl_unlock();
+	kfree(list);
+
+	return ret;
+}
+
+/*
+ * Append the profile list selected by @direct and @mode to the sysfs buffer.
+ *
+ * @len_ holds the number of bytes already present in @buf and is advanced
+ * past the text emitted here. The driver's ndo_dim_moder_get() is called
+ * unconditionally, so the caller must have checked that it exists.
+ *
+ * Returns 0 on success or -ENOMEM. If the driver returns an error for this
+ * list, nothing is emitted, @len_ is left untouched and 0 is returned.
+ */
+static ssize_t dim_profs_show_one(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf, u8 direct, u8 mode,
+				  size_t *len_)
+{
+	static const char * const direction[] = { "RX", "TX" };
+	static const char * const period_mode[] = { "EQE", "CQE" };
+	struct net_device *netdev = to_net_dev(dev);
+	const struct net_device_ops *ops = netdev->netdev_ops;
+	struct dim_profs_list *list;
+	size_t len = *len_;
+	int i;
+
+	list = kzalloc(struct_size(list, profs, NET_DIM_PARAMS_NUM_PROFILES),
+		       GFP_KERNEL);
+	if (!list)
+		return -ENOMEM;
+
+	/* Tell the driver which list is wanted and how many entries fit. */
+	list->num = NET_DIM_PARAMS_NUM_PROFILES;
+	list->direction = direct;
+	list->mode = mode;
+	if (ops->ndo_dim_moder_get(netdev, list))
+		goto out;
+
+	len += sysfs_emit_at(buf, len, "The profiles of (%2s, %3s):\n",
+			     direction[direct], period_mode[mode]);
+	for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+		const struct dim_cq_moder *prof = &list->profs[i];
+		bool last = i == NET_DIM_PARAMS_NUM_PROFILES - 1;
+
+		/* Entries are comma separated; the last one has no comma. */
+		len += sysfs_emit_at(buf, len,
+				     "{.usec = %3hu, .pkts = %3hu, .comps = %3hu,}%s",
+				     prof->usec, prof->pkts, prof->comps,
+				     last ? "\n" : ",\n");
+	}
+	*len_ = len;
+out:
+	kfree(list);
+	return 0;
+}
+
+/*
+ * Show handler for the dim_profs attribute.
+ *
+ * With RTNL held, emits one profile list per (direction, period mode) pair
+ * via dim_profs_show_one(), or a fixed notice when the driver has no
+ * ndo_dim_moder_get().
+ *
+ * Returns the number of bytes placed in @buf, -ENODEV if dev_isalive()
+ * fails, or the error returned by dim_profs_show_one().
+ */
+static ssize_t dim_profs_show(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	static const char notice[] = "profile is default and not customized by the device.";
+	struct net_device *netdev = to_net_dev(dev);
+	const struct net_device_ops *ops = netdev->netdev_ops;
+	size_t len = 0;
+	ssize_t ret;
+	int dir, mode;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	/* Same liveness check the store path does before calling the driver. */
+	if (!dev_isalive(netdev)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!ops->ndo_dim_moder_get) {
+		ret = sysfs_emit(buf, "%s\n", notice);
+		goto out;
+	}
+
+	for (dir = 0; dir < DIM_NUM_DIRECTIONS; dir++) {
+		for (mode = 0; mode < DIM_CQ_PERIOD_NUM_MODES; mode++) {
+			ret = dim_profs_show_one(dev, attr, buf, dir, mode,
+						 &len);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = len;
+
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static DEVICE_ATTR_RW(dim_profs);
+
 static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_netdev_group.attr,
 	&dev_attr_type.attr,
@@ -671,6 +842,7 @@  static ssize_t threaded_store(struct device *dev,
 	&dev_attr_carrier_up_count.attr,
 	&dev_attr_carrier_down_count.attr,
 	&dev_attr_threaded.attr,
+	&dev_attr_dim_profs.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);