diff mbox series

[net-next,V3,4/8] devlink: Expose port function commands to control RoCE

Message ID 20221204141632.201932-5-shayd@nvidia.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series devlink: Add port function attribute to enable/disable Roce and migratable | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/apply fail Patch does not apply to net-next

Commit Message

Shay Drori Dec. 4, 2022, 2:16 p.m. UTC
Expose port function commands to enable/disable RoCE. These commands are used
to control the port RoCE device capabilities.

When RoCE is disabled for a function of the port, the function cannot create
any RoCE-specific resources (e.g. GID table).
Disabling RoCE also reduces system memory utilization. For example, disabling
RoCE for a VF/SF saves 1 MByte of system memory per function.

Example of a PCI VF port which supports function configuration:
Set RoCE of the VF's port function.

$ devlink port show pci/0000:06:00.0/2
pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
vfnum 1
    function:
        hw_addr 00:00:00:00:00:00 roce enable

$ devlink port function set pci/0000:06:00.0/2 roce disable

$ devlink port show pci/0000:06:00.0/2
pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
vfnum 1
    function:
        hw_addr 00:00:00:00:00:00 roce disable

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
---
v2->v3:
 - change DEVLINK_PORT_FN_SET_CAP to devlink_port_fn_cap_fill.
 - move out DEVLINK_PORT_FN_CAPS_VALID_MASK from UAPI.
 - introduce DEVLINK_PORT_FN_CAP_ROCE and add _BIT suffix to
   devlink_port_fn_attr_cap.
 - remove DEVLINK_PORT_FN_ATTR_CAPS_MAX
---
 .../networking/devlink/devlink-port.rst       |  34 +++++-
 include/net/devlink.h                         |  19 +++
 include/uapi/linux/devlink.h                  |  10 ++
 net/core/devlink.c                            | 113 ++++++++++++++++++
 4 files changed, 175 insertions(+), 1 deletion(-)

Comments

Jiri Pirko Dec. 5, 2022, 10:12 a.m. UTC | #1
Sun, Dec 04, 2022 at 03:16:28PM CET, shayd@nvidia.com wrote:
>Expose port function commands to enable / disable RoCE, this is used to
>control the port RoCE device capabilities.
>
>When RoCE is disabled for a function of the port, function cannot create
>any RoCE specific resources (e.g GID table).
>It also saves system memory utilization. For example disabling RoCE enable a
>VF/SF saves 1 Mbytes of system memory per function.
>
>Example of a PCI VF port which supports function configuration:
>Set RoCE of the VF's port function.
>
>$ devlink port show pci/0000:06:00.0/2
>pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
>vfnum 1
>    function:
>        hw_addr 00:00:00:00:00:00 roce enable
>
>$ devlink port function set pci/0000:06:00.0/2 roce disable
>
>$ devlink port show pci/0000:06:00.0/2
>pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
>vfnum 1
>    function:
>        hw_addr 00:00:00:00:00:00 roce disable
>
>Signed-off-by: Shay Drory <shayd@nvidia.com>
>Reviewed-by: Jiri Pirko <jiri@nvidia.com>

When you make changes to the patch, you should remove the Reviewed-by and
Acked-by tags.


>---
>v2->v3:
> - change DEVLINK_PORT_FN_SET_CAP to devlink_port_fn_cap_fill.
> - move out DEVLINK_PORT_FN_CAPS_VALID_MASK from UAPI.
> - introduce DEVLINK_PORT_FN_CAP_ROCE and add _BIT suffix to
>   devlink_port_fn_attr_cap.
> - remove DEVLINK_PORT_FN_ATTR_CAPS_MAX
>---
> .../networking/devlink/devlink-port.rst       |  34 +++++-
> include/net/devlink.h                         |  19 +++
> include/uapi/linux/devlink.h                  |  10 ++
> net/core/devlink.c                            | 113 ++++++++++++++++++
> 4 files changed, 175 insertions(+), 1 deletion(-)
>
>diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
>index 2c637f4aae8e..c3302d23e480 100644
>--- a/Documentation/networking/devlink/devlink-port.rst
>+++ b/Documentation/networking/devlink/devlink-port.rst
>@@ -110,7 +110,7 @@ devlink ports for both the controllers.
> Function configuration
> ======================
> 
>-A user can configure the function attribute before enumerating the PCI
>+Users can configure one or more function attributes before enumerating the PCI
> function. Usually it means, user should configure function attribute
> before a bus specific device for the function is created. However, when
> SRIOV is enabled, virtual function devices are created on the PCI bus.
>@@ -122,6 +122,9 @@ A user may set the hardware address of the function using
> `devlink port function set hw_addr` command. For Ethernet port function
> this means a MAC address.
> 
>+Users may also set the RoCE capability of the function using
>+`devlink port function set roce` command.
>+
> Function attributes
> ===================
> 
>@@ -162,6 +165,35 @@ device created for the PCI VF/SF.
>       function:
>         hw_addr 00:00:00:00:88:88
> 
>+RoCE capability setup
>+---------------------
>+Not all PCI VFs/SFs require RoCE capability.
>+
>+When RoCE capability is disabled, it saves system memory per PCI VF/SF.
>+
>+When user disables RoCE capability for a VF/SF, user application cannot send or
>+receive any RoCE packets through this VF/SF and RoCE GID table for this PCI
>+will be empty.
>+
>+When RoCE capability is disabled in the device using port function attribute,
>+VF/SF driver cannot override it.
>+
>+- Get RoCE capability of the VF device::
>+
>+    $ devlink port show pci/0000:06:00.0/2
>+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
>+        function:
>+            hw_addr 00:00:00:00:00:00 roce enable
>+
>+- Set RoCE capability of the VF device::
>+
>+    $ devlink port function set pci/0000:06:00.0/2 roce disable
>+
>+    $ devlink port show pci/0000:06:00.0/2
>+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
>+        function:
>+            hw_addr 00:00:00:00:00:00 roce disable
>+
> Subfunction
> ============
> 
>diff --git a/include/net/devlink.h b/include/net/devlink.h
>index 5f6eca5e4a40..20306fb8a1d9 100644
>--- a/include/net/devlink.h
>+++ b/include/net/devlink.h
>@@ -1451,6 +1451,25 @@ struct devlink_ops {
> 	int (*port_function_hw_addr_set)(struct devlink_port *port,
> 					 const u8 *hw_addr, int hw_addr_len,
> 					 struct netlink_ext_ack *extack);
>+	/**
>+	 * @port_function_roce_get: Port function's roce get function.
>+	 *
>+	 * Query RoCE state of a function managed by the devlink port.
>+	 * Return -EOPNOTSUPP if port function RoCE handling is not supported.
>+	 */
>+	int (*port_function_roce_get)(struct devlink_port *devlink_port,
>+				      bool *is_enable,
>+				      struct netlink_ext_ack *extack);
>+	/**
>+	 * @port_function_roce_set: Port function's roce set function.
>+	 *
>+	 * Enable/Disable the RoCE state of a function managed by the devlink
>+	 * port.
>+	 * Return -EOPNOTSUPP if port function RoCE handling is not supported.
>+	 */
>+	int (*port_function_roce_set)(struct devlink_port *devlink_port,
>+				      bool enable,
>+				      struct netlink_ext_ack *extack);
> 	/**
> 	 * port_new() - Add a new port function of a specified flavor
> 	 * @devlink: Devlink instance
>diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
>index 70191d96af89..6cc2925bd478 100644
>--- a/include/uapi/linux/devlink.h
>+++ b/include/uapi/linux/devlink.h
>@@ -658,11 +658,21 @@ enum devlink_resource_unit {
> 	DEVLINK_RESOURCE_UNIT_ENTRY,
> };
> 
>+enum devlink_port_fn_attr_cap {
>+	DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT,
>+
>+	/* Add new caps above */
>+	__DEVLINK_PORT_FN_ATTR_CAPS_MAX,

Well, this is not needed in uapi either, but I don't see any good way to
maintain this internally :/ No harm in exposing it.

Looks good,
Reviewed-by: Jiri Pirko <jiri@nvidia.com>




>+};
>+
>+#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT)
>+
> enum devlink_port_function_attr {
> 	DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
> 	DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,	/* binary */
> 	DEVLINK_PORT_FN_ATTR_STATE,	/* u8 */
> 	DEVLINK_PORT_FN_ATTR_OPSTATE,	/* u8 */
>+	DEVLINK_PORT_FN_ATTR_CAPS,	/* bitfield32 */
> 
> 	__DEVLINK_PORT_FUNCTION_ATTR_MAX,
> 	DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
>diff --git a/net/core/devlink.c b/net/core/devlink.c
>index 2b6e11277837..5c4d3abd7677 100644
>--- a/net/core/devlink.c
>+++ b/net/core/devlink.c
>@@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
> EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
> EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
> 
>+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
>+	(_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
>+
> static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
> 	[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
> 	[DEVLINK_PORT_FN_ATTR_STATE] =
> 		NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
> 				 DEVLINK_PORT_FN_STATE_ACTIVE),
>+	[DEVLINK_PORT_FN_ATTR_CAPS] =
>+		NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
> };
> 
> static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
>@@ -692,6 +697,60 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
> 	return 0;
> }
> 
>+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
>+				     u32 cap, bool is_enable)
>+{
>+	caps->selector |= cap;
>+	if (is_enable)
>+		caps->value |= cap;
>+}
>+
>+static int devlink_port_fn_roce_fill(const struct devlink_ops *ops,
>+				     struct devlink_port *devlink_port,
>+				     struct nla_bitfield32 *caps,
>+				     struct netlink_ext_ack *extack)
>+{
>+	bool is_enable;
>+	int err;
>+
>+	if (!ops->port_function_roce_get)
>+		return 0;
>+
>+	err = ops->port_function_roce_get(devlink_port, &is_enable, extack);
>+	if (err) {
>+		if (err == -EOPNOTSUPP)
>+			return 0;
>+		return err;
>+	}
>+
>+	devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
>+	return 0;
>+}
>+
>+static int devlink_port_fn_caps_fill(const struct devlink_ops *ops,
>+				     struct devlink_port *devlink_port,
>+				     struct sk_buff *msg,
>+				     struct netlink_ext_ack *extack,
>+				     bool *msg_updated)
>+{
>+	struct nla_bitfield32 caps = {};
>+	int err;
>+
>+	err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack);
>+	if (err)
>+		return err;
>+
>+	if (!caps.selector)
>+		return 0;
>+	err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
>+				 caps.selector);
>+	if (err)
>+		return err;
>+
>+	*msg_updated = true;
>+	return 0;
>+}
>+
> static int
> devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
> 				  struct genl_info *info,
>@@ -1275,6 +1334,35 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
> 	return 0;
> }
> 
>+static int
>+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
>+			 struct netlink_ext_ack *extack)
>+{
>+	const struct devlink_ops *ops = devlink_port->devlink->ops;
>+
>+	return ops->port_function_roce_set(devlink_port, enable, extack);
>+}
>+
>+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
>+				    const struct nlattr *attr,
>+				    struct netlink_ext_ack *extack)
>+{
>+	struct nla_bitfield32 caps;
>+	u32 caps_value;
>+	int err;
>+
>+	caps = nla_get_bitfield32(attr);
>+	caps_value = caps.value & caps.selector;
>+	if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
>+		err = devlink_port_fn_roce_set(devlink_port,
>+					       caps_value & DEVLINK_PORT_FN_CAP_ROCE,
>+					       extack);
>+		if (err)
>+			return err;
>+	}
>+	return 0;
>+}
>+
> static int
> devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
> 				   struct netlink_ext_ack *extack)
>@@ -1293,6 +1381,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
> 					   &msg_updated);
> 	if (err)
> 		goto out;
>+	err = devlink_port_fn_caps_fill(ops, port, msg, extack,
>+					&msg_updated);
>+	if (err)
>+		goto out;
> 	err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
> out:
> 	if (err || !msg_updated)
>@@ -1665,6 +1757,7 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port,
> 					  struct netlink_ext_ack *extack)
> {
> 	const struct devlink_ops *ops = devlink_port->devlink->ops;
>+	struct nlattr *attr;
> 
> 	if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
> 	    !ops->port_function_hw_addr_set) {
>@@ -1677,6 +1770,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port,
> 				   "Function does not support state setting");
> 		return -EOPNOTSUPP;
> 	}
>+	attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
>+	if (attr) {
>+		struct nla_bitfield32 caps;
>+
>+		caps = nla_get_bitfield32(attr);
>+		if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
>+		    !ops->port_function_roce_set) {
>+			NL_SET_ERR_MSG_ATTR(extack, attr,
>+					    "Port doesn't support RoCE function attribute");
>+			return -EOPNOTSUPP;
>+		}
>+	}
> 	return 0;
> }
> 
>@@ -1704,6 +1809,14 @@ static int devlink_port_function_set(struct devlink_port *port,
> 		if (err)
> 			return err;
> 	}
>+
>+	attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
>+	if (attr) {
>+		err = devlink_port_fn_caps_set(port, attr, extack);
>+		if (err)
>+			return err;
>+	}
>+
> 	/* Keep this as the last function attribute set, so that when
> 	 * multiple port function attributes are set along with state,
> 	 * Those can be applied first before activating the state.
>-- 
>2.38.1
>
Shannon Nelson Dec. 5, 2022, 11:37 p.m. UTC | #2
On 12/4/22 6:16 AM, Shay Drory wrote:
> Expose port function commands to enable / disable RoCE, this is used to
> control the port RoCE device capabilities.
> 
> When RoCE is disabled for a function of the port, function cannot create
> any RoCE specific resources (e.g GID table).
> It also saves system memory utilization. For example disabling RoCE enable a
> VF/SF saves 1 Mbytes of system memory per function.
> 
> Example of a PCI VF port which supports function configuration:
> Set RoCE of the VF's port function.
> 
> $ devlink port show pci/0000:06:00.0/2
> pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
> vfnum 1
>      function:
>          hw_addr 00:00:00:00:00:00 roce enable
> 
> $ devlink port function set pci/0000:06:00.0/2 roce disable
> 
> $ devlink port show pci/0000:06:00.0/2
> pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0
> vfnum 1
>      function:
>          hw_addr 00:00:00:00:00:00 roce disable
> 
> Signed-off-by: Shay Drory <shayd@nvidia.com>
> Reviewed-by: Jiri Pirko <jiri@nvidia.com>
> ---



> +
> +#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT)
> +
>   enum devlink_port_function_attr {
>          DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
>          DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,     /* binary */
>          DEVLINK_PORT_FN_ATTR_STATE,     /* u8 */
>          DEVLINK_PORT_FN_ATTR_OPSTATE,   /* u8 */
> +       DEVLINK_PORT_FN_ATTR_CAPS,      /* bitfield32 */

Will 32 bits be enough, or should we start off with u64?  It will 
probably be fine, but since we're setting a uapi thing here we probably 
want to be sure we won't need to change it in the future.

sln
Jakub Kicinski Dec. 6, 2022, 2:02 a.m. UTC | #3
On Mon, 5 Dec 2022 15:37:26 -0800 Shannon Nelson wrote:
> >   enum devlink_port_function_attr {
> >          DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
> >          DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,     /* binary */
> >          DEVLINK_PORT_FN_ATTR_STATE,     /* u8 */
> >          DEVLINK_PORT_FN_ATTR_OPSTATE,   /* u8 */
> > +       DEVLINK_PORT_FN_ATTR_CAPS,      /* bitfield32 */  
> 
> Will 32 bits be enough, or should we start off with u64?  It will 
> probably be fine, but since we're setting a uapi thing here we probably 
> want to be sure we won't need to change it in the future.

Ah, if only variable size integer types from Olek were ready :(

Unfortunately there is no bf64 today, so we'd either have to add a
soon-to-be-deprecated bf64 or hold off waiting for Olek...
I reckon the dumb thing of merging bf32 may be the best choice right
now :(
Jiri Pirko Dec. 6, 2022, 8:52 a.m. UTC | #4
Tue, Dec 06, 2022 at 03:02:34AM CET, kuba@kernel.org wrote:
>On Mon, 5 Dec 2022 15:37:26 -0800 Shannon Nelson wrote:
>> >   enum devlink_port_function_attr {
>> >          DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
>> >          DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,     /* binary */
>> >          DEVLINK_PORT_FN_ATTR_STATE,     /* u8 */
>> >          DEVLINK_PORT_FN_ATTR_OPSTATE,   /* u8 */
>> > +       DEVLINK_PORT_FN_ATTR_CAPS,      /* bitfield32 */  
>> 
>> Will 32 bits be enough, or should we start off with u64?  It will 
>> probably be fine, but since we're setting a uapi thing here we probably 
>> want to be sure we won't need to change it in the future.
>
>Ah, if only variable size integer types from Olek were ready :(

Or, if the bitfield was variable length from the beginning (as I asked
for :)).


>
>Unfortunately there is no bf64 today, so we'd either have to add soon
>to be deprecated bf64 or hold off waiting for Olek...
>I reckon the dumb thing of merging bf32 may be the best choice right
>now :(

+1
diff mbox series

Patch

diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 2c637f4aae8e..c3302d23e480 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -110,7 +110,7 @@  devlink ports for both the controllers.
 Function configuration
 ======================
 
-A user can configure the function attribute before enumerating the PCI
+Users can configure one or more function attributes before enumerating the PCI
 function. Usually it means, user should configure function attribute
 before a bus specific device for the function is created. However, when
 SRIOV is enabled, virtual function devices are created on the PCI bus.
@@ -122,6 +122,9 @@  A user may set the hardware address of the function using
 `devlink port function set hw_addr` command. For Ethernet port function
 this means a MAC address.
 
+Users may also set the RoCE capability of the function using
+`devlink port function set roce` command.
+
 Function attributes
 ===================
 
@@ -162,6 +165,35 @@  device created for the PCI VF/SF.
       function:
         hw_addr 00:00:00:00:88:88
 
+RoCE capability setup
+---------------------
+Not all PCI VFs/SFs require RoCE capability.
+
+When RoCE capability is disabled, it saves system memory per PCI VF/SF.
+
+When a user disables RoCE capability for a VF/SF, user applications cannot send
+or receive any RoCE packets through this VF/SF, and the RoCE GID table for this
+PCI function will be empty.
+
+When RoCE capability is disabled in the device using port function attribute,
+VF/SF driver cannot override it.
+
+- Get RoCE capability of the VF device::
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 roce enable
+
+- Set RoCE capability of the VF device::
+
+    $ devlink port function set pci/0000:06:00.0/2 roce disable
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 roce disable
+
 Subfunction
 ============
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5f6eca5e4a40..20306fb8a1d9 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1451,6 +1451,25 @@  struct devlink_ops {
 	int (*port_function_hw_addr_set)(struct devlink_port *port,
 					 const u8 *hw_addr, int hw_addr_len,
 					 struct netlink_ext_ack *extack);
+	/**
+	 * @port_function_roce_get: Port function's roce get function.
+	 *
+	 * Query RoCE state of a function managed by the devlink port.
+	 * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+	 */
+	int (*port_function_roce_get)(struct devlink_port *devlink_port,
+				      bool *is_enable,
+				      struct netlink_ext_ack *extack);
+	/**
+	 * @port_function_roce_set: Port function's roce set function.
+	 *
+	 * Enable/Disable the RoCE state of a function managed by the devlink
+	 * port.
+	 * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+	 */
+	int (*port_function_roce_set)(struct devlink_port *devlink_port,
+				      bool enable,
+				      struct netlink_ext_ack *extack);
 	/**
 	 * port_new() - Add a new port function of a specified flavor
 	 * @devlink: Devlink instance
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 70191d96af89..6cc2925bd478 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -658,11 +658,21 @@  enum devlink_resource_unit {
 	DEVLINK_RESOURCE_UNIT_ENTRY,
 };
 
+enum devlink_port_fn_attr_cap {
+	DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT,
+
+	/* Add new caps above */
+	__DEVLINK_PORT_FN_ATTR_CAPS_MAX,
+};
+
+#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT)
+
 enum devlink_port_function_attr {
 	DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
 	DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR,	/* binary */
 	DEVLINK_PORT_FN_ATTR_STATE,	/* u8 */
 	DEVLINK_PORT_FN_ATTR_OPSTATE,	/* u8 */
+	DEVLINK_PORT_FN_ATTR_CAPS,	/* bitfield32 */
 
 	__DEVLINK_PORT_FUNCTION_ATTR_MAX,
 	DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2b6e11277837..5c4d3abd7677 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -195,11 +195,16 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
 
+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
+	(_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
+
 static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
 	[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
 	[DEVLINK_PORT_FN_ATTR_STATE] =
 		NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
 				 DEVLINK_PORT_FN_STATE_ACTIVE),
+	[DEVLINK_PORT_FN_ATTR_CAPS] =
+		NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
 };
 
 static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
@@ -692,6 +697,60 @@  devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
 	return 0;
 }
 
+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
+				     u32 cap, bool is_enable)
+{
+	caps->selector |= cap;
+	if (is_enable)
+		caps->value |= cap;
+}
+
+static int devlink_port_fn_roce_fill(const struct devlink_ops *ops,
+				     struct devlink_port *devlink_port,
+				     struct nla_bitfield32 *caps,
+				     struct netlink_ext_ack *extack)
+{
+	bool is_enable;
+	int err;
+
+	if (!ops->port_function_roce_get)
+		return 0;
+
+	err = ops->port_function_roce_get(devlink_port, &is_enable, extack);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
+	return 0;
+}
+
+static int devlink_port_fn_caps_fill(const struct devlink_ops *ops,
+				     struct devlink_port *devlink_port,
+				     struct sk_buff *msg,
+				     struct netlink_ext_ack *extack,
+				     bool *msg_updated)
+{
+	struct nla_bitfield32 caps = {};
+	int err;
+
+	err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack);
+	if (err)
+		return err;
+
+	if (!caps.selector)
+		return 0;
+	err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
+				 caps.selector);
+	if (err)
+		return err;
+
+	*msg_updated = true;
+	return 0;
+}
+
 static int
 devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
 				  struct genl_info *info,
@@ -1275,6 +1334,35 @@  static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
 	return 0;
 }
 
+static int
+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
+			 struct netlink_ext_ack *extack)
+{
+	const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+	return ops->port_function_roce_set(devlink_port, enable, extack);
+}
+
+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
+				    const struct nlattr *attr,
+				    struct netlink_ext_ack *extack)
+{
+	struct nla_bitfield32 caps;
+	u32 caps_value;
+	int err;
+
+	caps = nla_get_bitfield32(attr);
+	caps_value = caps.value & caps.selector;
+	if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
+		err = devlink_port_fn_roce_set(devlink_port,
+					       caps_value & DEVLINK_PORT_FN_CAP_ROCE,
+					       extack);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int
 devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
 				   struct netlink_ext_ack *extack)
@@ -1293,6 +1381,10 @@  devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
 					   &msg_updated);
 	if (err)
 		goto out;
+	err = devlink_port_fn_caps_fill(ops, port, msg, extack,
+					&msg_updated);
+	if (err)
+		goto out;
 	err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
 out:
 	if (err || !msg_updated)
@@ -1665,6 +1757,7 @@  static int devlink_port_function_validate(struct devlink_port *devlink_port,
 					  struct netlink_ext_ack *extack)
 {
 	const struct devlink_ops *ops = devlink_port->devlink->ops;
+	struct nlattr *attr;
 
 	if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
 	    !ops->port_function_hw_addr_set) {
@@ -1677,6 +1770,18 @@  static int devlink_port_function_validate(struct devlink_port *devlink_port,
 				   "Function does not support state setting");
 		return -EOPNOTSUPP;
 	}
+	attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+	if (attr) {
+		struct nla_bitfield32 caps;
+
+		caps = nla_get_bitfield32(attr);
+		if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
+		    !ops->port_function_roce_set) {
+			NL_SET_ERR_MSG_ATTR(extack, attr,
+					    "Port doesn't support RoCE function attribute");
+			return -EOPNOTSUPP;
+		}
+	}
 	return 0;
 }
 
@@ -1704,6 +1809,14 @@  static int devlink_port_function_set(struct devlink_port *port,
 		if (err)
 			return err;
 	}
+
+	attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+	if (attr) {
+		err = devlink_port_fn_caps_set(port, attr, extack);
+		if (err)
+			return err;
+	}
+
 	/* Keep this as the last function attribute set, so that when
 	 * multiple port function attributes are set along with state,
 	 * Those can be applied first before activating the state.