
[net-next,04/13] net/sched: taprio: allow user input of per-tc max SDU

Message ID 20220914153303.1792444-5-vladimir.oltean@nxp.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series Add tc-taprio support for queueMaxSDU

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4383 this patch: 4383
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 1066 this patch: 1066
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4574 this patch: 4574
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 218 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Vladimir Oltean Sept. 14, 2022, 3:32 p.m. UTC
IEEE 802.1Q clause 12.29.1.1 "The queueMaxSDUTable structure and data
types" and 8.6.8.4 "Enhancements for scheduled traffic" talk about the
existence of a per traffic class limitation of maximum frame sizes, with
a fallback on the port-based MTU.

As far as I am able to understand, the 802.1Q Service Data Unit (SDU)
represents the MAC Service Data Unit (MSDU, i.e. L2 payload), excluding
any number of prepended VLAN headers which may be otherwise present in
the MSDU. Therefore, the queueMaxSDU is directly comparable to the
device MTU (a queueMaxSDU of 1500 means L2 payloads of up to 1500 octets
are accepted, i.e. frame sizes of up to 1518 octets, or 1522 octets with
one VLAN header). Drivers which offload this are directly responsible
for translating it into other units of measurement.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 include/net/pkt_sched.h        |   1 +
 include/uapi/linux/pkt_sched.h |  11 +++
 net/sched/sch_taprio.c         | 122 ++++++++++++++++++++++++++++++++-
 3 files changed, 133 insertions(+), 1 deletion(-)

Comments

Vinicius Costa Gomes Sept. 14, 2022, 9:43 p.m. UTC | #1
Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> IEEE 802.1Q clause 12.29.1.1 "The queueMaxSDUTable structure and data
> types" and 8.6.8.4 "Enhancements for scheduled traffic" talk about the
> existence of a per traffic class limitation of maximum frame sizes, with
> a fallback on the port-based MTU.
>
> As far as I am able to understand, the 802.1Q Service Data Unit (SDU)
> represents the MAC Service Data Unit (MSDU, i.e. L2 payload), excluding
> any number of prepended VLAN headers which may be otherwise present in
> the MSDU. Therefore, the queueMaxSDU is directly comparable to the
> device MTU (1500 means L2 payload sizes are accepted, or frame sizes of
> 1518 octets, or 1522 plus one VLAN header). Drivers which offload this
> are directly responsible of translating into other units of measurement.
>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  include/net/pkt_sched.h        |   1 +
>  include/uapi/linux/pkt_sched.h |  11 +++
>  net/sched/sch_taprio.c         | 122 ++++++++++++++++++++++++++++++++-
>  3 files changed, 133 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index 29f65632ebc5..88080998557b 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -168,6 +168,7 @@ struct tc_taprio_qopt_offload {
>  	ktime_t base_time;
>  	u64 cycle_time;
>  	u64 cycle_time_extension;
> +	u32 max_sdu[TC_MAX_QUEUE];
>  
>  	size_t num_entries;
>  	struct tc_taprio_sched_entry entries[];
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index f292b467b27f..000eec106856 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -1232,6 +1232,16 @@ enum {
>  #define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST	_BITUL(0)
>  #define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD	_BITUL(1)
>  
> +enum {
> +	TCA_TAPRIO_TC_ENTRY_UNSPEC,
> +	TCA_TAPRIO_TC_ENTRY_INDEX,		/* u32 */
> +	TCA_TAPRIO_TC_ENTRY_MAX_SDU,		/* u32 */
> +
> +	/* add new constants above here */
> +	__TCA_TAPRIO_TC_ENTRY_CNT,
> +	TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1)
> +};
> +
>  enum {
>  	TCA_TAPRIO_ATTR_UNSPEC,
>  	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
> @@ -1245,6 +1255,7 @@ enum {
>  	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
>  	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
>  	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
> +	TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */
>  	__TCA_TAPRIO_ATTR_MAX,
>  };
>  
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index 2a4b8f59f444..834cbed88e4f 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -79,6 +79,7 @@ struct taprio_sched {
>  	struct sched_gate_list __rcu *admin_sched;
>  	struct hrtimer advance_timer;
>  	struct list_head taprio_list;
> +	u32 max_sdu[TC_MAX_QUEUE];
>  	u32 txtime_delay;
>  };
>  
> @@ -416,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>  			      struct Qdisc *child, struct sk_buff **to_free)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> +	struct net_device *dev = qdisc_dev(sch);
> +	int prio = skb->priority;
> +	u8 tc;
>  
>  	/* sk_flags are only safe to use on full sockets. */
>  	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
> @@ -427,6 +431,12 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>  			return qdisc_drop(skb, sch, to_free);
>  	}
>  
> +	/* Devices with full offload are expected to honor this in hardware */
> +	tc = netdev_get_prio_tc_map(dev, prio);
> +	if (q->max_sdu[tc] &&
> +	    q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
> +		return qdisc_drop(skb, sch, to_free);
> +

One minor idea: perhaps if you initialize q->max_sdu[] with a value that
you could use to compare against here (2^32 - 1), this comparison could
be simplified. The issue is that that value would become invalid as a
maximum SDU; not a problem for Ethernet.
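A minimal sketch of that suggestion, for illustration only (the init
helper is hypothetical, and U32_MAX is assumed to be reserved as a
"no limit" marker):

/* Sketch, not code from this patch: initialize the per-TC limits to
 * U32_MAX at qdisc init time, so that "unset" needs no separate test.
 */
static void taprio_init_max_sdu(struct taprio_sched *q)
{
	int tc;

	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
		q->max_sdu[tc] = U32_MAX;	/* unset == no per-TC limit */
}

/* The check in taprio_enqueue_one() would then reduce to: */
if (q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
	return qdisc_drop(skb, sch, to_free);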

>  	qdisc_qstats_backlog_inc(sch, skb);
>  	sch->q.qlen++;
>  
> @@ -761,6 +771,11 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
>  	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
>  };
>  
> +static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
> +	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = { .type = NLA_U32 },
> +	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
> +};
> +
>  static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
>  	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
>  		.len = sizeof(struct tc_mqprio_qopt)
> @@ -773,6 +788,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
>  	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
>  	[TCA_TAPRIO_ATTR_FLAGS]                      = { .type = NLA_U32 },
>  	[TCA_TAPRIO_ATTR_TXTIME_DELAY]		     = { .type = NLA_U32 },
> +	[TCA_TAPRIO_ATTR_TC_ENTRY]		     = { .type = NLA_NESTED },
>  };
>  
>  static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
> @@ -1236,7 +1252,7 @@ static int taprio_enable_offload(struct net_device *dev,
>  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
>  	struct tc_taprio_qopt_offload *offload;
> -	int err = 0;
> +	int tc, err = 0;
>  
>  	if (!ops->ndo_setup_tc) {
>  		NL_SET_ERR_MSG(extack,
> @@ -1253,6 +1269,9 @@ static int taprio_enable_offload(struct net_device *dev,
>  	offload->enable = 1;
>  	taprio_sched_to_offload(dev, sched, offload);
>  
> +	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
> +		offload->max_sdu[tc] = q->max_sdu[tc];
> +
>  	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
>  	if (err < 0) {
>  		NL_SET_ERR_MSG(extack,
> @@ -1387,6 +1406,73 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
>  	return err;
>  }
>  
> +static int taprio_parse_tc_entry(struct Qdisc *sch,
> +				 struct nlattr *opt,
> +				 unsigned long *seen_tcs,
> +				 struct netlink_ext_ack *extack)
> +{
> +	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
> +	struct taprio_sched *q = qdisc_priv(sch);
> +	struct net_device *dev = qdisc_dev(sch);
> +	u32 max_sdu = 0;
> +	int err, tc;
> +
> +	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
> +			       taprio_tc_policy, extack);
> +	if (err < 0)
> +		return err;
> +
> +	if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
> +		return -EINVAL;
> +	}
> +
> +	tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
> +	if (tc >= TC_QOPT_MAX_QUEUE) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
> +		return -ERANGE;
> +	}
> +
> +	if (*seen_tcs & BIT(tc)) {
> +		NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
> +		return -EINVAL;
> +	}
> +
> +	*seen_tcs |= BIT(tc);
> +
> +	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
> +		max_sdu = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
> +
> +	if (max_sdu > dev->max_mtu) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
> +		return -ERANGE;
> +	}
> +
> +	q->max_sdu[tc] = max_sdu;
> +
> +	return 0;
> +}
> +
> +static int taprio_parse_tc_entries(struct Qdisc *sch,
> +				   struct nlattr *opt,
> +				   struct netlink_ext_ack *extack)
> +{
> +	unsigned long seen_tcs = 0;
> +	struct nlattr *n;
> +	int err = 0, rem;
> +
> +	nla_for_each_nested(n, opt, rem) {
> +		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
> +			continue;
> +
> +		err = taprio_parse_tc_entry(sch, n, &seen_tcs, extack);
> +		if (err)
> +			break;
> +	}
> +
> +	return err;
> +}
> +
>  static int taprio_mqprio_cmp(const struct net_device *dev,
>  			     const struct tc_mqprio_qopt *mqprio)
>  {
> @@ -1465,6 +1551,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
>  	if (err < 0)
>  		return err;
>  
> +	err = taprio_parse_tc_entries(sch, opt, extack);
> +	if (err)
> +		return err;
> +
>  	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
>  	if (!new_admin) {
>  		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
> @@ -1855,6 +1945,33 @@ static int dump_schedule(struct sk_buff *msg,
>  	return -1;
>  }
>  
> +static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
> +{
> +	struct nlattr *n;
> +	int tc;
> +
> +	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
> +		n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
> +		if (!n)
> +			return -EMSGSIZE;
> +
> +		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
> +			goto nla_put_failure;
> +
> +		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
> +				q->max_sdu[tc]))
> +			goto nla_put_failure;
> +
> +		nla_nest_end(skb, n);
> +	}
> +
> +	return 0;
> +
> +nla_put_failure:
> +	nla_nest_cancel(skb, n);
> +	return -EMSGSIZE;
> +}
> +
>  static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> @@ -1894,6 +2011,9 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
>  		goto options_error;
>  
> +	if (taprio_dump_tc_entries(q, skb))
> +		goto options_error;
> +
>  	if (oper && dump_schedule(skb, oper))
>  		goto options_error;
>  
> -- 
> 2.34.1
>
Vladimir Oltean Sept. 14, 2022, 10:10 p.m. UTC | #2
On Wed, Sep 14, 2022 at 02:43:02PM -0700, Vinicius Costa Gomes wrote:
> > @@ -416,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
> >  			      struct Qdisc *child, struct sk_buff **to_free)
> >  {
> >  	struct taprio_sched *q = qdisc_priv(sch);
> > +	struct net_device *dev = qdisc_dev(sch);
> > +	int prio = skb->priority;
> > +	u8 tc;
> >  
> >  	/* sk_flags are only safe to use on full sockets. */
> >  	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
> > @@ -427,6 +431,12 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
> >  			return qdisc_drop(skb, sch, to_free);
> >  	}
> >  
> > +	/* Devices with full offload are expected to honor this in hardware */
> > +	tc = netdev_get_prio_tc_map(dev, prio);
> > +	if (q->max_sdu[tc] &&
> > +	    q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
> > +		return qdisc_drop(skb, sch, to_free);
> > +
> 
> One minor idea, perhaps if you initialize q->max_sdu[] with a value that
> you could use to compare here (2^32 - 1), this comparison could be
> simplified. The issue is that that value would become invalid for a
> maximum SDU, not a problem for ethernet.

Could do (and the fact that U32_MAX becomes a reserved value shouldn't
be a problem for any linklayer), but if I optimize the code for this one
place, I need, in turn, to increase the complexity in the netlink dump
and in the offload procedures, to hide what I've done.

If I look at the difference in generated code, maybe it's worth it
(I get rid of a "cbz" instruction). Maybe it's worth simply creating a
shadow array of q->max_sdu[], one which is also adjusted for something
like dev->hard_header_len (also a fast path invariant)? This way, we
would only need to check q->max_frm_len[tc] against skb->len and save
even more checks in the fast path.
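A rough sketch of that shadow-array idea (the q->max_frm_len[] field and
the helper are hypothetical here, assuming dev->hard_header_len is the
only adjustment needed):

/* Sketch, not code from this series: precompute a per-TC frame length
 * limit whenever max_sdu[] changes, so the fast path compares skb->len
 * directly; U32_MAX means "no limit".
 */
static void taprio_update_max_frm_len(struct taprio_sched *q,
				      struct net_device *dev)
{
	int tc;

	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
		if (q->max_sdu[tc])
			q->max_frm_len[tc] = q->max_sdu[tc] +
					     dev->hard_header_len;
		else
			q->max_frm_len[tc] = U32_MAX;
	}
}

/* taprio_enqueue_one() fast path: */
if (skb->len > q->max_frm_len[tc])
	return qdisc_drop(skb, sch, to_free);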
Vinicius Costa Gomes Sept. 14, 2022, 11 p.m. UTC | #3
Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> On Wed, Sep 14, 2022 at 02:43:02PM -0700, Vinicius Costa Gomes wrote:
>> > @@ -416,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>> >  			      struct Qdisc *child, struct sk_buff **to_free)
>> >  {
>> >  	struct taprio_sched *q = qdisc_priv(sch);
>> > +	struct net_device *dev = qdisc_dev(sch);
>> > +	int prio = skb->priority;
>> > +	u8 tc;
>> >  
>> >  	/* sk_flags are only safe to use on full sockets. */
>> >  	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
>> > @@ -427,6 +431,12 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>> >  			return qdisc_drop(skb, sch, to_free);
>> >  	}
>> >  
>> > +	/* Devices with full offload are expected to honor this in hardware */
>> > +	tc = netdev_get_prio_tc_map(dev, prio);
>> > +	if (q->max_sdu[tc] &&
>> > +	    q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
>> > +		return qdisc_drop(skb, sch, to_free);
>> > +
>> 
>> One minor idea, perhaps if you initialize q->max_sdu[] with a value that
>> you could use to compare here (2^32 - 1), this comparison could be
>> simplified. The issue is that that value would become invalid for a
>> maximum SDU, not a problem for ethernet.
>
> Could do (and the fact that U32_MAX becomes a reserved value shouldn't
> be a problem for any linklayer), but if I optimize the code for this one
> place, I need, in turn, to increase the complexity in the netlink dump
> and in the offload procedures, to hide what I've done.

Hm, I just noticed something.

During parse, the user only sets the max-sdu for the traffic classes she
is interested in. During dump, you are showing all of them, and the unset
ones will be shown as zero. That seems a bit confusing, and could mean
that you would have to add some checks anyway.

For the offload side, you could just document that U32_MAX means unset.

>
> If I look at the difference in generated code, maybe it's worth it
> (I get rid of a "cbz" instruction). Maybe it's worth simply creating a
> shadow array of q->max_sdu[], but which is also adjusted for something
> like dev->hard_header_len (also a fast path invariant)? This way, we
> could only check for q->max_frm_len[tc] > skb->len and save even more
> checks in the fast path.
Vinicius Costa Gomes Sept. 14, 2022, 11:03 p.m. UTC | #4
Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> IEEE 802.1Q clause 12.29.1.1 "The queueMaxSDUTable structure and data
> types" and 8.6.8.4 "Enhancements for scheduled traffic" talk about the
> existence of a per traffic class limitation of maximum frame sizes, with
> a fallback on the port-based MTU.
>
> As far as I am able to understand, the 802.1Q Service Data Unit (SDU)
> represents the MAC Service Data Unit (MSDU, i.e. L2 payload), excluding
> any number of prepended VLAN headers which may be otherwise present in
> the MSDU. Therefore, the queueMaxSDU is directly comparable to the
> device MTU (1500 means L2 payload sizes are accepted, or frame sizes of
> 1518 octets, or 1522 plus one VLAN header). Drivers which offload this
> are directly responsible of translating into other units of measurement.
>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---

Could you please add an example of the 'tc' command syntax you are
thinking about?

Another point to think about: does it make sense to allow 'only' the
max-sdu to be changed, i.e. the user doesn't set a schedule, nor a
map, only the max-sdu information?

>  include/net/pkt_sched.h        |   1 +
>  include/uapi/linux/pkt_sched.h |  11 +++
>  net/sched/sch_taprio.c         | 122 ++++++++++++++++++++++++++++++++-
>  3 files changed, 133 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index 29f65632ebc5..88080998557b 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -168,6 +168,7 @@ struct tc_taprio_qopt_offload {
>  	ktime_t base_time;
>  	u64 cycle_time;
>  	u64 cycle_time_extension;
> +	u32 max_sdu[TC_MAX_QUEUE];
>  
>  	size_t num_entries;
>  	struct tc_taprio_sched_entry entries[];
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index f292b467b27f..000eec106856 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -1232,6 +1232,16 @@ enum {
>  #define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST	_BITUL(0)
>  #define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD	_BITUL(1)
>  
> +enum {
> +	TCA_TAPRIO_TC_ENTRY_UNSPEC,
> +	TCA_TAPRIO_TC_ENTRY_INDEX,		/* u32 */
> +	TCA_TAPRIO_TC_ENTRY_MAX_SDU,		/* u32 */
> +
> +	/* add new constants above here */
> +	__TCA_TAPRIO_TC_ENTRY_CNT,
> +	TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1)
> +};
> +
>  enum {
>  	TCA_TAPRIO_ATTR_UNSPEC,
>  	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
> @@ -1245,6 +1255,7 @@ enum {
>  	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
>  	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
>  	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
> +	TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */
>  	__TCA_TAPRIO_ATTR_MAX,
>  };
>  
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index 2a4b8f59f444..834cbed88e4f 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -79,6 +79,7 @@ struct taprio_sched {
>  	struct sched_gate_list __rcu *admin_sched;
>  	struct hrtimer advance_timer;
>  	struct list_head taprio_list;
> +	u32 max_sdu[TC_MAX_QUEUE];
>  	u32 txtime_delay;
>  };
>  
> @@ -416,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>  			      struct Qdisc *child, struct sk_buff **to_free)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> +	struct net_device *dev = qdisc_dev(sch);
> +	int prio = skb->priority;
> +	u8 tc;
>  
>  	/* sk_flags are only safe to use on full sockets. */
>  	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
> @@ -427,6 +431,12 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
>  			return qdisc_drop(skb, sch, to_free);
>  	}
>  
> +	/* Devices with full offload are expected to honor this in hardware */
> +	tc = netdev_get_prio_tc_map(dev, prio);
> +	if (q->max_sdu[tc] &&
> +	    q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
> +		return qdisc_drop(skb, sch, to_free);
> +
>  	qdisc_qstats_backlog_inc(sch, skb);
>  	sch->q.qlen++;
>  
> @@ -761,6 +771,11 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
>  	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
>  };
>  
> +static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
> +	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = { .type = NLA_U32 },
> +	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
> +};
> +
>  static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
>  	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
>  		.len = sizeof(struct tc_mqprio_qopt)
> @@ -773,6 +788,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
>  	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
>  	[TCA_TAPRIO_ATTR_FLAGS]                      = { .type = NLA_U32 },
>  	[TCA_TAPRIO_ATTR_TXTIME_DELAY]		     = { .type = NLA_U32 },
> +	[TCA_TAPRIO_ATTR_TC_ENTRY]		     = { .type = NLA_NESTED },
>  };
>  
>  static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
> @@ -1236,7 +1252,7 @@ static int taprio_enable_offload(struct net_device *dev,
>  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
>  	struct tc_taprio_qopt_offload *offload;
> -	int err = 0;
> +	int tc, err = 0;
>  
>  	if (!ops->ndo_setup_tc) {
>  		NL_SET_ERR_MSG(extack,
> @@ -1253,6 +1269,9 @@ static int taprio_enable_offload(struct net_device *dev,
>  	offload->enable = 1;
>  	taprio_sched_to_offload(dev, sched, offload);
>  
> +	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
> +		offload->max_sdu[tc] = q->max_sdu[tc];
> +
>  	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
>  	if (err < 0) {
>  		NL_SET_ERR_MSG(extack,
> @@ -1387,6 +1406,73 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
>  	return err;
>  }
>  
> +static int taprio_parse_tc_entry(struct Qdisc *sch,
> +				 struct nlattr *opt,
> +				 unsigned long *seen_tcs,
> +				 struct netlink_ext_ack *extack)
> +{
> +	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
> +	struct taprio_sched *q = qdisc_priv(sch);
> +	struct net_device *dev = qdisc_dev(sch);
> +	u32 max_sdu = 0;
> +	int err, tc;
> +
> +	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
> +			       taprio_tc_policy, extack);
> +	if (err < 0)
> +		return err;
> +
> +	if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
> +		return -EINVAL;
> +	}
> +
> +	tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
> +	if (tc >= TC_QOPT_MAX_QUEUE) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
> +		return -ERANGE;
> +	}
> +
> +	if (*seen_tcs & BIT(tc)) {
> +		NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
> +		return -EINVAL;
> +	}
> +
> +	*seen_tcs |= BIT(tc);
> +
> +	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
> +		max_sdu = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
> +
> +	if (max_sdu > dev->max_mtu) {
> +		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
> +		return -ERANGE;
> +	}
> +
> +	q->max_sdu[tc] = max_sdu;
> +
> +	return 0;
> +}
> +
> +static int taprio_parse_tc_entries(struct Qdisc *sch,
> +				   struct nlattr *opt,
> +				   struct netlink_ext_ack *extack)
> +{
> +	unsigned long seen_tcs = 0;
> +	struct nlattr *n;
> +	int err = 0, rem;
> +
> +	nla_for_each_nested(n, opt, rem) {
> +		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
> +			continue;
> +
> +		err = taprio_parse_tc_entry(sch, n, &seen_tcs, extack);
> +		if (err)
> +			break;
> +	}
> +
> +	return err;
> +}
> +
>  static int taprio_mqprio_cmp(const struct net_device *dev,
>  			     const struct tc_mqprio_qopt *mqprio)
>  {
> @@ -1465,6 +1551,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
>  	if (err < 0)
>  		return err;
>  
> +	err = taprio_parse_tc_entries(sch, opt, extack);
> +	if (err)
> +		return err;
> +
>  	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
>  	if (!new_admin) {
>  		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
> @@ -1855,6 +1945,33 @@ static int dump_schedule(struct sk_buff *msg,
>  	return -1;
>  }
>  
> +static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
> +{
> +	struct nlattr *n;
> +	int tc;
> +
> +	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
> +		n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
> +		if (!n)
> +			return -EMSGSIZE;
> +
> +		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
> +			goto nla_put_failure;
> +
> +		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
> +				q->max_sdu[tc]))
> +			goto nla_put_failure;
> +
> +		nla_nest_end(skb, n);
> +	}
> +
> +	return 0;
> +
> +nla_put_failure:
> +	nla_nest_cancel(skb, n);
> +	return -EMSGSIZE;
> +}
> +
>  static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> @@ -1894,6 +2011,9 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
>  		goto options_error;
>  
> +	if (taprio_dump_tc_entries(q, skb))
> +		goto options_error;
> +
>  	if (oper && dump_schedule(skb, oper))
>  		goto options_error;
>  
> -- 
> 2.34.1
>
Vladimir Oltean Sept. 14, 2022, 11:03 p.m. UTC | #5
On Wed, Sep 14, 2022 at 04:00:07PM -0700, Vinicius Costa Gomes wrote:
> Hm, I just noticed something.
> 
> During parse the user only sets the max-sdu for the traffic classes she
> is interested on. During dump you are showing all of them, the unset
> ones will be shown as zero, that seems a bit confusing, which could mean
> that you would have to add some checks anyway.
> 
> For the offload side, you could just document that U32_MAX means unset.

Yes, choosing '0' rather than some other value to mean 'default to port
MTU' was intentional. It is also in line with what other places, like the
YANG models, expect to see:
https://github.com/YangModels/yang/blob/main/standard/ieee/draft/802.1/Qcw/ieee802-dot1q-sched.yang#L128
Vladimir Oltean Sept. 14, 2022, 11:11 p.m. UTC | #6
On Wed, Sep 14, 2022 at 04:03:26PM -0700, Vinicius Costa Gomes wrote:
> Could you please add an example of the 'tc' command syntax you are
> thinking about?

I was working with this, as a matter of fact:

#!/bin/bash

h1=eno0
h2=eno2
swp1=swp0
swp2=swp4

ip link set $h2 address 00:01:02:03:04:05
ip link set $h1 up
ip link set $h2 up
tc qdisc del dev $swp1 root || :
ip link del br0 || :

ip link add br0 type bridge && ip link set br0 up
ip link set $swp1 master br0 && ip link set $swp1 up
ip link set $swp2 master br0 && ip link set $swp2 up
tc qdisc replace dev $swp1 root taprio \
	num_tc 8 \
	map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
	base-time 0 \
	sched-entry S 0x7f 990000 \
	sched-entry S 0x80  10000 \
	max-sdu 0 0 0 0 0 0 0 200 \
	flags 0x2

echo "Run:"
echo "isochron rcv --interface $h1 &"
echo "isochron send --interface $h2 --client 127.0.0.1 --cycle-time 0.001 --frame-size 60 --omit-sync --num-frames 10 --priority 7 --vid 0"

I've also tested it in software mode, and at least on my system, a 10 us
interval is not really enough for the qdisc to make forward progress and
dequeue any packet. My application freezes until I destroy the qdisc and
re-create it using a larger interval for TC7. I'll debug that some more.
I was thinking, after the basic queueMaxSDU feature is merged, maybe
taprio could automatically further limit queueMaxSDU based on the
smallest intervals from the schedule. Something like a generalization of
vsc9959_tas_guard_bands_update(), basically.
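A rough sketch of what such an automatic clamp could look like (the
helper and its link_speed_bps parameter are hypothetical; it assumes the
whole frame must fit within the shortest open interval and ignores VLAN
tags, preamble and IPG):

/* Sketch, not part of this series: bound the per-TC max SDU by the
 * shortest gate interval during which this TC's gate is open.
 */
static u32 taprio_max_sdu_from_schedule(struct sched_gate_list *sched,
					int tc, u64 link_speed_bps)
{
	u64 min_open_ns = U64_MAX, max_bytes;
	struct sched_entry *entry;

	list_for_each_entry(entry, &sched->entries, list)
		if (entry->gate_mask & BIT(tc))
			min_open_ns = min_t(u64, min_open_ns, entry->interval);

	if (min_open_ns == U64_MAX)
		return 0;	/* gate never open for this TC */

	/* octets transmittable during the shortest open interval */
	max_bytes = div_u64(min_open_ns * div_u64(link_speed_bps, 8),
			    NSEC_PER_SEC);

	if (max_bytes <= ETH_HLEN + ETH_FCS_LEN)
		return 0;

	/* convert from frame length to SDU (L2 payload) length */
	return min_t(u64, max_bytes - ETH_HLEN - ETH_FCS_LEN, U32_MAX);
}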

> Another point to think about: does it make sense to allow 'only' the
> max-sdu to be changed, i.e. the user doesn't set an a schedule, nor a
> map, only the max-sdu information.

Maybe, I haven't tried. I think iproute2 doesn't currently support
skipping TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST.
Vinicius Costa Gomes Sept. 14, 2022, 11:17 p.m. UTC | #7
Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> On Wed, Sep 14, 2022 at 04:00:07PM -0700, Vinicius Costa Gomes wrote:
>> Hm, I just noticed something.
>> 
>> During parse the user only sets the max-sdu for the traffic classes she
>> is interested on. During dump you are showing all of them, the unset
>> ones will be shown as zero, that seems a bit confusing, which could mean
>> that you would have to add some checks anyway.
>> 
>> For the offload side, you could just document that U32_MAX means unset.
>
> Yes, choosing '0' rather than other value, to mean 'default to port MTU'
> was intentional. It is also in line with what other places, like the
> YANG models, expect to see:
> https://github.com/YangModels/yang/blob/main/standard/ieee/draft/802.1/Qcw/ieee802-dot1q-sched.yang#L128

Oh, I see. My bad. So only the earlier comment about making the
comparison simpler is still valid.


Cheers,

Patch

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 29f65632ebc5..88080998557b 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -168,6 +168,7 @@  struct tc_taprio_qopt_offload {
 	ktime_t base_time;
 	u64 cycle_time;
 	u64 cycle_time_extension;
+	u32 max_sdu[TC_MAX_QUEUE];
 
 	size_t num_entries;
 	struct tc_taprio_sched_entry entries[];
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f292b467b27f..000eec106856 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1232,6 +1232,16 @@  enum {
 #define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST	_BITUL(0)
 #define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD	_BITUL(1)
 
+enum {
+	TCA_TAPRIO_TC_ENTRY_UNSPEC,
+	TCA_TAPRIO_TC_ENTRY_INDEX,		/* u32 */
+	TCA_TAPRIO_TC_ENTRY_MAX_SDU,		/* u32 */
+
+	/* add new constants above here */
+	__TCA_TAPRIO_TC_ENTRY_CNT,
+	TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1)
+};
+
 enum {
 	TCA_TAPRIO_ATTR_UNSPEC,
 	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
@@ -1245,6 +1255,7 @@  enum {
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
 	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
 	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
+	TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 2a4b8f59f444..834cbed88e4f 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -79,6 +79,7 @@  struct taprio_sched {
 	struct sched_gate_list __rcu *admin_sched;
 	struct hrtimer advance_timer;
 	struct list_head taprio_list;
+	u32 max_sdu[TC_MAX_QUEUE];
 	u32 txtime_delay;
 };
 
@@ -416,6 +417,9 @@  static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
 			      struct Qdisc *child, struct sk_buff **to_free)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	int prio = skb->priority;
+	u8 tc;
 
 	/* sk_flags are only safe to use on full sockets. */
 	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
@@ -427,6 +431,12 @@  static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
 			return qdisc_drop(skb, sch, to_free);
 	}
 
+	/* Devices with full offload are expected to honor this in hardware */
+	tc = netdev_get_prio_tc_map(dev, prio);
+	if (q->max_sdu[tc] &&
+	    q->max_sdu[tc] < max_t(int, 0, skb->len - skb_mac_header_len(skb)))
+		return qdisc_drop(skb, sch, to_free);
+
 	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
@@ -761,6 +771,11 @@  static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
 	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
 };
 
+static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
+	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = { .type = NLA_U32 },
+	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
+};
+
 static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
 	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
 		.len = sizeof(struct tc_mqprio_qopt)
@@ -773,6 +788,7 @@  static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
 	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
 	[TCA_TAPRIO_ATTR_FLAGS]                      = { .type = NLA_U32 },
 	[TCA_TAPRIO_ATTR_TXTIME_DELAY]		     = { .type = NLA_U32 },
+	[TCA_TAPRIO_ATTR_TC_ENTRY]		     = { .type = NLA_NESTED },
 };
 
 static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
@@ -1236,7 +1252,7 @@  static int taprio_enable_offload(struct net_device *dev,
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct tc_taprio_qopt_offload *offload;
-	int err = 0;
+	int tc, err = 0;
 
 	if (!ops->ndo_setup_tc) {
 		NL_SET_ERR_MSG(extack,
@@ -1253,6 +1269,9 @@  static int taprio_enable_offload(struct net_device *dev,
 	offload->enable = 1;
 	taprio_sched_to_offload(dev, sched, offload);
 
+	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
+		offload->max_sdu[tc] = q->max_sdu[tc];
+
 	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
 	if (err < 0) {
 		NL_SET_ERR_MSG(extack,
@@ -1387,6 +1406,73 @@  static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
 	return err;
 }
 
+static int taprio_parse_tc_entry(struct Qdisc *sch,
+				 struct nlattr *opt,
+				 unsigned long *seen_tcs,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	u32 max_sdu = 0;
+	int err, tc;
+
+	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
+			       taprio_tc_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
+		NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
+		return -EINVAL;
+	}
+
+	tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
+	if (tc >= TC_QOPT_MAX_QUEUE) {
+		NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
+		return -ERANGE;
+	}
+
+	if (*seen_tcs & BIT(tc)) {
+		NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
+		return -EINVAL;
+	}
+
+	*seen_tcs |= BIT(tc);
+
+	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
+		max_sdu = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
+
+	if (max_sdu > dev->max_mtu) {
+		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
+		return -ERANGE;
+	}
+
+	q->max_sdu[tc] = max_sdu;
+
+	return 0;
+}
+
+static int taprio_parse_tc_entries(struct Qdisc *sch,
+				   struct nlattr *opt,
+				   struct netlink_ext_ack *extack)
+{
+	unsigned long seen_tcs = 0;
+	struct nlattr *n;
+	int err = 0, rem;
+
+	nla_for_each_nested(n, opt, rem) {
+		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
+			continue;
+
+		err = taprio_parse_tc_entry(sch, n, &seen_tcs, extack);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
 static int taprio_mqprio_cmp(const struct net_device *dev,
 			     const struct tc_mqprio_qopt *mqprio)
 {
@@ -1465,6 +1551,10 @@  static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		return err;
 
+	err = taprio_parse_tc_entries(sch, opt, extack);
+	if (err)
+		return err;
+
 	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
 	if (!new_admin) {
 		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
@@ -1855,6 +1945,33 @@  static int dump_schedule(struct sk_buff *msg,
 	return -1;
 }
 
+static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
+{
+	struct nlattr *n;
+	int tc;
+
+	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
+		n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
+		if (!n)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
+				q->max_sdu[tc]))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, n);
+	}
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, n);
+	return -EMSGSIZE;
+}
+
 static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
@@ -1894,6 +2011,9 @@  static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;
 
+	if (taprio_dump_tc_entries(q, skb))
+		goto options_error;
+
 	if (oper && dump_schedule(skb, oper))
 		goto options_error;