diff mbox series

[net-next,v1,2/2] net: dsa: microchip: add ETS Qdisc support for KSZ9477 series

Message ID 20230306124940.865233-2-o.rempel@pengutronix.de (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v1,1/2] net: dsa: microchip: add ksz_setup_tc_mode() function | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 214 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Oleksij Rempel March 6, 2023, 12:49 p.m. UTC
Add ETS Qdisc support for KSZ9477 of switches. Current implementation is
limited to strict priority mode.

Tested on KSZ8563R with following configuration:
tc qdisc replace dev lan2 root handle 1: ets strict 4 \
  priomap 3 3 2 2 1 1 0 0
ip link add link lan2 name v1 type vlan id 1 \
  egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7

and patched iperf3 version:
https://github.com/esnet/iperf/pull/1476
iperf3 -c 172.17.0.1 -b100M  -l1472 -t100 -u -R --sock-prio 2

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
---
 drivers/net/dsa/microchip/ksz_common.c | 178 +++++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz_common.h |  12 ++
 2 files changed, 190 insertions(+)

Comments

Vladimir Oltean March 6, 2023, 2:06 p.m. UTC | #1
Hi Oleksij,

On Mon, Mar 06, 2023 at 01:49:40PM +0100, Oleksij Rempel wrote:
> Add ETS Qdisc support for KSZ9477 of switches. Current implementation is
> limited to strict priority mode.
> 
> Tested on KSZ8563R with following configuration:
> tc qdisc replace dev lan2 root handle 1: ets strict 4 \
>   priomap 3 3 2 2 1 1 0 0
> ip link add link lan2 name v1 type vlan id 1 \
>   egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
> 
> and patched iperf3 version:
> https://github.com/esnet/iperf/pull/1476
> iperf3 -c 172.17.0.1 -b100M  -l1472 -t100 -u -R --sock-prio 2
> 
> Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
> ---
>  drivers/net/dsa/microchip/ksz_common.c | 178 +++++++++++++++++++++++++
>  drivers/net/dsa/microchip/ksz_common.h |  12 ++
>  2 files changed, 190 insertions(+)
> 
> diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
> index ae05fe0b0a81..f32ad39c1d8d 100644
> --- a/drivers/net/dsa/microchip/ksz_common.c
> +++ b/drivers/net/dsa/microchip/ksz_common.c
> @@ -3172,12 +3172,190 @@ static int ksz_setup_tc_cbs(struct dsa_switch *ds, int port,
>  				 MTI_SHAPING_SRP);
>  }
>  
> +static int ksz_ets_band_to_queue(struct tc_ets_qopt_offload_replace_params *p,
> +				 int band)
> +{
> +	/* Compared to queues, bands prioritize packets differently. In strict
> +	 * priority mode, the lowest priority is assigned to Queue 0 while the
> +	 * highest priority is given to Band 0.
> +	 */
> +	return p->bands - 1 - band;
> +}
> +
> +static int ksz_queue_set_strict(struct ksz_device *dev, int port, int queue)
> +{
> +	int ret;
> +
> +	/* In order to ensure proper prioritization, it is necessary to set the
> +	 * rate limit for the related queue to zero. Otherwise strict priority
> +	 * mode will not work.
> +	 */
> +	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
> +			  KSZ9477_OUT_RATE_NO_LIMIT);
> +	if (ret)
> +		return ret;
> +
> +	ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
> +	if (ret)
> +		return ret;
> +
> +	return ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_STRICT_PRIO,
> +				 MTI_SHAPING_OFF);
> +}
> +
> +static int ksz_queue_set_wrr(struct ksz_device *dev, int port, int queue,
> +			     int weight)
> +{
> +	int ret;
> +
> +	/* In order to ensure proper prioritization, it is necessary to set the
> +	 * rate limit for the related queue to zero. Otherwise weighted round
> +	 * robin mode will not work.
> +	 */
> +	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
> +			  KSZ9477_OUT_RATE_NO_LIMIT);
> +	if (ret)
> +		return ret;
> +
> +	ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
> +	if (ret)
> +		return ret;
> +
> +	ret = ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_WRR,
> +				MTI_SHAPING_OFF);
> +	if (ret)
> +		return ret;
> +
> +	return ksz_pwrite8(dev, port, KSZ9477_PORT_MTI_QUEUE_CTRL_1, weight);
> +}
> +
> +static int ksz_tc_ets_add(struct ksz_device *dev, int port,
> +			  struct tc_ets_qopt_offload_replace_params *p)
> +{
> +	int ret, band, tc_prio;
> +	u32 queue_map = 0;
> +
> +	/* Configure queue scheduling mode for all bands. Currently only strict
> +	 * prio mode is supported.
> +	 */
> +	for (band = 0; band < p->bands; band++) {
> +		int queue = ksz_ets_band_to_queue(p, band);
> +
> +		ret = ksz_queue_set_strict(dev, port, queue);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	/* Configure the mapping between traffic classes and queues. Note:
> +	 * priomap variable support 16 traffic classes, but the chip can handle
> +	 * only 8 classes.
> +	 */
> +	for (tc_prio = 0; tc_prio < ARRAY_SIZE(p->priomap); tc_prio++) {
> +		int queue;
> +
> +		if (tc_prio > KSZ9477_MAX_TC_PRIO)
> +			break;
> +
> +		queue = ksz_ets_band_to_queue(p, p->priomap[tc_prio]);
> +		queue_map |= queue << (tc_prio * KSZ9477_PORT_TC_MAP_S);
> +	}
> +
> +	return ksz_pwrite32(dev, port, KSZ9477_PORT_MRI_TC_MAP__4, queue_map);
> +}
> +
> +static int ksz_tc_ets_del(struct ksz_device *dev, int port)
> +{
> +	int ret, queue;
> +
> +	/* To restore the default chip configuration, set all queues to use the
> +	 * WRR scheduler with a weight of 1.
> +	 */
> +	for (queue = 0; queue < dev->info->num_tx_queues; queue++) {
> +		ret = ksz_queue_set_wrr(dev, port, queue,
> +					KSZ9477_DEFAULT_WRR_WEIGHT);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	/* Revert the queue mapping for TC-priority to its default setting on
> +	 * the chip.
> +	 */
> +	return ksz_pwrite32(dev, port, KSZ9477_PORT_MRI_TC_MAP__4,
> +			    KSZ9477_DEFAULT_TC_MAP);
> +}
> +
> +static int ksz_tc_ets_validate(struct ksz_device *dev, int port,
> +			       struct tc_ets_qopt_offload_replace_params *p)
> +{
> +	int band;
> +
> +	/* Since it is not feasible to share one port among multiple qdisc,
> +	 * the user must configure all available queues appropriately.
> +	 */
> +	if (p->bands != dev->info->num_tx_queues) {
> +		dev_err(dev->dev, "Not supported amount of bands. It should be %d\n",
> +			dev->info->num_tx_queues);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	for (band = 0; band < p->bands; ++band) {
> +		/* The KSZ switches utilize a weighted round robin configuration
> +		 * where a certain number of packets can be transmitted from a
> +		 * queue before the next queue is serviced. For more information
> +		 * on this, refer to section 5.2.8.4 of the KSZ8565R
> +		 * documentation on the Port Transmit Queue Control 1 Register.
> +		 * However, the current ETS Qdisc implementation (as of February
> +		 * 2023) assigns a weight to each queue based on the number of
> +		 * bytes or extrapolated bandwidth in percentages. Since this
> +		 * differs from the KSZ switches' method and we don't want to
> +		 * fake support by converting bytes to packets, we have decided
> +		 * to return an error instead.
> +		 */
> +		if (p->quanta[band]) {
> +			dev_err(dev->dev, "Quanta/weights configuration is not supported.\n");
> +			return -EOPNOTSUPP;
> +		}

So what does the user gain using tc-ets over tc-mqprio? That has a way
to set up strict prioritization and prio:tc maps as well, and to my
knowledge mqprio is vastly more popular in non-DCB setups than tc-ets.
The only thing is that with mqprio, AFAIK, the round robin between TXQs
belonging to the same traffic class is not weighted.

> +	}
> +
> +	return 0;
> +}
> +
> +static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port,
> +				  struct tc_ets_qopt_offload *qopt)
> +{
> +	struct ksz_device *dev = ds->priv;
> +	int ret;
> +
> +	if (qopt->parent != TC_H_ROOT) {
> +		dev_err(dev->dev, "Parent should be \"root\"\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	switch (qopt->command) {
> +	case TC_ETS_REPLACE:
> +		ret = ksz_tc_ets_validate(dev, port, &qopt->replace_params);
> +		if (ret)
> +			return ret;
> +
> +		return ksz_tc_ets_add(dev, port, &qopt->replace_params);
> +	case TC_ETS_DESTROY:
> +		return ksz_tc_ets_del(dev, port);
> +	case TC_ETS_STATS:
> +	case TC_ETS_GRAFT:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	return -EOPNOTSUPP;
> +}
> +
>  static int ksz_setup_tc(struct dsa_switch *ds, int port,
>  			enum tc_setup_type type, void *type_data)
>  {
>  	switch (type) {
>  	case TC_SETUP_QDISC_CBS:
>  		return ksz_setup_tc_cbs(ds, port, type_data);
> +	case TC_SETUP_QDISC_ETS:
> +		return ksz_tc_setup_qdisc_ets(ds, port, type_data);
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
> index f53834bbe896..7618a4714e06 100644
> --- a/drivers/net/dsa/microchip/ksz_common.h
> +++ b/drivers/net/dsa/microchip/ksz_common.h
> @@ -657,6 +657,15 @@ static inline int is_lan937x(struct ksz_device *dev)
>  #define KSZ8_LEGAL_PACKET_SIZE		1518
>  #define KSZ9477_MAX_FRAME_SIZE		9000
>  
> +#define KSZ9477_REG_PORT_OUT_RATE_0	0x0420
> +#define KSZ9477_OUT_RATE_NO_LIMIT	0
> +
> +#define KSZ9477_PORT_MRI_TC_MAP__4	0x0808
> +#define KSZ9477_DEFAULT_TC_MAP		0x33221100
> +
> +#define KSZ9477_PORT_TC_MAP_S		4
> +#define KSZ9477_MAX_TC_PRIO		7
> +
>  /* CBS related registers */
>  #define REG_PORT_MTI_QUEUE_INDEX__4	0x0900
>  
> @@ -670,6 +679,9 @@ static inline int is_lan937x(struct ksz_device *dev)
>  #define MTI_SHAPING_SRP			1
>  #define MTI_SHAPING_TIME_AWARE		2
>  
> +#define KSZ9477_PORT_MTI_QUEUE_CTRL_1	0x0915
> +#define KSZ9477_DEFAULT_WRR_WEIGHT	1
> +
>  #define REG_PORT_MTI_HI_WATER_MARK	0x0916
>  #define REG_PORT_MTI_LO_WATER_MARK	0x0918
>  
> -- 
> 2.30.2
>
Oleksij Rempel March 6, 2023, 4:35 p.m. UTC | #2
Hi Vladimir,

On Mon, Mar 06, 2023 at 04:06:51PM +0200, Vladimir Oltean wrote:
> Hi Oleksij,
> 
> On Mon, Mar 06, 2023 at 01:49:40PM +0100, Oleksij Rempel wrote:
> > +	for (band = 0; band < p->bands; ++band) {
> > +		/* The KSZ switches utilize a weighted round robin configuration
> > +		 * where a certain number of packets can be transmitted from a
> > +		 * queue before the next queue is serviced. For more information
> > +		 * on this, refer to section 5.2.8.4 of the KSZ8565R
> > +		 * documentation on the Port Transmit Queue Control 1 Register.
> > +		 * However, the current ETS Qdisc implementation (as of February
> > +		 * 2023) assigns a weight to each queue based on the number of
> > +		 * bytes or extrapolated bandwidth in percentages. Since this
> > +		 * differs from the KSZ switches' method and we don't want to
> > +		 * fake support by converting bytes to packets, we have decided
> > +		 * to return an error instead.
> > +		 */
> > +		if (p->quanta[band]) {
> > +			dev_err(dev->dev, "Quanta/weights configuration is not supported.\n");
> > +			return -EOPNOTSUPP;
> > +		}
> 
> So what does the user gain using tc-ets over tc-mqprio? That has a way
> to set up strict prioritization and prio:tc maps as well, and to my
> knowledge mqprio is vastly more popular in non-DCB setups than tc-ets.
> The only thing is that with mqprio, AFAIK, the round robin between TXQs
> belonging to the same traffic class is not weighted.

Do mqprio already supports strict prio mode? net-next was not supporting
this back for two weeks. I do not care what to use, my motivation was based on
following points:
- tc-ets supports strict prio. mqprio need to be extended to do this
- tc-ets refers to IEEE 802.1Q specification, so i feel safe
  and do not need to invent new things.
- mqprio automatically creates software queues, but it seems to not
  provide any advantage for a typical bridged DSA setup. For example
  i can use queue mapping only for traffic from CPU to external DSA port
  but can't use multi queue advantages of CPU MAC for same traffic  (do I'm
  missing something). For bridged traffic i'll need to use HW offloading any
  way.

Regards,
Oleksij
Vladimir Oltean March 7, 2023, 4:46 p.m. UTC | #3
On Mon, Mar 06, 2023 at 05:35:42PM +0100, Oleksij Rempel wrote:
> > So what does the user gain using tc-ets over tc-mqprio? That has a way
> > to set up strict prioritization and prio:tc maps as well, and to my
> > knowledge mqprio is vastly more popular in non-DCB setups than tc-ets.
> > The only thing is that with mqprio, AFAIK, the round robin between TXQs
> > belonging to the same traffic class is not weighted.
> 
> Do mqprio already supports strict prio mode? net-next was not supporting
> this back for two weeks. I do not care what to use, my motivation was based on
> following points:
> - tc-ets supports strict prio. mqprio need to be extended to do this
> - tc-ets refers to IEEE 802.1Q specification, so i feel safe
>   and do not need to invent new things.
> - mqprio automatically creates software queues, but it seems to not
>   provide any advantage for a typical bridged DSA setup. For example
>   i can use queue mapping only for traffic from CPU to external DSA port
>   but can't use multi queue advantages of CPU MAC for same traffic  (do I'm
>   missing something). For bridged traffic i'll need to use HW offloading any
>   way.

Sorry, my inbox is a mess and I forgot to respond to this.
What do you mean tc-mqprio doesn't support strict priority? Strict
priority between traffic classes is what it *does* (the "prio" in the name),
although without hardware offload, the prioritization isn't enforced anywhere.
Perhaps I'm misunderstanding what you mean?

For strict prioritization using multi-queue on the DSA master you should
be able to set up a separate Qdisc.
Oleksij Rempel March 7, 2023, 6:27 p.m. UTC | #4
On Tue, Mar 07, 2023 at 06:46:14PM +0200, Vladimir Oltean wrote:
> On Mon, Mar 06, 2023 at 05:35:42PM +0100, Oleksij Rempel wrote:
> > > So what does the user gain using tc-ets over tc-mqprio? That has a way
> > > to set up strict prioritization and prio:tc maps as well, and to my
> > > knowledge mqprio is vastly more popular in non-DCB setups than tc-ets.
> > > The only thing is that with mqprio, AFAIK, the round robin between TXQs
> > > belonging to the same traffic class is not weighted.
> > 
> > Do mqprio already supports strict prio mode? net-next was not supporting
> > this back for two weeks. I do not care what to use, my motivation was based on
> > following points:
> > - tc-ets supports strict prio. mqprio need to be extended to do this
> > - tc-ets refers to IEEE 802.1Q specification, so i feel safe
> >   and do not need to invent new things.
> > - mqprio automatically creates software queues, but it seems to not
> >   provide any advantage for a typical bridged DSA setup. For example
> >   i can use queue mapping only for traffic from CPU to external DSA port
> >   but can't use multi queue advantages of CPU MAC for same traffic  (do I'm
> >   missing something). For bridged traffic i'll need to use HW offloading any
> >   way.
> 
> Sorry, my inbox is a mess and I forgot to respond to this.

No problem :)

> What do you mean tc-mqprio doesn't support strict priority? Strict
> priority between traffic classes is what it *does* (the "prio" in the name),
> although without hardware offload, the prioritization isn't enforced anywhere.
> Perhaps I'm misunderstanding what you mean?

Huh.. you have right, I overlooked this part of documentation:
"As one specific example numerous Ethernet cards support the
802.1Q link strict priority transmission selection algorithm
(TSA). MQPRIO enabled hardware in conjunction with the
classification methods below can provide hardware offloaded
support for this TSA."

But other parts of manual confuse me. May be you can help here:
- "map - The priority to traffic class map. Maps priorities 0..15 to a
   specified traffic class"
   "Priorities" is probably SO_PRIORITY? If yes, this option can't be offloaded
   by the KSZ switch.
- "queues - Provide count and offset of queue range for each traffic class..."
  If I see it correctly, I can map a traffic class to some queue. But traffic
  class is not priority? I can create traffic class with high number and map
  it to a low number queue but actual queue priority is HW specific and there
  is no way to notify user about it.
   
KSZ HW is capable of mapping 8 traffic classes separately to any available
queue. Ok, if I replace words used in manual from "priority" to "traffic class"
and "traffic class" to "queues". But even in this case the code will be even
more confusing - i'll have to use qopt->prio_tc_map array which is SO_PRIO to
TC map, as TC to queue map.

I still have difficulties to understand how priorities of actual queues
are organized. I see how to map traffic class to a queue, but I can't find
any thing in manual about queue priority. For example, if I assign traffic
class 3 to the Queue0 this traffic will have lowest priority in my HW. Is
it some how documented or known for users?

One more question is, what is actual expected behavior of mqprio if max_rate
option is used? In my case, if max_rate is set to a queue (even to max value),
then strict priority TSA will not work:
queue0---max rate 100Mbit/s---\
                               |---100Mbit/s---
queue1---max rate 100Mbit/s---/

in this example both streams will get 49Mbit/s. My expectation of strict prio
is that queue1 should get 100Mbit/s and queue 0Mbit/s

On other hand tc-ets made perfect sense to me from documentation and code pow.
TC is mapped to bands. Bands have documented priorities and it fit's to what
KSZ is supporting. Except of WRR configuration.

> For strict prioritization using multi-queue on the DSA master you should
> be able to set up a separate Qdisc.

I'll need to do more testing with FEC later, it didn't worked at first try, but
as you can see I still have a lot of misunderstandings.

Regards,
Oleksij
Vladimir Oltean March 7, 2023, 6:57 p.m. UTC | #5
On Tue, Mar 07, 2023 at 07:27:32PM +0100, Oleksij Rempel wrote:
> > What do you mean tc-mqprio doesn't support strict priority? Strict
> > priority between traffic classes is what it *does* (the "prio" in the name),
> > although without hardware offload, the prioritization isn't enforced anywhere.
> > Perhaps I'm misunderstanding what you mean?
> 
> Huh.. you have right, I overlooked this part of documentation:
> "As one specific example numerous Ethernet cards support the
> 802.1Q link strict priority transmission selection algorithm
> (TSA). MQPRIO enabled hardware in conjunction with the
> classification methods below can provide hardware offloaded
> support for this TSA."
> 
> But other parts of manual confuse me.

Not only you...
Does this discussion help any bit?
https://patchwork.kernel.org/project/netdevbpf/patch/20230220150548.2021-1-peti.antal99@gmail.com/

> May be you can help here:
> - "map - The priority to traffic class map. Maps priorities 0..15 to a
>    specified traffic class"
>    "Priorities" is probably SO_PRIORITY?

yeah
see the netdev_pick_tx() and skb_tx_hash() implementations for TXQ
selection based on skb->priority, it will answer a lot of questions

>    If yes, this option can't be offloaded by the KSZ switch.

because? it can offload the 1:1 prio:tc mapping only; reject anything else

> - "queues - Provide count and offset of queue range for each traffic class..."
>   If I see it correctly, I can map a traffic class to some queue.

s/queue/group of TX queues/

>   But traffic class is not priority? I can create traffic class with high number
>   and map it to a low number queue but actual queue priority is HW specific and
>   there is no way to notify user about it.

no, where did you read that you should do that?
traffic class number *is* the number based on which the NIC should do
egress prioritization. Higher traffic class number => higher priority.
Within the same traffic class, there should be round robin between TXQs.
The TXQ number should have nothing to do with priority. Intel igb/igc
NICs do that, and there was a recent discussion about this fact causing
problems with taprio (which reuses the mqprio concepts).

Since commit d7045f520a74 ("net/sched: mqprio: allow reverse TC:TXQ
mappings") it's possible to describe this kind of inherent TXQ priority
like this:

num_tc 8 map 0 1 2 3 4 5 6 7 queues 1@7 1@6 1@5 1@4 1@3 1@2 1@1 1@0

>    
> KSZ HW is capable of mapping 8 traffic classes separately to any available
> queue. Ok, if I replace words used in manual from "priority" to "traffic class"
> and "traffic class" to "queues". But even in this case the code will be even
> more confusing - i'll have to use qopt->prio_tc_map array which is SO_PRIO to
> TC map, as TC to queue map.

yeah, but with the 1:1 mapping between PRIO and TC, you only have to
concern about TC:TXQ.

> I still have difficulties to understand how priorities of actual queues
> are organized. I see how to map traffic class to a queue, but I can't find

s/a queue/one or more queues/

> any thing in manual about queue priority. For example, if I assign traffic
> class 3 to the Queue0 this traffic will have lowest priority in my HW. Is
> it some how documented or known for users?

This is a missing piece for mqprio's UAPI indeed. If TXQs have inherent
egress scheduling priority attached to them, there is no mechanism
currently which would allow the user to discover that. He would just
have to "know" that it is like that. Perhaps guided by errors emitted by
the driver, plus extack messages saying that lower TXQ numbers cannot
be mapped to a higher traffic class than higher TXQ numbers.

> One more question is, what is actual expected behavior of mqprio if max_rate
> option is used? In my case, if max_rate is set to a queue (even to max value),
> then strict priority TSA will not work:
> queue0---max rate 100Mbit/s---\
>                                |---100Mbit/s---
> queue1---max rate 100Mbit/s---/
> 
> in this example both streams will get 49Mbit/s. My expectation of strict prio
> is that queue1 should get 100Mbit/s and queue 0Mbit/s

I don't understand this. Have you already implemented mqprio offloading
and this is what you observe?

max_rate is an option per traffic class. Are queue0 and queue1 mapped to
the same traffic class in your example, or are they not? Could you show
the full command you ran?

> On other hand tc-ets made perfect sense to me from documentation and code pow.
> TC is mapped to bands. Bands have documented priorities and it fit's to what
> KSZ is supporting. Except of WRR configuration.

I haven't used tc-ets, I was just curious about the differences you saw
between it and mqprio which led to you choosing it.

> > For strict prioritization using multi-queue on the DSA master you should
> > be able to set up a separate Qdisc.
> 
> I'll need to do more testing with FEC later, it didn't worked at first try, but
> as you can see I still have a lot of misunderstandings.

fec doesn't seem to implement ndo_setup_tc() at all, so I'm not sure
what you're going to try exactly. OTOH it has this weird ndo_select_queue()
implementation which (I think) implements multi-queue based on VLAN PCP.

sorry for the quick response, need to go right now
Oleksij Rempel March 7, 2023, 7:52 p.m. UTC | #6
On Tue, Mar 07, 2023 at 08:57:34PM +0200, Vladimir Oltean wrote:
> On Tue, Mar 07, 2023 at 07:27:32PM +0100, Oleksij Rempel wrote:
> > > What do you mean tc-mqprio doesn't support strict priority? Strict
> > > priority between traffic classes is what it *does* (the "prio" in the name),
> > > although without hardware offload, the prioritization isn't enforced anywhere.
> > > Perhaps I'm misunderstanding what you mean?
> > 
> > Huh.. you have right, I overlooked this part of documentation:
> > "As one specific example numerous Ethernet cards support the
> > 802.1Q link strict priority transmission selection algorithm
> > (TSA). MQPRIO enabled hardware in conjunction with the
> > classification methods below can provide hardware offloaded
> > support for this TSA."
> > 
> > But other parts of manual confuse me.
> 
> Not only you...
> Does this discussion help any bit?
> https://patchwork.kernel.org/project/netdevbpf/patch/20230220150548.2021-1-peti.antal99@gmail.com/

Ok, thx.

> > - "queues - Provide count and offset of queue range for each traffic class..."
> >   If I see it correctly, I can map a traffic class to some queue.
> 
> s/queue/group of TX queues/

grouping of TX queues to the same TC is not supported by this HW, but
yes, this description is more precise.

> >   But traffic class is not priority? I can create traffic class with high number
> >   and map it to a low number queue but actual queue priority is HW specific and
> >   there is no way to notify user about it.
> 
> no, where did you read that you should do that?
> traffic class number *is* the number based on which the NIC should do
> egress prioritization. Higher traffic class number => higher priority.

Ok, this was the missing part.

> > One more question is, what is actual expected behavior of mqprio if max_rate
> > option is used? In my case, if max_rate is set to a queue (even to max value),
> > then strict priority TSA will not work:
> > queue0---max rate 100Mbit/s---\
> >                                |---100Mbit/s---
> > queue1---max rate 100Mbit/s---/
> > 
> > in this example both streams will get 49Mbit/s. My expectation of strict prio
> > is that queue1 should get 100Mbit/s and queue 0Mbit/s
> 
> I don't understand this. Have you already implemented mqprio offloading
> and this is what you observe?

Ack.

> max_rate is an option per traffic class. Are queue0 and queue1 mapped to
> the same traffic class in your example, or are they not?

They are separate TCs. It is not possible to assign multiple TXQs to on TC on
KSZ.

> Could you show the full ommand you ran?

tc qdisc add dev lan2 parent root handle 100 mqprio num_tc 4 map 0 1 2 3
queues 1@0 1@1 1@2 1@3  hw 1 mode channel shaper bw_rlimit max_rate
70Mbit 70Mbit 70Mbit 70Mbit

lan2 is bridged with lan1 and lan3. Egress traffic on lan2 is from lan1 and lan3.
For testing I use 2 iperf3 instances with different PCP values in the VLAN tag.
Classification is done by HW (currently not configurable from user space)

> > On other hand tc-ets made perfect sense to me from documentation and code pow.
> > TC is mapped to bands. Bands have documented priorities and it fit's to what
> > KSZ is supporting. Except of WRR configuration.
> 
> I haven't used tc-ets, I was just curious about the differences you saw
> between it and mqprio which led to you choosing it.

Currently, the main difference would be flexible TC:TXQ mapping. In case
of mqprio it should be hard coded.

> > > For strict prioritization using multi-queue on the DSA master you should
> > > be able to set up a separate Qdisc.
> > 
> > I'll need to do more testing with FEC later, it didn't worked at first try, but
> > as you can see I still have a lot of misunderstandings.
> 
> fec doesn't seem to implement ndo_setup_tc() at all, so I'm not sure
> what you're going to try exactly.  OTOH it has this weird ndo_select_queue()
> implementation which (I think) implements multi-queue based on VLAN PCP.

Yes, this is exactly what I wont to investigate. How it should be
configured to work as plain interface and compare it combination with
DSA master. My tests are based on VLAN PCP, but only one queue is used.

> sorry for the quick response, need to go right now

No proble. Have fun.

Regards,
Oleksij
Vladimir Oltean March 7, 2023, 9:11 p.m. UTC | #7
On Tue, Mar 07, 2023 at 08:52:50PM +0100, Oleksij Rempel wrote:
> > > One more question is, what is actual expected behavior of mqprio if max_rate
> > > option is used? In my case, if max_rate is set to a queue (even to max value),
> > > then strict priority TSA will not work:
> > > queue0---max rate 100Mbit/s---\
> > >                                |---100Mbit/s---
> > > queue1---max rate 100Mbit/s---/
> > > 
> > > in this example both streams will get 49Mbit/s. My expectation of strict prio
> > > is that queue1 should get 100Mbit/s and queue 0Mbit/s
> > 
> > I don't understand this. Have you already implemented mqprio offloading
> > and this is what you observe?
> 
> Ack.
> 
> > max_rate is an option per traffic class. Are queue0 and queue1 mapped to
> > the same traffic class in your example, or are they not?
> 
> They are separate TCs. It is not possible to assign multiple TXQs to on TC on
> KSZ.
> 
> > Could you show the full ommand you ran?
> 
> tc qdisc add dev lan2 parent root handle 100 mqprio num_tc 4 map 0 1 2 3
> queues 1@0 1@1 1@2 1@3  hw 1 mode channel shaper bw_rlimit max_rate
> 70Mbit 70Mbit 70Mbit 70Mbit
> 
> lan2 is bridged with lan1 and lan3. Egress traffic on lan2 is from lan1 and lan3.
> For testing I use 2 iperf3 instances with different PCP values in the VLAN tag.
> Classification is done by HW (currently not configurable from user space)

Hmm, I still don't understand the question. First of all you changed the
data between messages - first you talk about max_rate 100 Mbps and then
you specify max_rate 70Mbit per traffic class. Possibly also the link
speeds are changed between the 2 examples. What is the link speed of the
egress port in the 2 examples?

The question is phrased as "what is the actual expected behavior" - that
would be easy - the traffic classes corresponding to the 2 TXQs are rate
limited to no more than 100 Mbps each. When the total sum of bandwidth
consumptions exceeds the capacity of the link is when you'll start
seeing prioritization effects.

If the question is why this doesn't happen in your case and they get
equal bandwidths instead (assuming you do create congestion), I don't know;
I have seen neither your implementation nor am I familiar with the
hardware. However, there are a few things I've noticed which might be of
help:

- the fact that you get 50-50 bandwidth allocation sounds an awful lot
  to me as if the TXQs are still operating in WRR mode and not in strict
  priority mode.

- the KSZ9477 datasheet says that rate limiting is per port, and not per
  queue, unless Switch MAC Control 5 Register bit 3 (Queue Based Egress
  Rate Limit Enable) is set.

- maybe you simply failed to convert the rates properly between the unit
  of measurement passed by iproute2 to the unit of measurement expected
  by hw. Here's a random comment from the ice driver:

		/* TC command takes input in K/N/Gbps or K/M/Gbit etc but
		 * converts the bandwidth rate limit into Bytes/s when
		 * passing it down to the driver. So convert input bandwidth
		 * from Bytes/s to Kbps
		 */

("TC command" means iproute2, the conversion is in the "get_rate64()" function)

> > sorry for the quick response, need to go right now
> 
> No proble. Have fun.

There wasn't anything funny to do, I had to rush to do some shopping
before the grocery stores closed.
Oleksij Rempel March 8, 2023, 5:48 a.m. UTC | #8
On Tue, Mar 07, 2023 at 11:11:34PM +0200, Vladimir Oltean wrote:
> On Tue, Mar 07, 2023 at 08:52:50PM +0100, Oleksij Rempel wrote:
> > > > One more question is, what is actual expected behavior of mqprio if max_rate
> > > > option is used? In my case, if max_rate is set to a queue (even to max value),
> > > > then strict priority TSA will not work:
> > > > queue0---max rate 100Mbit/s---\
> > > >                                |---100Mbit/s---
> > > > queue1---max rate 100Mbit/s---/
> > > > 
> > > > in this example both streams will get 49Mbit/s. My expectation of strict prio
> > > > is that queue1 should get 100Mbit/s and queue 0Mbit/s
> > > 
> > > I don't understand this. Have you already implemented mqprio offloading
> > > and this is what you observe?
> > 
> > Ack.
> > 
> > > max_rate is an option per traffic class. Are queue0 and queue1 mapped to
> > > the same traffic class in your example, or are they not?
> > 
> > They are separate TCs. It is not possible to assign multiple TXQs to on TC on
> > KSZ.
> > 
> > > Could you show the full ommand you ran?
> > 
> > tc qdisc add dev lan2 parent root handle 100 mqprio num_tc 4 map 0 1 2 3
> > queues 1@0 1@1 1@2 1@3  hw 1 mode channel shaper bw_rlimit max_rate
> > 70Mbit 70Mbit 70Mbit 70Mbit
> > 
> > lan2 is bridged with lan1 and lan3. Egress traffic on lan2 is from lan1 and lan3.
> > For testing I use 2 iperf3 instances with different PCP values in the VLAN tag.
> > Classification is done by HW (currently not configurable from user space)
> 
> Hmm, I still don't understand the question. First of all you changed the
> data between messages - first you talk about max_rate 100 Mbps and then
> you specify max_rate 70Mbit per traffic class. Possibly also the link
> speeds are changed between the 2 examples. What is the link speed of the
> egress port in the 2 examples?

The link is 100Mbit/s

Configuring all TCs to 100Mbit will make same effect, no prioritisation
will happen:
 tc qdisc add dev lan2 parent root handle 100 mqprio num_tc 4 map 0 1 2 3
 queues 1@0 1@1 1@2 1@3  hw 1 mode channel shaper bw_rlimit max_rate
 100Mbit 100Mbit 100Mbit 100Mbit

> The question is phrased as "what is the actual expected behavior" - that
> would be easy - the traffic classes corresponding to the 2 TXQs are rate
> limited to no more than 100 Mbps each. When the total sum of bandwidth
> consumptions exceeds the capacity of the link is when you'll start
> seeing prioritization effects.

In this case and if my code is correct, rate limit can't be used for
MQPRIO on this chip.

> If the question is why this doesn't happen in your case and they get
> equal bandwidths instead (assuming you do create congestion), I don't know;
> I have seen neither your implementation nor am I familiar with the
> hardware.

See the code at the end of mail.

>  However, there are a few things I've noticed which might be of
> help:
> 
> - the fact that you get 50-50 bandwidth allocation sounds an awful lot
>   to me as if the TXQs are still operating in WRR mode and not in strict
>   priority mode.

Even not *W*RR. Configuring weight to different queues will not make
effect as soon as rate limit is not zero.

> - the KSZ9477 datasheet says that rate limiting is per port, and not per
>   queue, unless Switch MAC Control 5 Register bit 3 (Queue Based Egress
>   Rate Limit Enable) is set.

This part was easy to test. By configuring different queues to different
rate i was able to see if traffic flows trough expected queue.
 tc qdisc add dev lan2 parent root handle 100 mqprio num_tc 4 map 0 1 2 3
 queues 1@0 1@1 1@2 1@3  hw 1 mode channel shaper bw_rlimit max_rate
 10Mbit 20Mbit 30Mbit 40Mbit

Since there are no queue counters, it is only way to confirm if my code
was actually working.

> - maybe you simply failed to convert the rates properly between the unit
>   of measurement passed by iproute2 to the unit of measurement expected
>   by hw. Here's a random comment from the ice driver:
> 
> 		/* TC command takes input in K/N/Gbps or K/M/Gbit etc but
> 		 * converts the bandwidth rate limit into Bytes/s when
> 		 * passing it down to the driver. So convert input bandwidth
> 		 * from Bytes/s to Kbps
> 		 */

I get expected rate per queue if rate limiting is working. But like I
said, it is enough to set max rate limit to a queue to make strict prio
and WRR modes do not work.

This is draft code for MQPRIO. Mainline version would look a bit more
complicated, because rate configuration is different per link speed. I
would need to update rate config on each link up event. But, since
prioritization is not working as soon as rate limit is configured, using
MQPRIO make no sense any way.

diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index e9cee0ce6b46..9a57595d3248 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -22,6 +22,7 @@
 #include <linux/of_device.h>
 #include <linux/of_net.h>
 #include <linux/micrel_phy.h>
+#include <linux/units.h>
 #include <net/dsa.h>
 #include <net/pkt_cls.h>
 #include <net/switchdev.h>
@@ -3348,6 +3349,88 @@ static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port,
 	return -EOPNOTSUPP;
 }
 
+static int ksz_reset_per_queue_rate_limit(struct ksz_device *dev, int port)
+{
+	int i, ret;
+
+	for (i = 0; i < dev->info->num_tx_queues; i++) {
+		ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + i, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int ksz_setup_queue_rates(struct ksz_device *dev, int port, int queue,
+				 u64 max_rate)
+{
+	u8 val = 0;
+
+	/* Convert to from Bps to bps */
+	max_rate *= 8;
+
+	/* TODO: this conversation works only for 10 and 100Mbit links */
+	if (max_rate >= (1 * MEGA)) {
+		val = DIV_ROUND_CLOSEST_ULL(max_rate, 1 * MEGA);
+	}
+
+	return ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue, val);
+}
+
+static int ksz_setup_tc_mqprio(struct dsa_switch *ds, int port,
+			       struct tc_mqprio_qopt_offload *mqprio)
+{
+	struct ksz_device *dev = ds->priv;
+	struct net_device *ndev;
+	int ret, queue;
+	u8 num_queues;
+	u8 num_tc;
+
+	mqprio->qopt.hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+	ndev = ksz_port_to_netdev(dev, port);
+	num_queues = dev->info->num_tx_queues;
+	num_tc = mqprio->qopt.num_tc;
+
+	if (num_tc > num_queues)
+		return -EINVAL;
+
+	if (!num_tc) {
+		netdev_reset_tc(ndev);
+		return ksz_reset_per_queue_rate_limit(dev, port);
+	}
+
+	netdev_set_num_tc(ndev, mqprio->qopt.num_tc);
+
+	for (queue = 0; queue < mqprio->qopt.num_tc; queue++) {
+		netdev_set_tc_queue(ndev, queue, mqprio->qopt.count[queue],
+				    mqprio->qopt.offset[queue]);
+
+		ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
+		if (ret)
+			return ret;
+
+		ret = ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_STRICT_PRIO,
+					MTI_SHAPING_OFF);
+		if (ret)
+			return ret;
+	}
+
+	if (mqprio->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+		return ksz_reset_per_queue_rate_limit(dev, port);
+
+	for (queue = 0; queue < mqprio->qopt.num_tc; queue++) {
+		ret = ksz_setup_queue_rates(dev, port, queue,
+					    mqprio->max_rate[queue]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+
+
 static int ksz_setup_tc(struct dsa_switch *ds, int port,
 			enum tc_setup_type type, void *type_data)
 {
@@ -3357,6 +3440,8 @@ static int ksz_setup_tc(struct dsa_switch *ds, int port,
 	case TC_SETUP_QDISC_ETS:
 		ksz_tc_setup_qdisc_ets(ds, 2, type_data);
 		return ksz_tc_setup_qdisc_ets(ds, port, type_data);
+	case TC_SETUP_QDISC_MQPRIO:
+		return ksz_setup_tc_mqprio(ds, port, type_data);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 7618a4714e06..e284538d6898 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -590,6 +590,17 @@ static inline int is_lan937x(struct ksz_device *dev)
 		dev->chip_id == LAN9374_CHIP_ID;
 }
 
+static inline struct net_device *ksz_port_to_netdev(struct ksz_device *dev,
+						    int port)
+{
+	struct dsa_switch *ds = dev->ds;
+
+	if (!dsa_is_user_port(ds, port))
+		return NULL;
+
+	return dsa_to_port(ds, port)->slave;
+}
+
 /* STP State Defines */
 #define PORT_TX_ENABLE			BIT(2)
 #define PORT_RX_ENABLE			BIT(1)
Oleksij Rempel March 8, 2023, 7:16 a.m. UTC | #9
On Mon, Mar 06, 2023 at 01:49:40PM +0100, Oleksij Rempel wrote:
> +static int ksz_queue_set_strict(struct ksz_device *dev, int port, int queue)
> +{
> +	int ret;
> +
> +	/* In order to ensure proper prioritization, it is necessary to set the
> +	 * rate limit for the related queue to zero. Otherwise strict priority
> +	 * mode will not work.
> +	 */
> +	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
> +			  KSZ9477_OUT_RATE_NO_LIMIT);

Uff, this part works by accident. KSZ9477_REG_PORT_OUT_RATE_0 registers
should be written in a direct order. According to the documentation
"update will not take effect until the Port Queue 3 Egress Limit
Control Register is written.". But we are writing in a reverse order -
queue 3 is written first.

> +	if (ret)
> +		return ret;
> +
> +	ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
> +	if (ret)
> +		return ret;
> +
> +	return ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_STRICT_PRIO,
> +				 MTI_SHAPING_OFF);
> +}
> +
> +static int ksz_queue_set_wrr(struct ksz_device *dev, int port, int queue,
> +			     int weight)
> +{
> +	int ret;
> +
> +	/* In order to ensure proper prioritization, it is necessary to set the
> +	 * rate limit for the related queue to zero. Otherwise weighted round
> +	 * robin mode will not work.
> +	 */
> +	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
> +			  KSZ9477_OUT_RATE_NO_LIMIT);

same here.
diff mbox series

Patch

diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index ae05fe0b0a81..f32ad39c1d8d 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -3172,12 +3172,190 @@  static int ksz_setup_tc_cbs(struct dsa_switch *ds, int port,
 				 MTI_SHAPING_SRP);
 }
 
+static int ksz_ets_band_to_queue(struct tc_ets_qopt_offload_replace_params *p,
+				 int band)
+{
+	/* Compared to queues, bands prioritize packets differently. In strict
+	 * priority mode, the lowest priority is assigned to Queue 0 while the
+	 * highest priority is given to Band 0.
+	 */
+	return p->bands - 1 - band;
+}
+
+static int ksz_queue_set_strict(struct ksz_device *dev, int port, int queue)
+{
+	int ret;
+
+	/* In order to ensure proper prioritization, it is necessary to set the
+	 * rate limit for the related queue to zero. Otherwise strict priority
+	 * mode will not work.
+	 */
+	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
+			  KSZ9477_OUT_RATE_NO_LIMIT);
+	if (ret)
+		return ret;
+
+	ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
+	if (ret)
+		return ret;
+
+	return ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_STRICT_PRIO,
+				 MTI_SHAPING_OFF);
+}
+
+static int ksz_queue_set_wrr(struct ksz_device *dev, int port, int queue,
+			     int weight)
+{
+	int ret;
+
+	/* In order to ensure proper prioritization, it is necessary to set the
+	 * rate limit for the related queue to zero. Otherwise weighted round
+	 * robin mode will not work.
+	 */
+	ret = ksz_pwrite8(dev, port, KSZ9477_REG_PORT_OUT_RATE_0 + queue,
+			  KSZ9477_OUT_RATE_NO_LIMIT);
+	if (ret)
+		return ret;
+
+	ret = ksz_pwrite32(dev, port, REG_PORT_MTI_QUEUE_INDEX__4, queue);
+	if (ret)
+		return ret;
+
+	ret = ksz_setup_tc_mode(dev, port, MTI_SCHEDULE_WRR,
+				MTI_SHAPING_OFF);
+	if (ret)
+		return ret;
+
+	return ksz_pwrite8(dev, port, KSZ9477_PORT_MTI_QUEUE_CTRL_1, weight);
+}
+
+static int ksz_tc_ets_add(struct ksz_device *dev, int port,
+			  struct tc_ets_qopt_offload_replace_params *p)
+{
+	int ret, band, tc_prio;
+	u32 queue_map = 0;
+
+	/* Configure queue scheduling mode for all bands. Currently only strict
+	 * prio mode is supported.
+	 */
+	for (band = 0; band < p->bands; band++) {
+		int queue = ksz_ets_band_to_queue(p, band);
+
+		ret = ksz_queue_set_strict(dev, port, queue);
+		if (ret)
+			return ret;
+	}
+
+	/* Configure the mapping between traffic classes and queues. Note:
+	 * priomap variable support 16 traffic classes, but the chip can handle
+	 * only 8 classes.
+	 */
+	for (tc_prio = 0; tc_prio < ARRAY_SIZE(p->priomap); tc_prio++) {
+		int queue;
+
+		if (tc_prio > KSZ9477_MAX_TC_PRIO)
+			break;
+
+		queue = ksz_ets_band_to_queue(p, p->priomap[tc_prio]);
+		queue_map |= queue << (tc_prio * KSZ9477_PORT_TC_MAP_S);
+	}
+
+	return ksz_pwrite32(dev, port, KSZ9477_PORT_MRI_TC_MAP__4, queue_map);
+}
+
+static int ksz_tc_ets_del(struct ksz_device *dev, int port)
+{
+	int ret, queue;
+
+	/* To restore the default chip configuration, set all queues to use the
+	 * WRR scheduler with a weight of 1.
+	 */
+	for (queue = 0; queue < dev->info->num_tx_queues; queue++) {
+		ret = ksz_queue_set_wrr(dev, port, queue,
+					KSZ9477_DEFAULT_WRR_WEIGHT);
+		if (ret)
+			return ret;
+	}
+
+	/* Revert the queue mapping for TC-priority to its default setting on
+	 * the chip.
+	 */
+	return ksz_pwrite32(dev, port, KSZ9477_PORT_MRI_TC_MAP__4,
+			    KSZ9477_DEFAULT_TC_MAP);
+}
+
+static int ksz_tc_ets_validate(struct ksz_device *dev, int port,
+			       struct tc_ets_qopt_offload_replace_params *p)
+{
+	int band;
+
+	/* Since it is not feasible to share one port among multiple qdisc,
+	 * the user must configure all available queues appropriately.
+	 */
+	if (p->bands != dev->info->num_tx_queues) {
+		dev_err(dev->dev, "Not supported amount of bands. It should be %d\n",
+			dev->info->num_tx_queues);
+		return -EOPNOTSUPP;
+	}
+
+	for (band = 0; band < p->bands; ++band) {
+		/* The KSZ switches utilize a weighted round robin configuration
+		 * where a certain number of packets can be transmitted from a
+		 * queue before the next queue is serviced. For more information
+		 * on this, refer to section 5.2.8.4 of the KSZ8565R
+		 * documentation on the Port Transmit Queue Control 1 Register.
+		 * However, the current ETS Qdisc implementation (as of February
+		 * 2023) assigns a weight to each queue based on the number of
+		 * bytes or extrapolated bandwidth in percentages. Since this
+		 * differs from the KSZ switches' method and we don't want to
+		 * fake support by converting bytes to packets, we have decided
+		 * to return an error instead.
+		 */
+		if (p->quanta[band]) {
+			dev_err(dev->dev, "Quanta/weights configuration is not supported.\n");
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
+}
+
+static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port,
+				  struct tc_ets_qopt_offload *qopt)
+{
+	struct ksz_device *dev = ds->priv;
+	int ret;
+
+	if (qopt->parent != TC_H_ROOT) {
+		dev_err(dev->dev, "Parent should be \"root\"\n");
+		return -EOPNOTSUPP;
+	}
+
+	switch (qopt->command) {
+	case TC_ETS_REPLACE:
+		ret = ksz_tc_ets_validate(dev, port, &qopt->replace_params);
+		if (ret)
+			return ret;
+
+		return ksz_tc_ets_add(dev, port, &qopt->replace_params);
+	case TC_ETS_DESTROY:
+		return ksz_tc_ets_del(dev, port);
+	case TC_ETS_STATS:
+	case TC_ETS_GRAFT:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static int ksz_setup_tc(struct dsa_switch *ds, int port,
 			enum tc_setup_type type, void *type_data)
 {
 	switch (type) {
 	case TC_SETUP_QDISC_CBS:
 		return ksz_setup_tc_cbs(ds, port, type_data);
+	case TC_SETUP_QDISC_ETS:
+		return ksz_tc_setup_qdisc_ets(ds, port, type_data);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index f53834bbe896..7618a4714e06 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -657,6 +657,15 @@  static inline int is_lan937x(struct ksz_device *dev)
 #define KSZ8_LEGAL_PACKET_SIZE		1518
 #define KSZ9477_MAX_FRAME_SIZE		9000
 
+#define KSZ9477_REG_PORT_OUT_RATE_0	0x0420
+#define KSZ9477_OUT_RATE_NO_LIMIT	0
+
+#define KSZ9477_PORT_MRI_TC_MAP__4	0x0808
+#define KSZ9477_DEFAULT_TC_MAP		0x33221100
+
+#define KSZ9477_PORT_TC_MAP_S		4
+#define KSZ9477_MAX_TC_PRIO		7
+
 /* CBS related registers */
 #define REG_PORT_MTI_QUEUE_INDEX__4	0x0900
 
@@ -670,6 +679,9 @@  static inline int is_lan937x(struct ksz_device *dev)
 #define MTI_SHAPING_SRP			1
 #define MTI_SHAPING_TIME_AWARE		2
 
+#define KSZ9477_PORT_MTI_QUEUE_CTRL_1	0x0915
+#define KSZ9477_DEFAULT_WRR_WEIGHT	1
+
 #define REG_PORT_MTI_HI_WATER_MARK	0x0916
 #define REG_PORT_MTI_LO_WATER_MARK	0x0918