diff mbox series

[net-next,v5,5/6] octeontx2-pf: Add support for HTB offload

Message ID 20230326181245.29149-6-hkelam@marvell.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series octeontx2-pf: HTB offload support | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/cc_maintainers warning 5 maintainers not CCed: daniel@iogearbox.net john.fastabend@gmail.com bpf@vger.kernel.org ast@kernel.org hawk@kernel.org
netdev/build_clang success Errors and warnings before: 21 this patch: 21
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 40 this patch: 40
netdev/checkpatch warning WARNING: 'managment' may be misspelled - perhaps 'management'? WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Hariprasad Kelam March 26, 2023, 6:12 p.m. UTC
From: Naveen Mamindlapalli <naveenm@marvell.com>

This patch registers callbacks to support HTB offload.

Below are features supported,

- supports traffic shaping on the given class by honoring rate and ceil
configuration.

- supports traffic scheduling,  which prioritizes different types of
traffic based on strict priority values.

- supports the creation of leaf to inner classes such that parent node
rate limits apply to all child nodes.

Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
---
 .../ethernet/marvell/octeontx2/af/common.h    |    2 +-
 .../ethernet/marvell/octeontx2/nic/Makefile   |    2 +-
 .../marvell/octeontx2/nic/otx2_common.c       |   35 +-
 .../marvell/octeontx2/nic/otx2_common.h       |    8 +-
 .../marvell/octeontx2/nic/otx2_ethtool.c      |   31 +-
 .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   56 +-
 .../ethernet/marvell/octeontx2/nic/otx2_reg.h |   13 +
 .../ethernet/marvell/octeontx2/nic/otx2_tc.c  |    7 +-
 .../net/ethernet/marvell/octeontx2/nic/qos.c  | 1460 +++++++++++++++++
 .../net/ethernet/marvell/octeontx2/nic/qos.h  |   58 +-
 .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   20 +-
 11 files changed, 1657 insertions(+), 35 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos.c

Comments

Paolo Abeni March 28, 2023, 10:56 a.m. UTC | #1
On Sun, 2023-03-26 at 23:42 +0530, Hariprasad Kelam wrote:
[...]
> +static int otx2_qos_root_add(struct otx2_nic *pfvf, u16 htb_maj_id, u16 htb_defcls,
> +			     struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_cfg *new_cfg;
> +	struct otx2_qos_node *root;
> +	int err;
> +
> +	netdev_dbg(pfvf->netdev,
> +		   "TC_HTB_CREATE: handle=0x%x defcls=0x%x\n",
> +		   htb_maj_id, htb_defcls);
> +
> +	INIT_LIST_HEAD(&pfvf->qos.qos_tree);
> +	mutex_init(&pfvf->qos.qos_lock);

It's quite strange and error prone dynamically init this mutex and the
list here. Why don't you do such init ad device creation time?

> +
> +	root = otx2_qos_alloc_root(pfvf);
> +	if (IS_ERR(root)) {
> +		mutex_destroy(&pfvf->qos.qos_lock);
> +		err = PTR_ERR(root);
> +		return err;
> +	}
> +
> +	/* allocate txschq queue */
> +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> +	if (!new_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");

Here the root node is leaked.

> +		mutex_destroy(&pfvf->qos.qos_lock);
> +		return -ENOMEM;
> +	}


[...]

Cheers,

Paolo
Simon Horman March 28, 2023, 2:41 p.m. UTC | #2
On Sun, Mar 26, 2023 at 11:42:44PM +0530, Hariprasad Kelam wrote:
> From: Naveen Mamindlapalli <naveenm@marvell.com>
> 
> This patch registers callbacks to support HTB offload.
> 
> Below are features supported,
> 
> - supports traffic shaping on the given class by honoring rate and ceil
> configuration.
> 
> - supports traffic scheduling,  which prioritizes different types of
> traffic based on strict priority values.
> 
> - supports the creation of leaf to inner classes such that parent node
> rate limits apply to all child nodes.
> 
> Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
> Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
> Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>

> ---
>  .../ethernet/marvell/octeontx2/af/common.h    |    2 +-
>  .../ethernet/marvell/octeontx2/nic/Makefile   |    2 +-
>  .../marvell/octeontx2/nic/otx2_common.c       |   35 +-
>  .../marvell/octeontx2/nic/otx2_common.h       |    8 +-
>  .../marvell/octeontx2/nic/otx2_ethtool.c      |   31 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   56 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_reg.h |   13 +
>  .../ethernet/marvell/octeontx2/nic/otx2_tc.c  |    7 +-
>  .../net/ethernet/marvell/octeontx2/nic/qos.c  | 1460 +++++++++++++++++
>  .../net/ethernet/marvell/octeontx2/nic/qos.h  |   58 +-
>  .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   20 +-
>  11 files changed, 1657 insertions(+), 35 deletions(-)

nit: this is a rather long patch.

...

> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c

...

> @@ -159,7 +165,7 @@ static void otx2_get_qset_stats(struct otx2_nic *pfvf,
>  				[otx2_queue_stats[stat].index];
>  	}
>  
> -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> +	for (qidx = 0; qidx <  otx2_get_total_tx_queues(pfvf); qidx++) {

nit: extra whitespace after '<'

>  		if (!otx2_update_sq_stats(pfvf, qidx)) {
>  			for (stat = 0; stat < otx2_n_queue_stats; stat++)
>  				*((*data)++) = 0;

...

> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c

...

> +static int otx2_qos_update_tx_netdev_queues(struct otx2_nic *pfvf)
> +{
> +	int tx_queues, qos_txqs, err;
> +	struct otx2_hw *hw = &pfvf->hw;

nit: reverse xmas tree - longest line to shortest -
     for local variable declarations.

> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h

...

> +struct otx2_qos_node {
> +	struct list_head list; /* list managment */

nit: s/managment/management/

> +	struct list_head child_list;
> +	struct list_head child_schq_list;
> +	struct hlist_node hlist;
> +	DECLARE_BITMAP(prio_bmap, OTX2_QOS_MAX_PRIO + 1);
> +	struct otx2_qos_node *parent;	/* parent qos node */
> +	u64 rate; /* htb params */
> +	u64 ceil;
> +	u32 classid;
> +	u32 prio;
> +	u16 schq; /* hw txschq */
> +	u16 qid;
> +	u16 prio_anchor;
> +	u8 level;
> +};
> +
>  
>  #endif

...
Maxim Mikityanskiy March 28, 2023, 6:42 p.m. UTC | #3
I have a few comments about concurrency issues, see below. I didn't
analyze the concurrency model of your driver deeply, so please apologize
me if I missed some bugs or accidentally called some good code buggy.

A few general things to pay attention to, regarding HTB offload:

1. ndo_select_queue can be called at any time, there is no reliable way
to prevent the kernel from calling it, that means that ndo_select_queue
must not crash if HTB configuration and structures are being updated in
another thread.

2. ndo_start_xmit runs in an RCU read lock. If you need to release some
structure that can be used from another thread in the TX datapath, you
can set some atomic flag, synchronize with RCU, then release the object.

3. You can take some inspiration from mlx5e, although you may find it's
a convoluted cooperation of spinlocks, mutexes, atomic operations with
barriers, and RCU. A big part of it is related to the mechanism of safe
reopening of queues, which your driver may not need, but the remaining
parts have a lot of similarities, so you can find useful insights about
the locking for HTB in mlx5e.

On Sun, Mar 26, 2023 at 11:42:44PM +0530, Hariprasad Kelam wrote:
> From: Naveen Mamindlapalli <naveenm@marvell.com>
> 
> This patch registers callbacks to support HTB offload.
> 
> Below are features supported,
> 
> - supports traffic shaping on the given class by honoring rate and ceil
> configuration.
> 
> - supports traffic scheduling,  which prioritizes different types of
> traffic based on strict priority values.
> 
> - supports the creation of leaf to inner classes such that parent node
> rate limits apply to all child nodes.
> 
> Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
> Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
> Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
> ---
>  .../ethernet/marvell/octeontx2/af/common.h    |    2 +-
>  .../ethernet/marvell/octeontx2/nic/Makefile   |    2 +-
>  .../marvell/octeontx2/nic/otx2_common.c       |   35 +-
>  .../marvell/octeontx2/nic/otx2_common.h       |    8 +-
>  .../marvell/octeontx2/nic/otx2_ethtool.c      |   31 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   56 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_reg.h |   13 +
>  .../ethernet/marvell/octeontx2/nic/otx2_tc.c  |    7 +-
>  .../net/ethernet/marvell/octeontx2/nic/qos.c  | 1460 +++++++++++++++++
>  .../net/ethernet/marvell/octeontx2/nic/qos.h  |   58 +-
>  .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   20 +-
>  11 files changed, 1657 insertions(+), 35 deletions(-)
>  create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> 
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h
> index 8931864ee110..f5bf719a6ccf 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
> @@ -142,7 +142,7 @@ enum nix_scheduler {
>  
>  #define TXSCH_RR_QTM_MAX		((1 << 24) - 1)
>  #define TXSCH_TL1_DFLT_RR_QTM		TXSCH_RR_QTM_MAX
> -#define TXSCH_TL1_DFLT_RR_PRIO		(0x1ull)
> +#define TXSCH_TL1_DFLT_RR_PRIO		(0x7ull)
>  #define CN10K_MAX_DWRR_WEIGHT          16384 /* Weight is 14bit on CN10K */
>  
>  /* Min/Max packet sizes, excluding FCS */
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> index 3d31ddf7c652..5664f768cb0c 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> @@ -8,7 +8,7 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o
>  
>  rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
>                 otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
> -               otx2_devlink.o qos_sq.o
> +               otx2_devlink.o qos_sq.o qos.o
>  rvu_nicvf-y := otx2_vf.o otx2_devlink.o
>  
>  rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> index 32c02a2d3554..b4542a801291 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> @@ -89,6 +89,11 @@ int otx2_update_sq_stats(struct otx2_nic *pfvf, int qidx)
>  	if (!pfvf->qset.sq)
>  		return 0;
>  
> +	if (qidx >= pfvf->hw.non_qos_queues) {
> +		if (!test_bit(qidx - pfvf->hw.non_qos_queues, pfvf->qos.qos_sq_bmap))
> +			return 0;
> +	}
> +
>  	otx2_nix_sq_op_stats(&sq->stats, pfvf, qidx);
>  	return 1;
>  }
> @@ -747,29 +752,47 @@ int otx2_txsch_alloc(struct otx2_nic *pfvf)
>  	return 0;
>  }
>  
> -int otx2_txschq_stop(struct otx2_nic *pfvf)
> +void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq)
>  {
>  	struct nix_txsch_free_req *free_req;
> -	int lvl, schq, err;
> +	int err;
>  
>  	mutex_lock(&pfvf->mbox.lock);
> -	/* Free the transmit schedulers */
> +
>  	free_req = otx2_mbox_alloc_msg_nix_txsch_free(&pfvf->mbox);
>  	if (!free_req) {
>  		mutex_unlock(&pfvf->mbox.lock);
> -		return -ENOMEM;
> +		netdev_err(pfvf->netdev,
> +			   "Failed alloc txschq free req\n");
> +		return;
>  	}
>  
> -	free_req->flags = TXSCHQ_FREE_ALL;
> +	free_req->schq_lvl = lvl;
> +	free_req->schq = schq;
> +
>  	err = otx2_sync_mbox_msg(&pfvf->mbox);
> +	if (err) {
> +		netdev_err(pfvf->netdev,
> +			   "Failed stop txschq %d at level %d\n", lvl, schq);
> +	}
> +
>  	mutex_unlock(&pfvf->mbox.lock);
> +}
> +
> +void otx2_txschq_stop(struct otx2_nic *pfvf)
> +{
> +	int lvl, schq;
> +
> +	/* free non QOS TLx nodes */
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++)
> +		otx2_txschq_free_one(pfvf, lvl,
> +				     pfvf->hw.txschq_list[lvl][0]);
>  
>  	/* Clear the txschq list */
>  	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
>  		for (schq = 0; schq < MAX_TXSCHQ_PER_FUNC; schq++)
>  			pfvf->hw.txschq_list[lvl][schq] = 0;
>  	}
> -	return err;
>  }
>  
>  void otx2_sqb_flush(struct otx2_nic *pfvf)
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> index 3834cc447426..4b219e8e5b32 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> @@ -252,6 +252,7 @@ struct otx2_hw {
>  #define CN10K_RPM		3
>  #define CN10K_PTP_ONESTEP	4
>  #define CN10K_HW_MACSEC		5
> +#define QOS_CIR_PIR_SUPPORT	6
>  	unsigned long		cap_flag;
>  
>  #define LMT_LINE_SIZE		128
> @@ -586,6 +587,7 @@ static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf)
>  		__set_bit(CN10K_LMTST, &hw->cap_flag);
>  		__set_bit(CN10K_RPM, &hw->cap_flag);
>  		__set_bit(CN10K_PTP_ONESTEP, &hw->cap_flag);
> +		__set_bit(QOS_CIR_PIR_SUPPORT, &hw->cap_flag);
>  	}
>  
>  	if (is_dev_cn10kb(pfvf->pdev))
> @@ -935,7 +937,7 @@ int otx2_config_nix(struct otx2_nic *pfvf);
>  int otx2_config_nix_queues(struct otx2_nic *pfvf);
>  int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en);
>  int otx2_txsch_alloc(struct otx2_nic *pfvf);
> -int otx2_txschq_stop(struct otx2_nic *pfvf);
> +void otx2_txschq_stop(struct otx2_nic *pfvf);
>  void otx2_sqb_flush(struct otx2_nic *pfvf);
>  int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
>  		    dma_addr_t *dma);
> @@ -953,6 +955,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
>  		   int stack_pages, int numptrs, int buf_size);
>  int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
>  		   int pool_id, int numptrs);
> +void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
>  
>  /* RSS configuration APIs*/
>  int otx2_rss_init(struct otx2_nic *pfvf);
> @@ -1064,4 +1067,7 @@ static inline void cn10k_handle_mcs_event(struct otx2_nic *pfvf,
>  void otx2_qos_sq_setup(struct otx2_nic *pfvf, int qos_txqs);
>  u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
>  		      struct net_device *sb_dev);
> +int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid);
> +void otx2_qos_config_txschq(struct otx2_nic *pfvf);
> +void otx2_clean_qos_queues(struct otx2_nic *pfvf);
>  #endif /* OTX2_COMMON_H */
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> index 0f8d1a69139f..e8722a4f4cc6 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> @@ -92,10 +92,16 @@ static void otx2_get_qset_strings(struct otx2_nic *pfvf, u8 **data, int qset)
>  			*data += ETH_GSTRING_LEN;
>  		}
>  	}
> -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> +
> +	for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
>  		for (stats = 0; stats < otx2_n_queue_stats; stats++) {
> -			sprintf(*data, "txq%d: %s", qidx + start_qidx,
> -				otx2_queue_stats[stats].name);
> +			if (qidx >= pfvf->hw.non_qos_queues)
> +				sprintf(*data, "txq_qos%d: %s",
> +					qidx + start_qidx - pfvf->hw.non_qos_queues,
> +					otx2_queue_stats[stats].name);
> +			else
> +				sprintf(*data, "txq%d: %s", qidx + start_qidx,
> +					otx2_queue_stats[stats].name);
>  			*data += ETH_GSTRING_LEN;
>  		}
>  	}
> @@ -159,7 +165,7 @@ static void otx2_get_qset_stats(struct otx2_nic *pfvf,
>  				[otx2_queue_stats[stat].index];
>  	}
>  
> -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> +	for (qidx = 0; qidx <  otx2_get_total_tx_queues(pfvf); qidx++) {
>  		if (!otx2_update_sq_stats(pfvf, qidx)) {
>  			for (stat = 0; stat < otx2_n_queue_stats; stat++)
>  				*((*data)++) = 0;
> @@ -254,7 +260,8 @@ static int otx2_get_sset_count(struct net_device *netdev, int sset)
>  		return -EINVAL;
>  
>  	qstats_count = otx2_n_queue_stats *
> -		       (pfvf->hw.rx_queues + pfvf->hw.tx_queues);
> +		       (pfvf->hw.rx_queues + pfvf->hw.non_qos_queues +
> +			pfvf->hw.tc_tx_queues);
>  	if (!test_bit(CN10K_RPM, &pfvf->hw.cap_flag))
>  		mac_stats = CGX_RX_STATS_COUNT + CGX_TX_STATS_COUNT;
>  	otx2_update_lmac_fec_stats(pfvf);
> @@ -282,7 +289,7 @@ static int otx2_set_channels(struct net_device *dev,
>  {
>  	struct otx2_nic *pfvf = netdev_priv(dev);
>  	bool if_up = netif_running(dev);
> -	int err = 0;
> +	int err, qos_txqs;
>  
>  	if (!channel->rx_count || !channel->tx_count)
>  		return -EINVAL;
> @@ -296,14 +303,19 @@ static int otx2_set_channels(struct net_device *dev,
>  	if (if_up)
>  		dev->netdev_ops->ndo_stop(dev);
>  
> -	err = otx2_set_real_num_queues(dev, channel->tx_count,
> +	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
> +				 OTX2_QOS_MAX_LEAF_NODES);
> +
> +	err = otx2_set_real_num_queues(dev, channel->tx_count + qos_txqs,
>  				       channel->rx_count);
>  	if (err)
>  		return err;
>  
>  	pfvf->hw.rx_queues = channel->rx_count;
>  	pfvf->hw.tx_queues = channel->tx_count;
> -	pfvf->qset.cq_cnt = pfvf->hw.tx_queues +  pfvf->hw.rx_queues;
> +	if (pfvf->xdp_prog)
> +		pfvf->hw.xdp_queues = channel->rx_count;
> +	pfvf->hw.non_qos_queues =  pfvf->hw.tx_queues + pfvf->hw.xdp_queues;
>  
>  	if (if_up)
>  		err = dev->netdev_ops->ndo_open(dev);
> @@ -1405,7 +1417,8 @@ static int otx2vf_get_sset_count(struct net_device *netdev, int sset)
>  		return -EINVAL;
>  
>  	qstats_count = otx2_n_queue_stats *
> -		       (vf->hw.rx_queues + vf->hw.tx_queues);
> +		       (vf->hw.rx_queues + vf->hw.tx_queues +
> +			vf->hw.tc_tx_queues);
>  
>  	return otx2_n_dev_stats + otx2_n_drv_stats + qstats_count + 1;
>  }
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> index a32f0cb89fc4..d0192f9089ee 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> @@ -1387,6 +1387,9 @@ static void otx2_free_sq_res(struct otx2_nic *pf)
>  	otx2_sq_free_sqbs(pf);
>  	for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) {
>  		sq = &qset->sq[qidx];
> +		/* Skip freeing Qos queues if they are not initialized */
> +		if (!sq->sqb_count)
> +			continue;
>  		qmem_free(pf->dev, sq->sqe);
>  		qmem_free(pf->dev, sq->tso_hdrs);
>  		kfree(sq->sg);
> @@ -1518,8 +1521,7 @@ static int otx2_init_hw_resources(struct otx2_nic *pf)
>  	otx2_free_cq_res(pf);
>  	otx2_ctx_disable(mbox, NIX_AQ_CTYPE_RQ, false);
>  err_free_txsch:
> -	if (otx2_txschq_stop(pf))
> -		dev_err(pf->dev, "%s failed to stop TX schedulers\n", __func__);
> +	otx2_txschq_stop(pf);
>  err_free_sq_ptrs:
>  	otx2_sq_free_sqbs(pf);
>  err_free_rq_ptrs:
> @@ -1554,21 +1556,21 @@ static void otx2_free_hw_resources(struct otx2_nic *pf)
>  	struct mbox *mbox = &pf->mbox;
>  	struct otx2_cq_queue *cq;
>  	struct msg_req *req;
> -	int qidx, err;
> +	int qidx;
>  
>  	/* Ensure all SQE are processed */
>  	otx2_sqb_flush(pf);
>  
>  	/* Stop transmission */
> -	err = otx2_txschq_stop(pf);
> -	if (err)
> -		dev_err(pf->dev, "RVUPF: Failed to stop/free TX schedulers\n");
> +	otx2_txschq_stop(pf);
>  
>  #ifdef CONFIG_DCB
>  	if (pf->pfc_en)
>  		otx2_pfc_txschq_stop(pf);
>  #endif
>  
> +	otx2_clean_qos_queues(pf);
> +
>  	mutex_lock(&mbox->lock);
>  	/* Disable backpressure */
>  	if (!(pf->pcifunc & RVU_PFVF_FUNC_MASK))
> @@ -1836,6 +1838,9 @@ int otx2_open(struct net_device *netdev)
>  	/* 'intf_down' may be checked on any cpu */
>  	smp_wmb();
>  
> +	/* Enable QoS configuration before starting tx queues */
> +	otx2_qos_config_txschq(pf);
> +
>  	/* we have already received link status notification */
>  	if (pf->linfo.link_up && !(pf->pcifunc & RVU_PFVF_FUNC_MASK))
>  		otx2_handle_link_event(pf);
> @@ -1980,14 +1985,45 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
>  	return NETDEV_TX_OK;
>  }
>  
> +static int otx2_qos_select_htb_queue(struct otx2_nic *pf, struct sk_buff *skb,
> +				     u16 htb_maj_id)
> +{
> +	u16 classid;
> +
> +	if ((TC_H_MAJ(skb->priority) >> 16) == htb_maj_id)
> +		classid = TC_H_MIN(skb->priority);
> +	else
> +		classid = READ_ONCE(pf->qos.defcls);
> +
> +	if (!classid)
> +		return 0;
> +
> +	return otx2_get_txq_by_classid(pf, classid);

This selects queues with numbers >= pf->hw.tx_queues, and otx2_xmit
indexes pfvf->qset.sq with these qids, however, pfvf->qset.sq is
allocated only up to pf->hw.non_qos_queues. Array out-of-bounds?

> +}
> +
>  u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
>  		      struct net_device *sb_dev)
>  {
> -#ifdef CONFIG_DCB
>  	struct otx2_nic *pf = netdev_priv(netdev);
> +	bool qos_enabled;
> +#ifdef CONFIG_DCB
>  	u8 vlan_prio;
>  #endif
> +	int txq;
>  
> +	qos_enabled = (netdev->real_num_tx_queues > pf->hw.tx_queues) ? true : false;
> +	if (unlikely(qos_enabled)) {
> +		u16 htb_maj_id = smp_load_acquire(&pf->qos.maj_id); /* barrier */

Checkpatch requires to add comments for the barriers for a reason :)

"Barrier" is a useless comment, we all know that smp_load_acquire is a
barrier, you should explain why this barrier is needed and which other
barriers it pairs with.

> +
> +		if (unlikely(htb_maj_id)) {
> +			txq = otx2_qos_select_htb_queue(pf, skb, htb_maj_id);
> +			if (txq > 0)
> +				return txq;
> +			goto process_pfc;
> +		}
> +	}
> +
> +process_pfc:
>  #ifdef CONFIG_DCB
>  	if (!skb_vlan_tag_present(skb))
>  		goto pick_tx;
> @@ -2001,7 +2037,11 @@ u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
>  
>  pick_tx:
>  #endif
> -	return netdev_pick_tx(netdev, skb, NULL);
> +	txq = netdev_pick_tx(netdev, skb, NULL);
> +	if (unlikely(qos_enabled))
> +		return txq % pf->hw.tx_queues;
> +
> +	return txq;
>  }
>  EXPORT_SYMBOL(otx2_select_queue);
>  
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> index 1b967eaf948b..45a32e4b49d1 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> @@ -145,12 +145,25 @@
>  #define NIX_AF_TL1X_TOPOLOGY(a)		(0xC80 | (a) << 16)
>  #define NIX_AF_TL2X_PARENT(a)		(0xE88 | (a) << 16)
>  #define NIX_AF_TL2X_SCHEDULE(a)		(0xE00 | (a) << 16)
> +#define NIX_AF_TL2X_TOPOLOGY(a)		(0xE80 | (a) << 16)
> +#define NIX_AF_TL2X_CIR(a)              (0xE20 | (a) << 16)
> +#define NIX_AF_TL2X_PIR(a)              (0xE30 | (a) << 16)
>  #define NIX_AF_TL3X_PARENT(a)		(0x1088 | (a) << 16)
>  #define NIX_AF_TL3X_SCHEDULE(a)		(0x1000 | (a) << 16)
> +#define NIX_AF_TL3X_SHAPE(a)		(0x1010 | (a) << 16)
> +#define NIX_AF_TL3X_CIR(a)		(0x1020 | (a) << 16)
> +#define NIX_AF_TL3X_PIR(a)		(0x1030 | (a) << 16)
> +#define NIX_AF_TL3X_TOPOLOGY(a)		(0x1080 | (a) << 16)
>  #define NIX_AF_TL4X_PARENT(a)		(0x1288 | (a) << 16)
>  #define NIX_AF_TL4X_SCHEDULE(a)		(0x1200 | (a) << 16)
> +#define NIX_AF_TL4X_SHAPE(a)		(0x1210 | (a) << 16)
> +#define NIX_AF_TL4X_CIR(a)		(0x1220 | (a) << 16)
>  #define NIX_AF_TL4X_PIR(a)		(0x1230 | (a) << 16)
> +#define NIX_AF_TL4X_TOPOLOGY(a)		(0x1280 | (a) << 16)
>  #define NIX_AF_MDQX_SCHEDULE(a)		(0x1400 | (a) << 16)
> +#define NIX_AF_MDQX_SHAPE(a)		(0x1410 | (a) << 16)
> +#define NIX_AF_MDQX_CIR(a)		(0x1420 | (a) << 16)
> +#define NIX_AF_MDQX_PIR(a)		(0x1430 | (a) << 16)
>  #define NIX_AF_MDQX_PARENT(a)		(0x1480 | (a) << 16)
>  #define NIX_AF_TL3_TL2X_LINKX_CFG(a, b)	(0x1700 | (a) << 16 | (b) << 3)
>  
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> index 044cc211424e..42c49249f4e7 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> @@ -19,6 +19,7 @@
>  
>  #include "cn10k.h"
>  #include "otx2_common.h"
> +#include "qos.h"
>  
>  /* Egress rate limiting definitions */
>  #define MAX_BURST_EXPONENT		0x0FULL
> @@ -147,8 +148,8 @@ static void otx2_get_egress_rate_cfg(u64 maxrate, u32 *exp,
>  	}
>  }
>  
> -static u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
> -				       u64 maxrate, u32 burst)
> +u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
> +				u64 maxrate, u32 burst)
>  {
>  	u32 burst_exp, burst_mantissa;
>  	u32 exp, mantissa, div_exp;
> @@ -1127,6 +1128,8 @@ int otx2_setup_tc(struct net_device *netdev, enum tc_setup_type type,
>  	switch (type) {
>  	case TC_SETUP_BLOCK:
>  		return otx2_setup_tc_block(netdev, type_data);
> +	case TC_SETUP_QDISC_HTB:
> +		return otx2_setup_tc_htb(netdev, type_data);
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> new file mode 100644
> index 000000000000..22c5b6a2871a
> --- /dev/null
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> @@ -0,0 +1,1460 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Marvell RVU Ethernet driver
> + *
> + * Copyright (C) 2023 Marvell.
> + *
> + */
> +#include <linux/netdevice.h>
> +#include <linux/etherdevice.h>
> +#include <linux/inetdevice.h>
> +#include <linux/bitfield.h>
> +
> +#include "otx2_common.h"
> +#include "cn10k.h"
> +#include "qos.h"
> +
> +#define OTX2_QOS_QID_INNER		0xFFFFU
> +#define OTX2_QOS_QID_NONE		0xFFFEU
> +#define OTX2_QOS_ROOT_CLASSID		0xFFFFFFFF
> +#define OTX2_QOS_CLASS_NONE		0
> +#define OTX2_QOS_DEFAULT_PRIO		0xF
> +#define OTX2_QOS_INVALID_SQ		0xFFFF
> +
> +/* Egress rate limiting definitions */
> +#define MAX_BURST_EXPONENT		0x0FULL
> +#define MAX_BURST_MANTISSA		0xFFULL
> +#define MAX_BURST_SIZE			130816ULL
> +#define MAX_RATE_DIVIDER_EXPONENT	12ULL
> +#define MAX_RATE_EXPONENT		0x0FULL
> +#define MAX_RATE_MANTISSA		0xFFULL
> +
> +/* Bitfields in NIX_TLX_PIR register */
> +#define TLX_RATE_MANTISSA		GENMASK_ULL(8, 1)
> +#define TLX_RATE_EXPONENT		GENMASK_ULL(12, 9)
> +#define TLX_RATE_DIVIDER_EXPONENT	GENMASK_ULL(16, 13)
> +#define TLX_BURST_MANTISSA		GENMASK_ULL(36, 29)
> +#define TLX_BURST_EXPONENT		GENMASK_ULL(40, 37)
> +
> +static int otx2_qos_update_tx_netdev_queues(struct otx2_nic *pfvf)
> +{
> +	int tx_queues, qos_txqs, err;
> +	struct otx2_hw *hw = &pfvf->hw;
> +
> +	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
> +				 OTX2_QOS_MAX_LEAF_NODES);
> +
> +	tx_queues = hw->tx_queues + qos_txqs;
> +
> +	err = netif_set_real_num_tx_queues(pfvf->netdev, tx_queues);
> +	if (err) {
> +		netdev_err(pfvf->netdev,
> +			   "Failed to set no of Tx queues: %d\n", tx_queues);
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
> +static u64 otx2_qos_convert_rate(u64 rate)
> +{
> +	u64 converted_rate;
> +
> +	/* convert bytes per second to Mbps */
> +	converted_rate = rate * 8;
> +	converted_rate = max_t(u64, converted_rate / 1000000, 1);
> +
> +	return converted_rate;
> +}
> +
> +static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf,
> +				  struct otx2_qos_node *node,
> +				  struct nix_txschq_config *cfg)
> +{
> +	struct otx2_hw *hw = &pfvf->hw;
> +	int num_regs = 0;
> +	u64 maxrate;
> +	u8 level;
> +
> +	level = node->level;
> +
> +	/* program txschq registers */
> +	if (level == NIX_TXSCH_LVL_SMQ) {
> +		cfg->reg[num_regs] = NIX_AF_SMQX_CFG(node->schq);
> +		cfg->regval[num_regs] = ((u64)pfvf->tx_max_pktlen << 8) |
> +					OTX2_MIN_MTU;
> +		cfg->regval[num_regs] |= (0x20ULL << 51) | (0x80ULL << 39) |
> +					 (0x2ULL << 36);
> +		num_regs++;
> +
> +		/* configure parent txschq */
> +		cfg->reg[num_regs] = NIX_AF_MDQX_PARENT(node->schq);
> +		cfg->regval[num_regs] = node->parent->schq << 16;
> +		num_regs++;
> +
> +		/* configure prio/quantum */
> +		if (node->qid == OTX2_QOS_QID_NONE) {
> +			cfg->reg[num_regs] = NIX_AF_MDQX_SCHEDULE(node->schq);
> +			cfg->regval[num_regs] =  node->prio << 24 |
> +						 mtu_to_dwrr_weight(pfvf,
> +								    pfvf->tx_max_pktlen);
> +			num_regs++;
> +			goto txschq_cfg_out;
> +		}
> +
> +		/* configure prio */
> +		cfg->reg[num_regs] = NIX_AF_MDQX_SCHEDULE(node->schq);
> +		cfg->regval[num_regs] = (node->schq -
> +					 node->parent->prio_anchor) << 24;
> +		num_regs++;
> +
> +		/* configure PIR */
> +		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
> +
> +		cfg->reg[num_regs] = NIX_AF_MDQX_PIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> +		num_regs++;
> +
> +		/* configure CIR */
> +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> +			/* Don't configure CIR when both CIR+PIR not supported
> +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
> +			 */
> +			goto txschq_cfg_out;
> +		}
> +
> +		cfg->reg[num_regs] = NIX_AF_MDQX_CIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
> +		num_regs++;
> +	} else if (level == NIX_TXSCH_LVL_TL4) {
> +		/* configure parent txschq */
> +		cfg->reg[num_regs] = NIX_AF_TL4X_PARENT(node->schq);
> +		cfg->regval[num_regs] = node->parent->schq << 16;
> +		num_regs++;
> +
> +		/* return if not htb node */
> +		if (node->qid == OTX2_QOS_QID_NONE) {
> +			cfg->reg[num_regs] = NIX_AF_TL4X_SCHEDULE(node->schq);
> +			cfg->regval[num_regs] =  node->prio << 24 |
> +						 mtu_to_dwrr_weight(pfvf,
> +								    pfvf->tx_max_pktlen);
> +			num_regs++;
> +			goto txschq_cfg_out;
> +		}
> +
> +		/* configure priority */
> +		cfg->reg[num_regs] = NIX_AF_TL4X_SCHEDULE(node->schq);
> +		cfg->regval[num_regs] = (node->schq -
> +					 node->parent->prio_anchor) << 24;
> +		num_regs++;
> +
> +		/* configure PIR */
> +		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
> +		cfg->reg[num_regs] = NIX_AF_TL4X_PIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> +		num_regs++;
> +
> +		/* configure CIR */
> +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> +			/* Don't configure CIR when both CIR+PIR not supported
> +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
> +			 */
> +			goto txschq_cfg_out;
> +		}
> +
> +		cfg->reg[num_regs] = NIX_AF_TL4X_CIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
> +		num_regs++;
> +	} else if (level == NIX_TXSCH_LVL_TL3) {
> +		/* configure parent txschq */
> +		cfg->reg[num_regs] = NIX_AF_TL3X_PARENT(node->schq);
> +		cfg->regval[num_regs] = node->parent->schq << 16;
> +		num_regs++;
> +
> +		/* configure link cfg */
> +		if (level == pfvf->qos.link_cfg_lvl) {
> +			cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
> +			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
> +			num_regs++;
> +		}
> +
> +		/* return if not htb node */
> +		if (node->qid == OTX2_QOS_QID_NONE) {
> +			cfg->reg[num_regs] = NIX_AF_TL3X_SCHEDULE(node->schq);
> +			cfg->regval[num_regs] =  node->prio << 24 |
> +						 mtu_to_dwrr_weight(pfvf,
> +								    pfvf->tx_max_pktlen);
> +			num_regs++;
> +			goto txschq_cfg_out;
> +		}
> +
> +		/* configure priority */
> +		cfg->reg[num_regs] = NIX_AF_TL3X_SCHEDULE(node->schq);
> +		cfg->regval[num_regs] = (node->schq -
> +					 node->parent->prio_anchor) << 24;
> +		num_regs++;
> +
> +		/* configure PIR */
> +		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
> +		cfg->reg[num_regs] = NIX_AF_TL3X_PIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> +		num_regs++;
> +
> +		/* configure CIR */
> +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> +			/* Don't configure CIR when both CIR+PIR not supported
> +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
> +			 */
> +			goto txschq_cfg_out;
> +		}
> +
> +		cfg->reg[num_regs] = NIX_AF_TL3X_CIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
> +		num_regs++;
> +	} else if (level == NIX_TXSCH_LVL_TL2) {
> +		/* configure parent txschq */
> +		cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq);
> +		cfg->regval[num_regs] = hw->tx_link << 16;
> +		num_regs++;
> +
> +		/* configure link cfg */
> +		if (level == pfvf->qos.link_cfg_lvl) {
> +			cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
> +			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
> +			num_regs++;
> +		}
> +
> +		/* return if not htb node */
> +		if (node->qid == OTX2_QOS_QID_NONE) {
> +			cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
> +			cfg->regval[num_regs] =  node->prio << 24 |
> +						 mtu_to_dwrr_weight(pfvf,
> +								    pfvf->tx_max_pktlen);
> +			num_regs++;
> +			goto txschq_cfg_out;
> +		}
> +
> +		/* check if node is root */
> +		if (node->qid == OTX2_QOS_QID_INNER && !node->parent) {
> +			cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
> +			cfg->regval[num_regs] =  TXSCH_TL1_DFLT_RR_PRIO << 24 |
> +						 mtu_to_dwrr_weight(pfvf,
> +								    pfvf->tx_max_pktlen);
> +			num_regs++;
> +			goto txschq_cfg_out;
> +		}
> +
> +		/* configure priority/quantum */
> +		cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
> +		cfg->regval[num_regs] = (node->schq -
> +					 node->parent->prio_anchor) << 24;
> +		num_regs++;
> +
> +		/* configure PIR */
> +		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
> +		cfg->reg[num_regs] = NIX_AF_TL2X_PIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> +		num_regs++;
> +
> +		/* configure CIR */
> +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> +			/* Don't configure CIR when both CIR+PIR not supported
> +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
> +			 */
> +			goto txschq_cfg_out;
> +		}
> +
> +		cfg->reg[num_regs] = NIX_AF_TL2X_CIR(node->schq);
> +		cfg->regval[num_regs] =
> +			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
> +		num_regs++;
> +	}
> +
> +txschq_cfg_out:
> +	cfg->num_regs = num_regs;
> +}
> +
> +static int otx2_qos_txschq_set_parent_topology(struct otx2_nic *pfvf,
> +					       struct otx2_qos_node *parent)
> +{
> +	struct mbox *mbox = &pfvf->mbox;
> +	struct nix_txschq_config *cfg;
> +	int rc;
> +
> +	if (parent->level == NIX_TXSCH_LVL_MDQ)
> +		return 0;
> +
> +	mutex_lock(&mbox->lock);
> +
> +	cfg = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
> +	if (!cfg) {
> +		mutex_unlock(&mbox->lock);
> +		return -ENOMEM;
> +	}
> +
> +	cfg->lvl = parent->level;
> +
> +	if (parent->level == NIX_TXSCH_LVL_TL4)
> +		cfg->reg[0] = NIX_AF_TL4X_TOPOLOGY(parent->schq);
> +	else if (parent->level == NIX_TXSCH_LVL_TL3)
> +		cfg->reg[0] = NIX_AF_TL3X_TOPOLOGY(parent->schq);
> +	else if (parent->level == NIX_TXSCH_LVL_TL2)
> +		cfg->reg[0] = NIX_AF_TL2X_TOPOLOGY(parent->schq);
> +	else if (parent->level == NIX_TXSCH_LVL_TL1)
> +		cfg->reg[0] = NIX_AF_TL1X_TOPOLOGY(parent->schq);
> +
> +	cfg->regval[0] = (u64)parent->prio_anchor << 32;
> +	if (parent->level == NIX_TXSCH_LVL_TL1)
> +		cfg->regval[0] |= (u64)TXSCH_TL1_DFLT_RR_PRIO << 1;
> +
> +	cfg->num_regs++;
> +
> +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> +
> +	mutex_unlock(&mbox->lock);
> +
> +	return rc;
> +}
> +
> +static void otx2_qos_free_hw_node_schq(struct otx2_nic *pfvf,
> +				       struct otx2_qos_node *parent)
> +{
> +	struct otx2_qos_node *node;
> +
> +	list_for_each_entry_reverse(node, &parent->child_schq_list, list)
> +		otx2_txschq_free_one(pfvf, node->level, node->schq);
> +}
> +
> +static void otx2_qos_free_hw_node(struct otx2_nic *pfvf,
> +				  struct otx2_qos_node *parent)
> +{
> +	struct otx2_qos_node *node, *tmp;
> +
> +	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
> +		otx2_qos_free_hw_node(pfvf, node);
> +		otx2_qos_free_hw_node_schq(pfvf, node);
> +		otx2_txschq_free_one(pfvf, node->level, node->schq);
> +	}
> +}
> +
> +static void otx2_qos_free_hw_cfg(struct otx2_nic *pfvf,
> +				 struct otx2_qos_node *node)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +
> +	/* free child node hw mappings */
> +	otx2_qos_free_hw_node(pfvf, node);
> +	otx2_qos_free_hw_node_schq(pfvf, node);
> +
> +	/* free node hw mappings */
> +	otx2_txschq_free_one(pfvf, node->level, node->schq);
> +
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static void otx2_qos_sw_node_delete(struct otx2_nic *pfvf,
> +				    struct otx2_qos_node *node)
> +{
> +	hash_del(&node->hlist);
> +
> +	if (node->qid != OTX2_QOS_QID_INNER && node->qid != OTX2_QOS_QID_NONE) {
> +		__clear_bit(node->qid, pfvf->qos.qos_sq_bmap);
> +		otx2_qos_update_tx_netdev_queues(pfvf);
> +	}
> +
> +	list_del(&node->list);
> +	kfree(node);
> +}
> +
> +static void otx2_qos_free_sw_node_schq(struct otx2_nic *pfvf,
> +				       struct otx2_qos_node *parent)
> +{
> +	struct otx2_qos_node *node, *tmp;
> +
> +	list_for_each_entry_safe(node, tmp, &parent->child_schq_list, list) {
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +}
> +
> +static void __otx2_qos_free_sw_node(struct otx2_nic *pfvf,
> +				    struct otx2_qos_node *parent)
> +{
> +	struct otx2_qos_node *node, *tmp;
> +
> +	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
> +		__otx2_qos_free_sw_node(pfvf, node);
> +		otx2_qos_free_sw_node_schq(pfvf, node);
> +		otx2_qos_sw_node_delete(pfvf, node);
> +	}
> +}
> +
> +static void otx2_qos_free_sw_node(struct otx2_nic *pfvf,
> +				  struct otx2_qos_node *node)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +
> +	__otx2_qos_free_sw_node(pfvf, node);
> +	otx2_qos_free_sw_node_schq(pfvf, node);
> +	otx2_qos_sw_node_delete(pfvf, node);
> +
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static void otx2_qos_destroy_node(struct otx2_nic *pfvf,
> +				  struct otx2_qos_node *node)
> +{
> +	otx2_qos_free_hw_cfg(pfvf, node);
> +	otx2_qos_free_sw_node(pfvf, node);
> +}
> +
> +static void otx2_qos_fill_cfg_schq(struct otx2_qos_node *parent,
> +				   struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *node;
> +
> +	list_for_each_entry(node, &parent->child_schq_list, list)
> +		cfg->schq[node->level]++;
> +}
> +
> +static void otx2_qos_fill_cfg_tl(struct otx2_qos_node *parent,
> +				 struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *node;
> +
> +	list_for_each_entry(node, &parent->child_list, list) {
> +		otx2_qos_fill_cfg_tl(node, cfg);
> +		cfg->schq_contig[node->level]++;
> +		otx2_qos_fill_cfg_schq(node, cfg);
> +	}
> +}
> +
> +static void otx2_qos_prepare_txschq_cfg(struct otx2_nic *pfvf,
> +					struct otx2_qos_node *parent,
> +					struct otx2_qos_cfg *cfg)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	otx2_qos_fill_cfg_tl(parent, cfg);
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static void otx2_qos_read_txschq_cfg_schq(struct otx2_qos_node *parent,
> +					  struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *node;
> +	int cnt;
> +
> +	list_for_each_entry(node, &parent->child_schq_list, list) {
> +		cnt = cfg->dwrr_node_pos[node->level];
> +		cfg->schq_list[node->level][cnt] = node->schq;
> +		cfg->schq[node->level]++;
> +		cfg->dwrr_node_pos[node->level]++;
> +	}
> +}
> +
> +static void otx2_qos_read_txschq_cfg_tl(struct otx2_qos_node *parent,
> +					struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *node;
> +	int cnt;
> +
> +	list_for_each_entry(node, &parent->child_list, list) {
> +		otx2_qos_read_txschq_cfg_tl(node, cfg);
> +		cnt = cfg->static_node_pos[node->level];
> +		cfg->schq_contig_list[node->level][cnt] = node->schq;
> +		cfg->schq_contig[node->level]++;
> +		cfg->static_node_pos[node->level]++;
> +		otx2_qos_read_txschq_cfg_schq(node, cfg);
> +	}
> +}
> +
> +static void otx2_qos_read_txschq_cfg(struct otx2_nic *pfvf,
> +				     struct otx2_qos_node *node,
> +				     struct otx2_qos_cfg *cfg)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	otx2_qos_read_txschq_cfg_tl(node, cfg);
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static struct otx2_qos_node *
> +otx2_qos_alloc_root(struct otx2_nic *pfvf)
> +{
> +	struct otx2_qos_node *node;
> +
> +	node = kzalloc(sizeof(*node), GFP_KERNEL);
> +	if (!node)
> +		return ERR_PTR(-ENOMEM);
> +
> +	node->parent = NULL;
> +	if (!is_otx2_vf(pfvf->pcifunc))
> +		node->level = NIX_TXSCH_LVL_TL1;
> +	else
> +		node->level = NIX_TXSCH_LVL_TL2;
> +
> +	node->qid = OTX2_QOS_QID_INNER;
> +	node->classid = OTX2_QOS_ROOT_CLASSID;
> +
> +	hash_add(pfvf->qos.qos_hlist, &node->hlist, node->classid);
> +	list_add_tail(&node->list, &pfvf->qos.qos_tree);
> +	INIT_LIST_HEAD(&node->child_list);
> +	INIT_LIST_HEAD(&node->child_schq_list);
> +
> +	return node;
> +}
> +
> +static int otx2_qos_add_child_node(struct otx2_qos_node *parent,
> +				   struct otx2_qos_node *node)
> +{
> +	struct list_head *head = &parent->child_list;
> +	struct otx2_qos_node *tmp_node;
> +	struct list_head *tmp;
> +
> +	for (tmp = head->next; tmp != head; tmp = tmp->next) {
> +		tmp_node = list_entry(tmp, struct otx2_qos_node, list);
> +		if (tmp_node->prio == node->prio)
> +			return -EEXIST;
> +		if (tmp_node->prio > node->prio) {
> +			list_add_tail(&node->list, tmp);
> +			return 0;
> +		}
> +	}
> +
> +	list_add_tail(&node->list, head);
> +	return 0;
> +}
> +
> +static int otx2_qos_alloc_txschq_node(struct otx2_nic *pfvf,
> +				      struct otx2_qos_node *node)
> +{
> +	struct otx2_qos_node *txschq_node, *parent, *tmp;
> +	int lvl;
> +
> +	parent = node;
> +	for (lvl = node->level - 1; lvl >= NIX_TXSCH_LVL_MDQ; lvl--) {
> +		txschq_node = kzalloc(sizeof(*txschq_node), GFP_KERNEL);
> +		if (!txschq_node)
> +			goto err_out;
> +
> +		txschq_node->parent = parent;
> +		txschq_node->level = lvl;
> +		txschq_node->classid = OTX2_QOS_CLASS_NONE;
> +		txschq_node->qid = OTX2_QOS_QID_NONE;
> +		txschq_node->rate = 0;
> +		txschq_node->ceil = 0;
> +		txschq_node->prio = 0;
> +
> +		mutex_lock(&pfvf->qos.qos_lock);
> +		list_add_tail(&txschq_node->list, &node->child_schq_list);
> +		mutex_unlock(&pfvf->qos.qos_lock);
> +
> +		INIT_LIST_HEAD(&txschq_node->child_list);
> +		INIT_LIST_HEAD(&txschq_node->child_schq_list);
> +		parent = txschq_node;
> +	}
> +
> +	return 0;
> +
> +err_out:
> +	list_for_each_entry_safe(txschq_node, tmp, &node->child_schq_list,
> +				 list) {
> +		list_del(&txschq_node->list);
> +		kfree(txschq_node);
> +	}
> +	return -ENOMEM;
> +}
> +
> +static struct otx2_qos_node *
> +otx2_qos_sw_create_leaf_node(struct otx2_nic *pfvf,
> +			     struct otx2_qos_node *parent,
> +			     u16 classid, u32 prio, u64 rate, u64 ceil,
> +			     u16 qid)
> +{
> +	struct otx2_qos_node *node;
> +	int err;
> +
> +	node = kzalloc(sizeof(*node), GFP_KERNEL);
> +	if (!node)
> +		return ERR_PTR(-ENOMEM);
> +
> +	node->parent = parent;
> +	node->level = parent->level - 1;
> +	node->classid = classid;
> +	node->qid = qid;
> +	node->rate = otx2_qos_convert_rate(rate);
> +	node->ceil = otx2_qos_convert_rate(ceil);
> +	node->prio = prio;
> +
> +	__set_bit(qid, pfvf->qos.qos_sq_bmap);
> +
> +	hash_add(pfvf->qos.qos_hlist, &node->hlist, classid);
> +
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	err = otx2_qos_add_child_node(parent, node);
> +	if (err) {
> +		mutex_unlock(&pfvf->qos.qos_lock);
> +		return ERR_PTR(err);
> +	}
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +
> +	INIT_LIST_HEAD(&node->child_list);
> +	INIT_LIST_HEAD(&node->child_schq_list);

Looks suspicious that some fields of node are initialized after
otx2_qos_add_child_node is called.

> +
> +	err = otx2_qos_alloc_txschq_node(pfvf, node);
> +	if (err) {
> +		otx2_qos_sw_node_delete(pfvf, node);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	return node;
> +}
> +
> +static struct otx2_qos_node *
> +otx2_sw_node_find(struct otx2_nic *pfvf, u32 classid)
> +{
> +	struct otx2_qos_node *node = NULL;
> +
> +	hash_for_each_possible(pfvf->qos.qos_hlist, node, hlist, classid) {

This loop may be called from ndo_select_queue, while another thread may
modify qos_hlist. We use RCU in mlx5e to protect this structure. What
protects it in your driver?

> +		if (node->classid == classid)
> +			break;
> +	}
> +
> +	return node;
> +}
> +
> +int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid)
> +{
> +	struct otx2_qos_node *node;
> +	u16 qid;
> +	int res;
> +
> +	node = otx2_sw_node_find(pfvf, classid);
> +	if (IS_ERR(node)) {
> +		res = -ENOENT;
> +		goto out;
> +	}
> +	qid = READ_ONCE(node->qid);
> +	if (qid == OTX2_QOS_QID_INNER) {
> +		res = -EINVAL;
> +		goto out;
> +	}
> +	res = pfvf->hw.tx_queues + qid;
> +out:
> +	return res;
> +}
> +
> +static int
> +otx2_qos_txschq_config(struct otx2_nic *pfvf, struct otx2_qos_node *node)
> +{
> +	struct mbox *mbox = &pfvf->mbox;
> +	struct nix_txschq_config *req;
> +	int rc;
> +
> +	mutex_lock(&mbox->lock);
> +
> +	req = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
> +	if (!req) {
> +		mutex_unlock(&mbox->lock);
> +		return -ENOMEM;
> +	}
> +
> +	req->lvl = node->level;
> +	__otx2_qos_txschq_cfg(pfvf, node, req);
> +
> +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> +
> +	mutex_unlock(&mbox->lock);
> +
> +	return rc;
> +}
> +
> +static int otx2_qos_txschq_alloc(struct otx2_nic *pfvf,
> +				 struct otx2_qos_cfg *cfg)
> +{
> +	struct nix_txsch_alloc_req *req;
> +	struct nix_txsch_alloc_rsp *rsp;
> +	struct mbox *mbox = &pfvf->mbox;
> +	int lvl, rc, schq;
> +
> +	mutex_lock(&mbox->lock);
> +	req = otx2_mbox_alloc_msg_nix_txsch_alloc(&pfvf->mbox);
> +	if (!req) {
> +		mutex_unlock(&mbox->lock);
> +		return -ENOMEM;
> +	}
> +
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> +		req->schq[lvl] = cfg->schq[lvl];
> +		req->schq_contig[lvl] = cfg->schq_contig[lvl];
> +	}
> +
> +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> +	if (rc) {
> +		mutex_unlock(&mbox->lock);
> +		return rc;
> +	}
> +
> +	rsp = (struct nix_txsch_alloc_rsp *)
> +	      otx2_mbox_get_rsp(&pfvf->mbox.mbox, 0, &req->hdr);
> +
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> +		for (schq = 0; schq < rsp->schq_contig[lvl]; schq++) {
> +			cfg->schq_contig_list[lvl][schq] =
> +				rsp->schq_contig_list[lvl][schq];
> +		}
> +	}
> +
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> +		for (schq = 0; schq < rsp->schq[lvl]; schq++) {
> +			cfg->schq_list[lvl][schq] =
> +				rsp->schq_list[lvl][schq];
> +		}
> +	}
> +
> +	pfvf->qos.link_cfg_lvl = rsp->link_cfg_lvl;
> +
> +	mutex_unlock(&mbox->lock);
> +
> +	return rc;
> +}
> +
> +static void otx2_qos_txschq_fill_cfg_schq(struct otx2_nic *pfvf,
> +					  struct otx2_qos_node *node,
> +					  struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *tmp;
> +	int cnt;
> +
> +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> +		cnt = cfg->dwrr_node_pos[tmp->level];
> +		tmp->schq = cfg->schq_list[tmp->level][cnt];
> +		cfg->dwrr_node_pos[tmp->level]++;
> +	}
> +}
> +
> +static void otx2_qos_txschq_fill_cfg_tl(struct otx2_nic *pfvf,
> +					struct otx2_qos_node *node,
> +					struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *tmp;
> +	int cnt;
> +
> +	list_for_each_entry(tmp, &node->child_list, list) {
> +		otx2_qos_txschq_fill_cfg_tl(pfvf, tmp, cfg);
> +		cnt = cfg->static_node_pos[tmp->level];
> +		tmp->schq = cfg->schq_contig_list[tmp->level][cnt];
> +		if (cnt == 0)
> +			node->prio_anchor = tmp->schq;
> +		cfg->static_node_pos[tmp->level]++;
> +		otx2_qos_txschq_fill_cfg_schq(pfvf, tmp, cfg);
> +	}
> +}
> +
> +static void otx2_qos_txschq_fill_cfg(struct otx2_nic *pfvf,
> +				     struct otx2_qos_node *node,
> +				     struct otx2_qos_cfg *cfg)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	otx2_qos_txschq_fill_cfg_tl(pfvf, node, cfg);
> +	otx2_qos_txschq_fill_cfg_schq(pfvf, node, cfg);
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static int otx2_qos_txschq_push_cfg_schq(struct otx2_nic *pfvf,
> +					 struct otx2_qos_node *node,
> +					 struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *tmp;
> +	int ret = 0;
> +
> +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> +		ret = otx2_qos_txschq_config(pfvf, tmp);
> +		if (ret)
> +			return -EIO;
> +		ret = otx2_qos_txschq_set_parent_topology(pfvf, tmp->parent);
> +		if (ret)
> +			return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_txschq_push_cfg_tl(struct otx2_nic *pfvf,
> +				       struct otx2_qos_node *node,
> +				       struct otx2_qos_cfg *cfg)
> +{
> +	struct otx2_qos_node *tmp;
> +	int ret;
> +
> +	list_for_each_entry(tmp, &node->child_list, list) {
> +		ret = otx2_qos_txschq_push_cfg_tl(pfvf, tmp, cfg);
> +		if (ret)
> +			return -EIO;
> +		ret = otx2_qos_txschq_config(pfvf, tmp);
> +		if (ret)
> +			return -EIO;
> +		ret = otx2_qos_txschq_push_cfg_schq(pfvf, tmp, cfg);
> +		if (ret)
> +			return -EIO;
> +	}
> +
> +	ret = otx2_qos_txschq_set_parent_topology(pfvf, node);
> +	if (ret)
> +		return -EIO;
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_txschq_push_cfg(struct otx2_nic *pfvf,
> +				    struct otx2_qos_node *node,
> +				    struct otx2_qos_cfg *cfg)
> +{
> +	int ret;
> +
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	ret = otx2_qos_txschq_push_cfg_tl(pfvf, node, cfg);
> +	if (ret)
> +		goto out;
> +	ret = otx2_qos_txschq_push_cfg_schq(pfvf, node, cfg);
> +out:
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +	return ret;
> +}
> +
> +static int otx2_qos_txschq_update_config(struct otx2_nic *pfvf,
> +					 struct otx2_qos_node *node,
> +					 struct otx2_qos_cfg *cfg)
> +{
> +	otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
> +
> +	return otx2_qos_txschq_push_cfg(pfvf, node, cfg);
> +}
> +
> +static int otx2_qos_txschq_update_root_cfg(struct otx2_nic *pfvf,
> +					   struct otx2_qos_node *root,
> +					   struct otx2_qos_cfg *cfg)
> +{
> +	root->schq = cfg->schq_list[root->level][0];
> +	return otx2_qos_txschq_config(pfvf, root);
> +}
> +
> +static void otx2_qos_free_cfg(struct otx2_nic *pfvf, struct otx2_qos_cfg *cfg)
> +{
> +	int lvl, idx, schq;
> +
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> +		for (idx = 0; idx < cfg->schq[lvl]; idx++) {
> +			schq = cfg->schq_list[lvl][idx];
> +			otx2_txschq_free_one(pfvf, lvl, schq);
> +		}
> +	}
> +
> +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> +		for (idx = 0; idx < cfg->schq_contig[lvl]; idx++) {
> +			schq = cfg->schq_contig_list[lvl][idx];
> +			otx2_txschq_free_one(pfvf, lvl, schq);
> +		}
> +	}
> +}
> +
> +static void otx2_qos_enadis_sq(struct otx2_nic *pfvf,
> +			       struct otx2_qos_node *node,
> +			       u16 qid)
> +{
> +	if (pfvf->qos.qid_to_sqmap[qid] != OTX2_QOS_INVALID_SQ)
> +		otx2_qos_disable_sq(pfvf, qid);
> +
> +	pfvf->qos.qid_to_sqmap[qid] = node->schq;
> +	otx2_qos_enable_sq(pfvf, qid);
> +}
> +
> +static void otx2_qos_update_smq_schq(struct otx2_nic *pfvf,
> +				     struct otx2_qos_node *node,
> +				     bool action)
> +{
> +	struct otx2_qos_node *tmp;
> +
> +	if (node->qid == OTX2_QOS_QID_INNER)
> +		return;
> +
> +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> +		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
> +			if (action == QOS_SMQ_FLUSH)
> +				otx2_smq_flush(pfvf, tmp->schq);
> +			else
> +				otx2_qos_enadis_sq(pfvf, tmp, node->qid);
> +		}
> +	}
> +}
> +
> +static void __otx2_qos_update_smq(struct otx2_nic *pfvf,
> +				  struct otx2_qos_node *node,
> +				  bool action)
> +{
> +	struct otx2_qos_node *tmp;
> +
> +	list_for_each_entry(tmp, &node->child_list, list) {
> +		__otx2_qos_update_smq(pfvf, tmp, action);
> +		if (tmp->qid == OTX2_QOS_QID_INNER)
> +			continue;
> +		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
> +			if (action == QOS_SMQ_FLUSH)
> +				otx2_smq_flush(pfvf, tmp->schq);
> +			else
> +				otx2_qos_enadis_sq(pfvf, tmp, tmp->qid);
> +		} else {
> +			otx2_qos_update_smq_schq(pfvf, tmp, action);
> +		}
> +	}
> +}
> +
> +static void otx2_qos_update_smq(struct otx2_nic *pfvf,
> +				struct otx2_qos_node *node,
> +				bool action)
> +{
> +	mutex_lock(&pfvf->qos.qos_lock);
> +	__otx2_qos_update_smq(pfvf, node, action);
> +	otx2_qos_update_smq_schq(pfvf, node, action);
> +	mutex_unlock(&pfvf->qos.qos_lock);
> +}
> +
> +static int otx2_qos_push_txschq_cfg(struct otx2_nic *pfvf,
> +				    struct otx2_qos_node *node,
> +				    struct otx2_qos_cfg *cfg)
> +{
> +	int ret = 0;
> +
> +	ret = otx2_qos_txschq_alloc(pfvf, cfg);
> +	if (ret)
> +		return -ENOSPC;
> +
> +	if (!(pfvf->netdev->flags & IFF_UP)) {
> +		otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
> +		return 0;
> +	}
> +
> +	ret = otx2_qos_txschq_update_config(pfvf, node, cfg);
> +	if (ret) {
> +		otx2_qos_free_cfg(pfvf, cfg);
> +		return -EIO;
> +	}
> +
> +	otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_update_tree(struct otx2_nic *pfvf,
> +				struct otx2_qos_node *node,
> +				struct otx2_qos_cfg *cfg)
> +{
> +	otx2_qos_prepare_txschq_cfg(pfvf, node->parent, cfg);
> +	return otx2_qos_push_txschq_cfg(pfvf, node->parent, cfg);
> +}
> +
> +static int otx2_qos_root_add(struct otx2_nic *pfvf, u16 htb_maj_id, u16 htb_defcls,
> +			     struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_cfg *new_cfg;
> +	struct otx2_qos_node *root;
> +	int err;
> +
> +	netdev_dbg(pfvf->netdev,
> +		   "TC_HTB_CREATE: handle=0x%x defcls=0x%x\n",
> +		   htb_maj_id, htb_defcls);
> +
> +	INIT_LIST_HEAD(&pfvf->qos.qos_tree);
> +	mutex_init(&pfvf->qos.qos_lock);
> +
> +	root = otx2_qos_alloc_root(pfvf);
> +	if (IS_ERR(root)) {
> +		mutex_destroy(&pfvf->qos.qos_lock);
> +		err = PTR_ERR(root);
> +		return err;
> +	}
> +
> +	/* allocate txschq queue */
> +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> +	if (!new_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		mutex_destroy(&pfvf->qos.qos_lock);
> +		return -ENOMEM;
> +	}
> +	/* allocate htb root node */
> +	new_cfg->schq[root->level] = 1;
> +	err = otx2_qos_txschq_alloc(pfvf, new_cfg);
> +	if (err) {
> +		NL_SET_ERR_MSG_MOD(extack, "Error allocating txschq");
> +		goto free_root_node;
> +	}
> +
> +	if (!(pfvf->netdev->flags & IFF_UP) ||
> +	    root->level == NIX_TXSCH_LVL_TL1) {
> +		root->schq = new_cfg->schq_list[root->level][0];
> +		goto out;
> +	}
> +
> +	/* update the txschq configuration in hw */
> +	err = otx2_qos_txschq_update_root_cfg(pfvf, root, new_cfg);
> +	if (err) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Error updating txschq configuration");
> +		goto txschq_free;
> +	}
> +
> +out:
> +	WRITE_ONCE(pfvf->qos.defcls, htb_defcls);
> +	smp_store_release(&pfvf->qos.maj_id, htb_maj_id); /* barrier */
> +	kfree(new_cfg);
> +	return 0;
> +
> +txschq_free:
> +	otx2_qos_free_cfg(pfvf, new_cfg);
> +free_root_node:
> +	kfree(new_cfg);
> +	otx2_qos_sw_node_delete(pfvf, root);
> +	mutex_destroy(&pfvf->qos.qos_lock);
> +	return err;
> +}
> +
> +static int otx2_qos_root_destroy(struct otx2_nic *pfvf)
> +{
> +	struct otx2_qos_node *root;
> +
> +	netdev_dbg(pfvf->netdev, "TC_HTB_DESTROY\n");
> +
> +	/* find root node */
> +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> +	if (IS_ERR(root))
> +		return -ENOENT;
> +
> +	/* free the hw mappings */
> +	otx2_qos_destroy_node(pfvf, root);
> +	mutex_destroy(&pfvf->qos.qos_lock);
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_validate_configuration(struct otx2_qos_node *parent,
> +					   struct netlink_ext_ack *extack,
> +					   struct otx2_nic *pfvf,
> +					   u64 prio)
> +{
> +	if (test_bit(prio, parent->prio_bmap)) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Static priority child with same priority exists");
> +		return -EEXIST;
> +	}
> +
> +	if (prio == TXSCH_TL1_DFLT_RR_PRIO) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Priority is reserved for Round Robin");
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_leaf_alloc_queue(struct otx2_nic *pfvf, u16 classid,
> +				     u32 parent_classid, u64 rate, u64 ceil,
> +				     u64 prio, struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_cfg *old_cfg, *new_cfg;
> +	struct otx2_qos_node *node, *parent;
> +	int qid, ret, err;
> +
> +	netdev_dbg(pfvf->netdev,
> +		   "TC_HTB_LEAF_ALLOC_QUEUE: classid=0x%x parent_classid=0x%x rate=%lld ceil=%lld prio=%lld\n",
> +		   classid, parent_classid, rate, ceil, prio);
> +
> +	if (prio > OTX2_QOS_MAX_PRIO) {
> +		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to 7");
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	/* get parent node */
> +	parent = otx2_sw_node_find(pfvf, parent_classid);
> +	if (IS_ERR(parent)) {
> +		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +	if (parent->level == NIX_TXSCH_LVL_MDQ) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB qos max levels reached");
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	ret = otx2_qos_validate_configuration(parent, extack, pfvf, prio);
> +	if (ret)
> +		goto out;
> +
> +	set_bit(prio, parent->prio_bmap);
> +
> +	/* read current txschq configuration */
> +	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
> +	if (!old_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	otx2_qos_read_txschq_cfg(pfvf, parent, old_cfg);
> +
> +	/* allocate a new sq */
> +	qid = otx2_qos_get_qid(pfvf);
> +	if (qid < 0) {
> +		NL_SET_ERR_MSG_MOD(extack, "Reached max supported QOS SQ's");
> +		ret = -ENOMEM;
> +		goto free_old_cfg;
> +	}
> +
> +	/* Actual SQ mapping will be updated after SMQ alloc */
> +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> +
> +	/* allocate and initialize a new child node */
> +	node = otx2_qos_sw_create_leaf_node(pfvf, parent, classid, prio, rate,
> +					    ceil, qid);
> +	if (IS_ERR(node)) {
> +		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf node");
> +		ret = PTR_ERR(node);
> +		goto free_old_cfg;
> +	}
> +
> +	/* push new txschq config to hw */
> +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> +	if (!new_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		ret = -ENOMEM;
> +		goto free_node;
> +	}
> +	ret = otx2_qos_update_tree(pfvf, node, new_cfg);
> +	if (ret) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
> +		kfree(new_cfg);
> +		otx2_qos_sw_node_delete(pfvf, node);
> +		/* restore the old qos tree */
> +		err = otx2_qos_txschq_update_config(pfvf, parent, old_cfg);
> +		if (err) {
> +			netdev_err(pfvf->netdev,
> +				   "Failed to restore txcshq configuration");
> +			goto free_old_cfg;
> +		}
> +
> +		otx2_qos_update_smq(pfvf, parent, QOS_CFG_SQ);
> +		goto free_old_cfg;
> +	}
> +
> +	/* update tx_real_queues */
> +	otx2_qos_update_tx_netdev_queues(pfvf);
> +
> +	/* free new txschq config */
> +	kfree(new_cfg);
> +
> +	/* free old txschq config */
> +	otx2_qos_free_cfg(pfvf, old_cfg);
> +	kfree(old_cfg);
> +
> +	return pfvf->hw.tx_queues + qid;
> +
> +free_node:
> +	otx2_qos_sw_node_delete(pfvf, node);
> +free_old_cfg:
> +	kfree(old_cfg);
> +out:
> +	return ret;
> +}
> +
> +static int otx2_qos_leaf_to_inner(struct otx2_nic *pfvf, u16 classid,
> +				  u16 child_classid, u64 rate, u64 ceil, u64 prio,
> +				  struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_cfg *old_cfg, *new_cfg;
> +	struct otx2_qos_node *node, *child;
> +	int ret, err;
> +	u16 qid;
> +
> +	netdev_dbg(pfvf->netdev,
> +		   "TC_HTB_LEAF_TO_INNER classid %04x, child %04x, rate %llu, ceil %llu\n",
> +		   classid, child_classid, rate, ceil);
> +
> +	if (prio > OTX2_QOS_MAX_PRIO) {
> +		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to 7");
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	/* find node related to classid */
> +	node = otx2_sw_node_find(pfvf, classid);
> +	if (IS_ERR(node)) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +	/* check max qos txschq level */
> +	if (node->level == NIX_TXSCH_LVL_MDQ) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB qos level not supported");
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	set_bit(prio, node->prio_bmap);
> +
> +	/* store the qid to assign to leaf node */
> +	qid = node->qid;
> +
> +	/* read current txschq configuration */
> +	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
> +	if (!old_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	otx2_qos_read_txschq_cfg(pfvf, node, old_cfg);
> +
> +	/* delete the txschq nodes allocated for this node */
> +	otx2_qos_free_sw_node_schq(pfvf, node);
> +
> +	/* mark this node as htb inner node */
> +	node->qid = OTX2_QOS_QID_INNER;

As you can concurrently read node->qid from the datapath
(ndo_select_queue), you should use READ_ONCE/WRITE_ONCE to guarantee
that the value will not be torn; you already use READ_ONCE, but it
doesn't pair with a WRITE_ONCE here.

> +
> +	/* allocate and initialize a new child node */
> +	child = otx2_qos_sw_create_leaf_node(pfvf, node, child_classid,
> +					     prio, rate, ceil, qid);
> +	if (IS_ERR(child)) {
> +		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf node");
> +		ret = PTR_ERR(child);
> +		goto free_old_cfg;
> +	}
> +
> +	/* push new txschq config to hw */
> +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> +	if (!new_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		ret = -ENOMEM;
> +		goto free_node;
> +	}
> +	ret = otx2_qos_update_tree(pfvf, child, new_cfg);
> +	if (ret) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
> +		kfree(new_cfg);
> +		otx2_qos_sw_node_delete(pfvf, child);
> +		/* restore the old qos tree */
> +		node->qid = qid;

Same here; might be somewhere else as well.

> +		err = otx2_qos_alloc_txschq_node(pfvf, node);
> +		if (err) {
> +			netdev_err(pfvf->netdev,
> +				   "Failed to restore old leaf node");
> +			goto free_old_cfg;
> +		}
> +		err = otx2_qos_txschq_update_config(pfvf, node, old_cfg);
> +		if (err) {
> +			netdev_err(pfvf->netdev,
> +				   "Failed to restore txcshq configuration");
> +			goto free_old_cfg;
> +		}
> +		otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
> +		goto free_old_cfg;
> +	}
> +
> +	/* free new txschq config */
> +	kfree(new_cfg);
> +
> +	/* free old txschq config */
> +	otx2_qos_free_cfg(pfvf, old_cfg);
> +	kfree(old_cfg);
> +
> +	return 0;
> +
> +free_node:
> +	otx2_qos_sw_node_delete(pfvf, child);
> +free_old_cfg:
> +	kfree(old_cfg);
> +out:
> +	return ret;
> +}
> +
> +static int otx2_qos_leaf_del(struct otx2_nic *pfvf, u16 *classid,
> +			     struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_node *node, *parent;
> +	u64 prio;
> +	u16 qid;
> +
> +	netdev_dbg(pfvf->netdev, "TC_HTB_LEAF_DEL classid %04x\n", *classid);
> +
> +	/* find node related to classid */
> +	node = otx2_sw_node_find(pfvf, *classid);
> +	if (IS_ERR(node)) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> +		return -ENOENT;
> +	}
> +	parent = node->parent;
> +	prio   = node->prio;
> +	qid    = node->qid;
> +
> +	otx2_qos_disable_sq(pfvf, node->qid);
> +
> +	otx2_qos_destroy_node(pfvf, node);
> +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> +
> +	clear_bit(prio, parent->prio_bmap);
> +
> +	return 0;
> +}
> +
> +static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force,
> +				  struct netlink_ext_ack *extack)
> +{
> +	struct otx2_qos_node *node, *parent;
> +	struct otx2_qos_cfg *new_cfg;
> +	u64 prio;
> +	int err;
> +	u16 qid;
> +
> +	netdev_dbg(pfvf->netdev,
> +		   "TC_HTB_LEAF_DEL_LAST classid %04x\n", classid);
> +
> +	/* find node related to classid */
> +	node = otx2_sw_node_find(pfvf, classid);
> +	if (IS_ERR(node)) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> +		return -ENOENT;
> +	}
> +
> +	/* save qid for use by parent */
> +	qid = node->qid;
> +	prio = node->prio;
> +
> +	parent = otx2_sw_node_find(pfvf, node->parent->classid);
> +	if (IS_ERR(parent)) {
> +		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
> +		return -ENOENT;
> +	}
> +
> +	/* destroy the leaf node */
> +	otx2_qos_destroy_node(pfvf, node);
> +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> +
> +	clear_bit(prio, parent->prio_bmap);
> +
> +	/* create downstream txschq entries to parent */
> +	err = otx2_qos_alloc_txschq_node(pfvf, parent);
> +	if (err) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB failed to create txsch configuration");
> +		return err;
> +	}
> +	parent->qid = qid;
> +	__set_bit(qid, pfvf->qos.qos_sq_bmap);
> +
> +	/* push new txschq config to hw */
> +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> +	if (!new_cfg) {
> +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
> +		return -ENOMEM;
> +	}
> +	/* fill txschq cfg and push txschq cfg to hw */
> +	otx2_qos_fill_cfg_schq(parent, new_cfg);
> +	err = otx2_qos_push_txschq_cfg(pfvf, parent, new_cfg);
> +	if (err) {
> +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
> +		kfree(new_cfg);
> +		return err;
> +	}
> +	kfree(new_cfg);
> +
> +	/* update tx_real_queues */
> +	otx2_qos_update_tx_netdev_queues(pfvf);
> +
> +	return 0;
> +}
> +
> +void otx2_clean_qos_queues(struct otx2_nic *pfvf)
> +{
> +	struct otx2_qos_node *root;
> +
> +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> +	if (IS_ERR(root))
> +		return;
> +
> +	otx2_qos_update_smq(pfvf, root, QOS_SMQ_FLUSH);
> +}
> +
> +void otx2_qos_config_txschq(struct otx2_nic *pfvf)
> +{
> +	struct otx2_qos_node *root;
> +	int err;
> +
> +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> +	if (IS_ERR(root))
> +		return;
> +
> +	err = otx2_qos_txschq_config(pfvf, root);
> +	if (err) {
> +		netdev_err(pfvf->netdev, "Error update txschq configuration\n");
> +		goto root_destroy;
> +	}
> +
> +	err = otx2_qos_txschq_push_cfg_tl(pfvf, root, NULL);
> +	if (err) {
> +		netdev_err(pfvf->netdev, "Error update txschq configuration\n");
> +		goto root_destroy;
> +	}
> +
> +	otx2_qos_update_smq(pfvf, root, QOS_CFG_SQ);
> +	return;
> +
> +root_destroy:
> +	netdev_err(pfvf->netdev, "Failed to update Scheduler/Shaping config in Hardware\n");
> +	/* Free resources allocated */
> +	otx2_qos_root_destroy(pfvf);
> +}
> +
> +int otx2_setup_tc_htb(struct net_device *ndev, struct tc_htb_qopt_offload *htb)
> +{
> +	struct otx2_nic *pfvf = netdev_priv(ndev);
> +	int res;
> +
> +	switch (htb->command) {
> +	case TC_HTB_CREATE:
> +		return otx2_qos_root_add(pfvf, htb->parent_classid,
> +					 htb->classid, htb->extack);
> +	case TC_HTB_DESTROY:
> +		return otx2_qos_root_destroy(pfvf);
> +	case TC_HTB_LEAF_ALLOC_QUEUE:
> +		res = otx2_qos_leaf_alloc_queue(pfvf, htb->classid,
> +						htb->parent_classid,
> +						htb->rate, htb->ceil,
> +						htb->prio, htb->extack);
> +		if (res < 0)
> +			return res;
> +		htb->qid = res;
> +		return 0;
> +	case TC_HTB_LEAF_TO_INNER:
> +		return otx2_qos_leaf_to_inner(pfvf, htb->parent_classid,
> +					      htb->classid, htb->rate,
> +					      htb->ceil, htb->prio,
> +					      htb->extack);
> +	case TC_HTB_LEAF_DEL:
> +		return otx2_qos_leaf_del(pfvf, &htb->classid, htb->extack);
> +	case TC_HTB_LEAF_DEL_LAST:
> +	case TC_HTB_LEAF_DEL_LAST_FORCE:
> +		return otx2_qos_leaf_del_last(pfvf, htb->classid,
> +				htb->command == TC_HTB_LEAF_DEL_LAST_FORCE,
> +					      htb->extack);
> +	case TC_HTB_LEAF_QUERY_QUEUE:
> +		res = otx2_get_txq_by_classid(pfvf, htb->classid);
> +		htb->qid = res;
> +		return 0;
> +	case TC_HTB_NODE_MODIFY:
> +		fallthrough;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> index 73a62d092e99..26de1af2aa57 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> @@ -7,13 +7,63 @@
>  #ifndef OTX2_QOS_H
>  #define OTX2_QOS_H
>  
> +#include <linux/types.h>
> +#include <linux/netdevice.h>
> +#include <linux/rhashtable.h>
> +
> +#define OTX2_QOS_MAX_LVL		4
> +#define OTX2_QOS_MAX_PRIO		7
>  #define OTX2_QOS_MAX_LEAF_NODES                16
>  
> -int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq);
> -void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq);
> +enum qos_smq_operations {
> +	QOS_CFG_SQ,
> +	QOS_SMQ_FLUSH,
> +};
> +
> +u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic, u64 maxrate, u32 burst);
> +
> +int otx2_setup_tc_htb(struct net_device *ndev, struct tc_htb_qopt_offload *htb);
> +int otx2_qos_get_qid(struct otx2_nic *pfvf);
> +void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx);
> +int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx);
> +void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx);
> +
> +struct otx2_qos_cfg {
> +	u16 schq[NIX_TXSCH_LVL_CNT];
> +	u16 schq_contig[NIX_TXSCH_LVL_CNT];
> +	int static_node_pos[NIX_TXSCH_LVL_CNT];
> +	int dwrr_node_pos[NIX_TXSCH_LVL_CNT];
> +	u16 schq_contig_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
> +	u16 schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
> +};
>  
>  struct otx2_qos {
> -	       u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
> -	};
> +	DECLARE_HASHTABLE(qos_hlist, order_base_2(OTX2_QOS_MAX_LEAF_NODES));
> +	struct mutex qos_lock; /* child list lock */
> +	u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
> +	struct list_head qos_tree;
> +	DECLARE_BITMAP(qos_sq_bmap, OTX2_QOS_MAX_LEAF_NODES);
> +	u16 maj_id;
> +	u16 defcls;
> +	u8  link_cfg_lvl; /* LINKX_CFG CSRs mapped to TL3 or TL2's index ? */
> +};
> +
> +struct otx2_qos_node {
> +	struct list_head list; /* list managment */
> +	struct list_head child_list;
> +	struct list_head child_schq_list;
> +	struct hlist_node hlist;
> +	DECLARE_BITMAP(prio_bmap, OTX2_QOS_MAX_PRIO + 1);
> +	struct otx2_qos_node *parent;	/* parent qos node */
> +	u64 rate; /* htb params */
> +	u64 ceil;
> +	u32 classid;
> +	u32 prio;
> +	u16 schq; /* hw txschq */
> +	u16 qid;
> +	u16 prio_anchor;
> +	u8 level;
> +};
> +
>  
>  #endif
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> index 1c77f024c360..8a1e89668a1b 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> @@ -225,7 +225,22 @@ static int otx2_qos_ctx_disable(struct otx2_nic *pfvf, u16 qidx, int aura_id)
>  	return otx2_sync_mbox_msg(&pfvf->mbox);
>  }
>  
> -int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
> +int otx2_qos_get_qid(struct otx2_nic *pfvf)
> +{
> +	int qidx;
> +
> +	qidx = find_first_zero_bit(pfvf->qos.qos_sq_bmap,
> +				   pfvf->hw.tc_tx_queues);
> +
> +	return qidx == pfvf->hw.tc_tx_queues ? -ENOSPC : qidx;
> +}
> +
> +void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx)
> +{
> +	clear_bit(qidx, pfvf->qos.qos_sq_bmap);
> +}
> +
> +int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx)
>  {
>  	struct otx2_hw *hw = &pfvf->hw;
>  	int pool_id, sq_idx, err;
> @@ -241,7 +256,6 @@ int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
>  		goto out;
>  
>  	pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx);
> -	pfvf->qos.qid_to_sqmap[qidx] = smq;
>  	err = otx2_sq_init(pfvf, sq_idx, pool_id);
>  	if (err)
>  		goto out;
> @@ -250,7 +264,7 @@ int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
>  	return err;
>  }
>  
> -void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq)
> +void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx)
>  {
>  	struct otx2_qset *qset = &pfvf->qset;
>  	struct otx2_hw *hw = &pfvf->hw;
> -- 
> 2.17.1
>
Hariprasad Kelam March 29, 2023, 4:44 p.m. UTC | #4
> On Sun, 2023-03-26 at 23:42 +0530, Hariprasad Kelam wrote:
> [...]
> > +static int otx2_qos_root_add(struct otx2_nic *pfvf, u16 htb_maj_id, u16
> htb_defcls,
> > +			     struct netlink_ext_ack *extack) {
> > +	struct otx2_qos_cfg *new_cfg;
> > +	struct otx2_qos_node *root;
> > +	int err;
> > +
> > +	netdev_dbg(pfvf->netdev,
> > +		   "TC_HTB_CREATE: handle=0x%x defcls=0x%x\n",
> > +		   htb_maj_id, htb_defcls);
> > +
> > +	INIT_LIST_HEAD(&pfvf->qos.qos_tree);
> > +	mutex_init(&pfvf->qos.qos_lock);
> 
> It's quite strange and error prone dynamically init this mutex and the list
> here. Why don't you do such init ad device creation time?
ACK,  we can safely move this logic in device init.
Will add these changes in next version.
> 
> > +
> > +	root = otx2_qos_alloc_root(pfvf);
> > +	if (IS_ERR(root)) {
> > +		mutex_destroy(&pfvf->qos.qos_lock);
> > +		err = PTR_ERR(root);
> > +		return err;
> > +	}
> > +
> > +	/* allocate txschq queue */
> > +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> > +	if (!new_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> 
> Here the root node is leaked.
ACK, will address this issue in next version.

Thanks,
Hariprasad k
> 
> > +		mutex_destroy(&pfvf->qos.qos_lock);
> > +		return -ENOMEM;
> > +	}
> 
> 
> [...]
> 
> Cheers,
> 
> Paolo
Hariprasad Kelam March 29, 2023, 5:17 p.m. UTC | #5
Thanks for the review, will address issues in next version.

Thanks,
Hariprasad k

> On Sun, Mar 26, 2023 at 11:42:44PM +0530, Hariprasad Kelam wrote:
> > From: Naveen Mamindlapalli <naveenm@marvell.com>
> >
> > This patch registers callbacks to support HTB offload.
> >
> > Below are features supported,
> >
> > - supports traffic shaping on the given class by honoring rate and
> > ceil configuration.
> >
> > - supports traffic scheduling,  which prioritizes different types of
> > traffic based on strict priority values.
> >
> > - supports the creation of leaf to inner classes such that parent node
> > rate limits apply to all child nodes.
> >
> > Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
> > Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
> > Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
> 
> Reviewed-by: Simon Horman <simon.horman@corigine.com>
> 
> > ---
> >  .../ethernet/marvell/octeontx2/af/common.h    |    2 +-
> >  .../ethernet/marvell/octeontx2/nic/Makefile   |    2 +-
> >  .../marvell/octeontx2/nic/otx2_common.c       |   35 +-
> >  .../marvell/octeontx2/nic/otx2_common.h       |    8 +-
> >  .../marvell/octeontx2/nic/otx2_ethtool.c      |   31 +-
> >  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   56 +-
> >  .../ethernet/marvell/octeontx2/nic/otx2_reg.h |   13 +
> >  .../ethernet/marvell/octeontx2/nic/otx2_tc.c  |    7 +-
> >  .../net/ethernet/marvell/octeontx2/nic/qos.c  | 1460
> +++++++++++++++++
> >  .../net/ethernet/marvell/octeontx2/nic/qos.h  |   58 +-
> >  .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   20 +-
> >  11 files changed, 1657 insertions(+), 35 deletions(-)
> 
> nit: this is a rather long patch.
> 
> ...
> 
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> > b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> 
> ...
> 
> > @@ -159,7 +165,7 @@ static void otx2_get_qset_stats(struct otx2_nic
> *pfvf,
> >  				[otx2_queue_stats[stat].index];
> >  	}
> >
> > -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> > +	for (qidx = 0; qidx <  otx2_get_total_tx_queues(pfvf); qidx++) {
> 
> nit: extra whitespace after '<'
> 
> >  		if (!otx2_update_sq_stats(pfvf, qidx)) {
> >  			for (stat = 0; stat < otx2_n_queue_stats; stat++)
> >  				*((*data)++) = 0;
> 
> ...
> 
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> > b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> 
> ...
> 
> > +static int otx2_qos_update_tx_netdev_queues(struct otx2_nic *pfvf) {
> > +	int tx_queues, qos_txqs, err;
> > +	struct otx2_hw *hw = &pfvf->hw;
> 
> nit: reverse xmas tree - longest line to shortest -
>      for local variable declarations.
> 
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> > b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> 
> ...
> 
> > +struct otx2_qos_node {
> > +	struct list_head list; /* list managment */
> 
> nit: s/managment/management/
> 
> > +	struct list_head child_list;
> > +	struct list_head child_schq_list;
> > +	struct hlist_node hlist;
> > +	DECLARE_BITMAP(prio_bmap, OTX2_QOS_MAX_PRIO + 1);
> > +	struct otx2_qos_node *parent;	/* parent qos node */
> > +	u64 rate; /* htb params */
> > +	u64 ceil;
> > +	u32 classid;
> > +	u32 prio;
> > +	u16 schq; /* hw txschq */
> > +	u16 qid;
> > +	u16 prio_anchor;
> > +	u8 level;
> > +};
> > +
> >
> >  #endif
> 
> ...
Hariprasad Kelam March 29, 2023, 6:03 p.m. UTC | #6
Please see inline,

> I have a few comments about concurrency issues, see below. I didn't
> analyze the concurrency model of your driver deeply, so please apologize
> me if I missed some bugs or accidentally called some good code buggy.
> 
> A few general things to pay attention to, regarding HTB offload:
> 
> 1. ndo_select_queue can be called at any time, there is no reliable way
> to prevent the kernel from calling it, that means that ndo_select_queue
> must not crash if HTB configuration and structures are being updated in
> another thread.
> 
> 2. ndo_start_xmit runs in an RCU read lock. If you need to release some
> structure that can be used from another thread in the TX datapath, you
> can set some atomic flag, synchronize with RCU, then release the object.
> 
> 3. You can take some inspiration from mlx5e, although you may find it's
> a convoluted cooperation of spinlocks, mutexes, atomic operations with
> barriers, and RCU. A big part of it is related to the mechanism of safe
> reopening of queues, which your driver may not need, but the remaining
> parts have a lot of similarities, so you can find useful insights about
> the locking for HTB in mlx5e.
> 
> On Sun, Mar 26, 2023 at 11:42:44PM +0530, Hariprasad Kelam wrote:
> > From: Naveen Mamindlapalli <naveenm@marvell.com>
> >
> > This patch registers callbacks to support HTB offload.
> >
> > Below are features supported,
> >
> > - supports traffic shaping on the given class by honoring rate and ceil
> > configuration.
> >
> > - supports traffic scheduling,  which prioritizes different types of
> > traffic based on strict priority values.
> >
> > - supports the creation of leaf to inner classes such that parent node
> > rate limits apply to all child nodes.
> >
> > Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
> > Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
> > Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
> > ---
> >  .../ethernet/marvell/octeontx2/af/common.h    |    2 +-
> >  .../ethernet/marvell/octeontx2/nic/Makefile   |    2 +-
> >  .../marvell/octeontx2/nic/otx2_common.c       |   35 +-
> >  .../marvell/octeontx2/nic/otx2_common.h       |    8 +-
> >  .../marvell/octeontx2/nic/otx2_ethtool.c      |   31 +-
> >  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   56 +-
> >  .../ethernet/marvell/octeontx2/nic/otx2_reg.h |   13 +
> >  .../ethernet/marvell/octeontx2/nic/otx2_tc.c  |    7 +-
> >  .../net/ethernet/marvell/octeontx2/nic/qos.c  | 1460
> +++++++++++++++++
> >  .../net/ethernet/marvell/octeontx2/nic/qos.h  |   58 +-
> >  .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   20 +-
> >  11 files changed, 1657 insertions(+), 35 deletions(-)
> >  create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> >
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h
> b/drivers/net/ethernet/marvell/octeontx2/af/common.h
> > index 8931864ee110..f5bf719a6ccf 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
> > +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
> > @@ -142,7 +142,7 @@ enum nix_scheduler {
> >
> >  #define TXSCH_RR_QTM_MAX		((1 << 24) - 1)
> >  #define TXSCH_TL1_DFLT_RR_QTM		TXSCH_RR_QTM_MAX
> > -#define TXSCH_TL1_DFLT_RR_PRIO		(0x1ull)
> > +#define TXSCH_TL1_DFLT_RR_PRIO		(0x7ull)
> >  #define CN10K_MAX_DWRR_WEIGHT          16384 /* Weight is 14bit on
> CN10K */
> >
> >  /* Min/Max packet sizes, excluding FCS */
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> > index 3d31ddf7c652..5664f768cb0c 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> > @@ -8,7 +8,7 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o
> otx2_ptp.o
> >
> >  rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
> >                 otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
> > -               otx2_devlink.o qos_sq.o
> > +               otx2_devlink.o qos_sq.o qos.o
> >  rvu_nicvf-y := otx2_vf.o otx2_devlink.o
> >
> >  rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> > index 32c02a2d3554..b4542a801291 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> > @@ -89,6 +89,11 @@ int otx2_update_sq_stats(struct otx2_nic *pfvf, int
> qidx)
> >  	if (!pfvf->qset.sq)
> >  		return 0;
> >
> > +	if (qidx >= pfvf->hw.non_qos_queues) {
> > +		if (!test_bit(qidx - pfvf->hw.non_qos_queues, pfvf-
> >qos.qos_sq_bmap))
> > +			return 0;
> > +	}
> > +
> >  	otx2_nix_sq_op_stats(&sq->stats, pfvf, qidx);
> >  	return 1;
> >  }
> > @@ -747,29 +752,47 @@ int otx2_txsch_alloc(struct otx2_nic *pfvf)
> >  	return 0;
> >  }
> >
> > -int otx2_txschq_stop(struct otx2_nic *pfvf)
> > +void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq)
> >  {
> >  	struct nix_txsch_free_req *free_req;
> > -	int lvl, schq, err;
> > +	int err;
> >
> >  	mutex_lock(&pfvf->mbox.lock);
> > -	/* Free the transmit schedulers */
> > +
> >  	free_req = otx2_mbox_alloc_msg_nix_txsch_free(&pfvf->mbox);
> >  	if (!free_req) {
> >  		mutex_unlock(&pfvf->mbox.lock);
> > -		return -ENOMEM;
> > +		netdev_err(pfvf->netdev,
> > +			   "Failed alloc txschq free req\n");
> > +		return;
> >  	}
> >
> > -	free_req->flags = TXSCHQ_FREE_ALL;
> > +	free_req->schq_lvl = lvl;
> > +	free_req->schq = schq;
> > +
> >  	err = otx2_sync_mbox_msg(&pfvf->mbox);
> > +	if (err) {
> > +		netdev_err(pfvf->netdev,
> > +			   "Failed stop txschq %d at level %d\n", lvl, schq);
> > +	}
> > +
> >  	mutex_unlock(&pfvf->mbox.lock);
> > +}
> > +
> > +void otx2_txschq_stop(struct otx2_nic *pfvf)
> > +{
> > +	int lvl, schq;
> > +
> > +	/* free non QOS TLx nodes */
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++)
> > +		otx2_txschq_free_one(pfvf, lvl,
> > +				     pfvf->hw.txschq_list[lvl][0]);
> >
> >  	/* Clear the txschq list */
> >  	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> >  		for (schq = 0; schq < MAX_TXSCHQ_PER_FUNC; schq++)
> >  			pfvf->hw.txschq_list[lvl][schq] = 0;
> >  	}
> > -	return err;
> >  }
> >
> >  void otx2_sqb_flush(struct otx2_nic *pfvf)
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> > index 3834cc447426..4b219e8e5b32 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> > @@ -252,6 +252,7 @@ struct otx2_hw {
> >  #define CN10K_RPM		3
> >  #define CN10K_PTP_ONESTEP	4
> >  #define CN10K_HW_MACSEC		5
> > +#define QOS_CIR_PIR_SUPPORT	6
> >  	unsigned long		cap_flag;
> >
> >  #define LMT_LINE_SIZE		128
> > @@ -586,6 +587,7 @@ static inline void
> otx2_setup_dev_hw_settings(struct otx2_nic *pfvf)
> >  		__set_bit(CN10K_LMTST, &hw->cap_flag);
> >  		__set_bit(CN10K_RPM, &hw->cap_flag);
> >  		__set_bit(CN10K_PTP_ONESTEP, &hw->cap_flag);
> > +		__set_bit(QOS_CIR_PIR_SUPPORT, &hw->cap_flag);
> >  	}
> >
> >  	if (is_dev_cn10kb(pfvf->pdev))
> > @@ -935,7 +937,7 @@ int otx2_config_nix(struct otx2_nic *pfvf);
> >  int otx2_config_nix_queues(struct otx2_nic *pfvf);
> >  int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en);
> >  int otx2_txsch_alloc(struct otx2_nic *pfvf);
> > -int otx2_txschq_stop(struct otx2_nic *pfvf);
> > +void otx2_txschq_stop(struct otx2_nic *pfvf);
> >  void otx2_sqb_flush(struct otx2_nic *pfvf);
> >  int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> >  		    dma_addr_t *dma);
> > @@ -953,6 +955,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16
> pool_id,
> >  		   int stack_pages, int numptrs, int buf_size);
> >  int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
> >  		   int pool_id, int numptrs);
> > +void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
> >
> >  /* RSS configuration APIs*/
> >  int otx2_rss_init(struct otx2_nic *pfvf);
> > @@ -1064,4 +1067,7 @@ static inline void cn10k_handle_mcs_event(struct
> otx2_nic *pfvf,
> >  void otx2_qos_sq_setup(struct otx2_nic *pfvf, int qos_txqs);
> >  u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
> >  		      struct net_device *sb_dev);
> > +int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid);
> > +void otx2_qos_config_txschq(struct otx2_nic *pfvf);
> > +void otx2_clean_qos_queues(struct otx2_nic *pfvf);
> >  #endif /* OTX2_COMMON_H */
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> > index 0f8d1a69139f..e8722a4f4cc6 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
> > @@ -92,10 +92,16 @@ static void otx2_get_qset_strings(struct otx2_nic
> *pfvf, u8 **data, int qset)
> >  			*data += ETH_GSTRING_LEN;
> >  		}
> >  	}
> > -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> > +
> > +	for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
> >  		for (stats = 0; stats < otx2_n_queue_stats; stats++) {
> > -			sprintf(*data, "txq%d: %s", qidx + start_qidx,
> > -				otx2_queue_stats[stats].name);
> > +			if (qidx >= pfvf->hw.non_qos_queues)
> > +				sprintf(*data, "txq_qos%d: %s",
> > +					qidx + start_qidx - pfvf-
> >hw.non_qos_queues,
> > +					otx2_queue_stats[stats].name);
> > +			else
> > +				sprintf(*data, "txq%d: %s", qidx + start_qidx,
> > +					otx2_queue_stats[stats].name);
> >  			*data += ETH_GSTRING_LEN;
> >  		}
> >  	}
> > @@ -159,7 +165,7 @@ static void otx2_get_qset_stats(struct otx2_nic
> *pfvf,
> >  				[otx2_queue_stats[stat].index];
> >  	}
> >
> > -	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
> > +	for (qidx = 0; qidx <  otx2_get_total_tx_queues(pfvf); qidx++) {
> >  		if (!otx2_update_sq_stats(pfvf, qidx)) {
> >  			for (stat = 0; stat < otx2_n_queue_stats; stat++)
> >  				*((*data)++) = 0;
> > @@ -254,7 +260,8 @@ static int otx2_get_sset_count(struct net_device
> *netdev, int sset)
> >  		return -EINVAL;
> >
> >  	qstats_count = otx2_n_queue_stats *
> > -		       (pfvf->hw.rx_queues + pfvf->hw.tx_queues);
> > +		       (pfvf->hw.rx_queues + pfvf->hw.non_qos_queues +
> > +			pfvf->hw.tc_tx_queues);
> >  	if (!test_bit(CN10K_RPM, &pfvf->hw.cap_flag))
> >  		mac_stats = CGX_RX_STATS_COUNT +
> CGX_TX_STATS_COUNT;
> >  	otx2_update_lmac_fec_stats(pfvf);
> > @@ -282,7 +289,7 @@ static int otx2_set_channels(struct net_device
> *dev,
> >  {
> >  	struct otx2_nic *pfvf = netdev_priv(dev);
> >  	bool if_up = netif_running(dev);
> > -	int err = 0;
> > +	int err, qos_txqs;
> >
> >  	if (!channel->rx_count || !channel->tx_count)
> >  		return -EINVAL;
> > @@ -296,14 +303,19 @@ static int otx2_set_channels(struct net_device
> *dev,
> >  	if (if_up)
> >  		dev->netdev_ops->ndo_stop(dev);
> >
> > -	err = otx2_set_real_num_queues(dev, channel->tx_count,
> > +	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
> > +				 OTX2_QOS_MAX_LEAF_NODES);
> > +
> > +	err = otx2_set_real_num_queues(dev, channel->tx_count +
> qos_txqs,
> >  				       channel->rx_count);
> >  	if (err)
> >  		return err;
> >
> >  	pfvf->hw.rx_queues = channel->rx_count;
> >  	pfvf->hw.tx_queues = channel->tx_count;
> > -	pfvf->qset.cq_cnt = pfvf->hw.tx_queues +  pfvf->hw.rx_queues;
> > +	if (pfvf->xdp_prog)
> > +		pfvf->hw.xdp_queues = channel->rx_count;
> > +	pfvf->hw.non_qos_queues =  pfvf->hw.tx_queues + pfvf-
> >hw.xdp_queues;
> >
> >  	if (if_up)
> >  		err = dev->netdev_ops->ndo_open(dev);
> > @@ -1405,7 +1417,8 @@ static int otx2vf_get_sset_count(struct
> net_device *netdev, int sset)
> >  		return -EINVAL;
> >
> >  	qstats_count = otx2_n_queue_stats *
> > -		       (vf->hw.rx_queues + vf->hw.tx_queues);
> > +		       (vf->hw.rx_queues + vf->hw.tx_queues +
> > +			vf->hw.tc_tx_queues);
> >
> >  	return otx2_n_dev_stats + otx2_n_drv_stats + qstats_count + 1;
> >  }
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> > index a32f0cb89fc4..d0192f9089ee 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> > @@ -1387,6 +1387,9 @@ static void otx2_free_sq_res(struct otx2_nic *pf)
> >  	otx2_sq_free_sqbs(pf);
> >  	for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) {
> >  		sq = &qset->sq[qidx];
> > +		/* Skip freeing Qos queues if they are not initialized */
> > +		if (!sq->sqb_count)
> > +			continue;
> >  		qmem_free(pf->dev, sq->sqe);
> >  		qmem_free(pf->dev, sq->tso_hdrs);
> >  		kfree(sq->sg);
> > @@ -1518,8 +1521,7 @@ static int otx2_init_hw_resources(struct otx2_nic
> *pf)
> >  	otx2_free_cq_res(pf);
> >  	otx2_ctx_disable(mbox, NIX_AQ_CTYPE_RQ, false);
> >  err_free_txsch:
> > -	if (otx2_txschq_stop(pf))
> > -		dev_err(pf->dev, "%s failed to stop TX schedulers\n",
> __func__);
> > +	otx2_txschq_stop(pf);
> >  err_free_sq_ptrs:
> >  	otx2_sq_free_sqbs(pf);
> >  err_free_rq_ptrs:
> > @@ -1554,21 +1556,21 @@ static void otx2_free_hw_resources(struct
> otx2_nic *pf)
> >  	struct mbox *mbox = &pf->mbox;
> >  	struct otx2_cq_queue *cq;
> >  	struct msg_req *req;
> > -	int qidx, err;
> > +	int qidx;
> >
> >  	/* Ensure all SQE are processed */
> >  	otx2_sqb_flush(pf);
> >
> >  	/* Stop transmission */
> > -	err = otx2_txschq_stop(pf);
> > -	if (err)
> > -		dev_err(pf->dev, "RVUPF: Failed to stop/free TX
> schedulers\n");
> > +	otx2_txschq_stop(pf);
> >
> >  #ifdef CONFIG_DCB
> >  	if (pf->pfc_en)
> >  		otx2_pfc_txschq_stop(pf);
> >  #endif
> >
> > +	otx2_clean_qos_queues(pf);
> > +
> >  	mutex_lock(&mbox->lock);
> >  	/* Disable backpressure */
> >  	if (!(pf->pcifunc & RVU_PFVF_FUNC_MASK))
> > @@ -1836,6 +1838,9 @@ int otx2_open(struct net_device *netdev)
> >  	/* 'intf_down' may be checked on any cpu */
> >  	smp_wmb();
> >
> > +	/* Enable QoS configuration before starting tx queues */
> > +	otx2_qos_config_txschq(pf);
> > +
> >  	/* we have already received link status notification */
> >  	if (pf->linfo.link_up && !(pf->pcifunc & RVU_PFVF_FUNC_MASK))
> >  		otx2_handle_link_event(pf);
> > @@ -1980,14 +1985,45 @@ static netdev_tx_t otx2_xmit(struct sk_buff
> *skb, struct net_device *netdev)
> >  	return NETDEV_TX_OK;
> >  }
> >
> > +static int otx2_qos_select_htb_queue(struct otx2_nic *pf, struct sk_buff
> *skb,
> > +				     u16 htb_maj_id)
> > +{
> > +	u16 classid;
> > +
> > +	if ((TC_H_MAJ(skb->priority) >> 16) == htb_maj_id)
> > +		classid = TC_H_MIN(skb->priority);
> > +	else
> > +		classid = READ_ONCE(pf->qos.defcls);
> > +
> > +	if (!classid)
> > +		return 0;
> > +
> > +	return otx2_get_txq_by_classid(pf, classid);
> 
> This selects queues with numbers >= pf->hw.tx_queues, and otx2_xmit
> indexes pfvf->qset.sq with these qids, however, pfvf->qset.sq is
> allocated only up to pf->hw.non_qos_queues. Array out-of-bounds?
> 

We are supposed to allocated all Sqs (non_qos_queues + tc_tx_queues). 
Looks like we missed this change in refactoring send queue code,
Will update this change.
> > +}
> > +
> >  u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
> >  		      struct net_device *sb_dev)
> >  {
> > -#ifdef CONFIG_DCB
> >  	struct otx2_nic *pf = netdev_priv(netdev);
> > +	bool qos_enabled;
> > +#ifdef CONFIG_DCB
> >  	u8 vlan_prio;
> >  #endif
> > +	int txq;
> >
> > +	qos_enabled = (netdev->real_num_tx_queues > pf-
> >hw.tx_queues) ? true : false;
> > +	if (unlikely(qos_enabled)) {
> > +		u16 htb_maj_id = smp_load_acquire(&pf->qos.maj_id); /*
> barrier */
> 
> Checkpatch requires to add comments for the barriers for a reason :)
> 
> "Barrier" is a useless comment, we all know that smp_load_acquire is a
> barrier, you should explain why this barrier is needed and which other
> barriers it pairs with.
> 
ACK, will update the details in next version.
> > +
> > +		if (unlikely(htb_maj_id)) {
> > +			txq = otx2_qos_select_htb_queue(pf, skb,
> htb_maj_id);
> > +			if (txq > 0)
> > +				return txq;
> > +			goto process_pfc;
> > +		}
> > +	}
> > +
> > +process_pfc:
> >  #ifdef CONFIG_DCB
> >  	if (!skb_vlan_tag_present(skb))
> >  		goto pick_tx;
> > @@ -2001,7 +2037,11 @@ u16 otx2_select_queue(struct net_device
> *netdev, struct sk_buff *skb,
> >
> >  pick_tx:
> >  #endif
> > -	return netdev_pick_tx(netdev, skb, NULL);
> > +	txq = netdev_pick_tx(netdev, skb, NULL);
> > +	if (unlikely(qos_enabled))
> > +		return txq % pf->hw.tx_queues;
> > +
> > +	return txq;
> >  }
> >  EXPORT_SYMBOL(otx2_select_queue);
> >
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> > index 1b967eaf948b..45a32e4b49d1 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
> > @@ -145,12 +145,25 @@
> >  #define NIX_AF_TL1X_TOPOLOGY(a)		(0xC80 | (a) << 16)
> >  #define NIX_AF_TL2X_PARENT(a)		(0xE88 | (a) << 16)
> >  #define NIX_AF_TL2X_SCHEDULE(a)		(0xE00 | (a) << 16)
> > +#define NIX_AF_TL2X_TOPOLOGY(a)		(0xE80 | (a) << 16)
> > +#define NIX_AF_TL2X_CIR(a)              (0xE20 | (a) << 16)
> > +#define NIX_AF_TL2X_PIR(a)              (0xE30 | (a) << 16)
> >  #define NIX_AF_TL3X_PARENT(a)		(0x1088 | (a) << 16)
> >  #define NIX_AF_TL3X_SCHEDULE(a)		(0x1000 | (a) << 16)
> > +#define NIX_AF_TL3X_SHAPE(a)		(0x1010 | (a) << 16)
> > +#define NIX_AF_TL3X_CIR(a)		(0x1020 | (a) << 16)
> > +#define NIX_AF_TL3X_PIR(a)		(0x1030 | (a) << 16)
> > +#define NIX_AF_TL3X_TOPOLOGY(a)		(0x1080 | (a) << 16)
> >  #define NIX_AF_TL4X_PARENT(a)		(0x1288 | (a) << 16)
> >  #define NIX_AF_TL4X_SCHEDULE(a)		(0x1200 | (a) << 16)
> > +#define NIX_AF_TL4X_SHAPE(a)		(0x1210 | (a) << 16)
> > +#define NIX_AF_TL4X_CIR(a)		(0x1220 | (a) << 16)
> >  #define NIX_AF_TL4X_PIR(a)		(0x1230 | (a) << 16)
> > +#define NIX_AF_TL4X_TOPOLOGY(a)		(0x1280 | (a) << 16)
> >  #define NIX_AF_MDQX_SCHEDULE(a)		(0x1400 | (a) << 16)
> > +#define NIX_AF_MDQX_SHAPE(a)		(0x1410 | (a) << 16)
> > +#define NIX_AF_MDQX_CIR(a)		(0x1420 | (a) << 16)
> > +#define NIX_AF_MDQX_PIR(a)		(0x1430 | (a) << 16)
> >  #define NIX_AF_MDQX_PARENT(a)		(0x1480 | (a) << 16)
> >  #define NIX_AF_TL3_TL2X_LINKX_CFG(a, b)	(0x1700 | (a) << 16 | (b) << 3)
> >
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> > index 044cc211424e..42c49249f4e7 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
> > @@ -19,6 +19,7 @@
> >
> >  #include "cn10k.h"
> >  #include "otx2_common.h"
> > +#include "qos.h"
> >
> >  /* Egress rate limiting definitions */
> >  #define MAX_BURST_EXPONENT		0x0FULL
> > @@ -147,8 +148,8 @@ static void otx2_get_egress_rate_cfg(u64 maxrate,
> u32 *exp,
> >  	}
> >  }
> >
> > -static u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
> > -				       u64 maxrate, u32 burst)
> > +u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
> > +				u64 maxrate, u32 burst)
> >  {
> >  	u32 burst_exp, burst_mantissa;
> >  	u32 exp, mantissa, div_exp;
> > @@ -1127,6 +1128,8 @@ int otx2_setup_tc(struct net_device *netdev,
> enum tc_setup_type type,
> >  	switch (type) {
> >  	case TC_SETUP_BLOCK:
> >  		return otx2_setup_tc_block(netdev, type_data);
> > +	case TC_SETUP_QDISC_HTB:
> > +		return otx2_setup_tc_htb(netdev, type_data);
> >  	default:
> >  		return -EOPNOTSUPP;
> >  	}
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> > new file mode 100644
> > index 000000000000..22c5b6a2871a
> > --- /dev/null
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
> > @@ -0,0 +1,1460 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Marvell RVU Ethernet driver
> > + *
> > + * Copyright (C) 2023 Marvell.
> > + *
> > + */
> > +#include <linux/netdevice.h>
> > +#include <linux/etherdevice.h>
> > +#include <linux/inetdevice.h>
> > +#include <linux/bitfield.h>
> > +
> > +#include "otx2_common.h"
> > +#include "cn10k.h"
> > +#include "qos.h"
> > +
> > +#define OTX2_QOS_QID_INNER		0xFFFFU
> > +#define OTX2_QOS_QID_NONE		0xFFFEU
> > +#define OTX2_QOS_ROOT_CLASSID		0xFFFFFFFF
> > +#define OTX2_QOS_CLASS_NONE		0
> > +#define OTX2_QOS_DEFAULT_PRIO		0xF
> > +#define OTX2_QOS_INVALID_SQ		0xFFFF
> > +
> > +/* Egress rate limiting definitions */
> > +#define MAX_BURST_EXPONENT		0x0FULL
> > +#define MAX_BURST_MANTISSA		0xFFULL
> > +#define MAX_BURST_SIZE			130816ULL
> > +#define MAX_RATE_DIVIDER_EXPONENT	12ULL
> > +#define MAX_RATE_EXPONENT		0x0FULL
> > +#define MAX_RATE_MANTISSA		0xFFULL
> > +
> > +/* Bitfields in NIX_TLX_PIR register */
> > +#define TLX_RATE_MANTISSA		GENMASK_ULL(8, 1)
> > +#define TLX_RATE_EXPONENT		GENMASK_ULL(12, 9)
> > +#define TLX_RATE_DIVIDER_EXPONENT	GENMASK_ULL(16, 13)
> > +#define TLX_BURST_MANTISSA		GENMASK_ULL(36, 29)
> > +#define TLX_BURST_EXPONENT		GENMASK_ULL(40, 37)
> > +
> > +static int otx2_qos_update_tx_netdev_queues(struct otx2_nic *pfvf)
> > +{
> > +	int tx_queues, qos_txqs, err;
> > +	struct otx2_hw *hw = &pfvf->hw;
> > +
> > +	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
> > +				 OTX2_QOS_MAX_LEAF_NODES);
> > +
> > +	tx_queues = hw->tx_queues + qos_txqs;
> > +
> > +	err = netif_set_real_num_tx_queues(pfvf->netdev, tx_queues);
> > +	if (err) {
> > +		netdev_err(pfvf->netdev,
> > +			   "Failed to set no of Tx queues: %d\n", tx_queues);
> > +		return err;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static u64 otx2_qos_convert_rate(u64 rate)
> > +{
> > +	u64 converted_rate;
> > +
> > +	/* convert bytes per second to Mbps */
> > +	converted_rate = rate * 8;
> > +	converted_rate = max_t(u64, converted_rate / 1000000, 1);
> > +
> > +	return converted_rate;
> > +}
> > +
> > +static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf,
> > +				  struct otx2_qos_node *node,
> > +				  struct nix_txschq_config *cfg)
> > +{
> > +	struct otx2_hw *hw = &pfvf->hw;
> > +	int num_regs = 0;
> > +	u64 maxrate;
> > +	u8 level;
> > +
> > +	level = node->level;
> > +
> > +	/* program txschq registers */
> > +	if (level == NIX_TXSCH_LVL_SMQ) {
> > +		cfg->reg[num_regs] = NIX_AF_SMQX_CFG(node->schq);
> > +		cfg->regval[num_regs] = ((u64)pfvf->tx_max_pktlen << 8) |
> > +					OTX2_MIN_MTU;
> > +		cfg->regval[num_regs] |= (0x20ULL << 51) | (0x80ULL << 39)
> |
> > +					 (0x2ULL << 36);
> > +		num_regs++;
> > +
> > +		/* configure parent txschq */
> > +		cfg->reg[num_regs] = NIX_AF_MDQX_PARENT(node-
> >schq);
> > +		cfg->regval[num_regs] = node->parent->schq << 16;
> > +		num_regs++;
> > +
> > +		/* configure prio/quantum */
> > +		if (node->qid == OTX2_QOS_QID_NONE) {
> > +			cfg->reg[num_regs] =
> NIX_AF_MDQX_SCHEDULE(node->schq);
> > +			cfg->regval[num_regs] =  node->prio << 24 |
> > +						 mtu_to_dwrr_weight(pfvf,
> > +								    pfvf-
> >tx_max_pktlen);
> > +			num_regs++;
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		/* configure prio */
> > +		cfg->reg[num_regs] = NIX_AF_MDQX_SCHEDULE(node-
> >schq);
> > +		cfg->regval[num_regs] = (node->schq -
> > +					 node->parent->prio_anchor) << 24;
> > +		num_regs++;
> > +
> > +		/* configure PIR */
> > +		maxrate = (node->rate > node->ceil) ? node->rate : node-
> >ceil;
> > +
> > +		cfg->reg[num_regs] = NIX_AF_MDQX_PIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> > +		num_regs++;
> > +
> > +		/* configure CIR */
> > +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> > +			/* Don't configure CIR when both CIR+PIR not
> supported
> > +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes
> deadlock
> > +			 */
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		cfg->reg[num_regs] = NIX_AF_MDQX_CIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, node->rate,
> 65536);
> > +		num_regs++;
> > +	} else if (level == NIX_TXSCH_LVL_TL4) {
> > +		/* configure parent txschq */
> > +		cfg->reg[num_regs] = NIX_AF_TL4X_PARENT(node->schq);
> > +		cfg->regval[num_regs] = node->parent->schq << 16;
> > +		num_regs++;
> > +
> > +		/* return if not htb node */
> > +		if (node->qid == OTX2_QOS_QID_NONE) {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL4X_SCHEDULE(node->schq);
> > +			cfg->regval[num_regs] =  node->prio << 24 |
> > +						 mtu_to_dwrr_weight(pfvf,
> > +								    pfvf-
> >tx_max_pktlen);
> > +			num_regs++;
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		/* configure priority */
> > +		cfg->reg[num_regs] = NIX_AF_TL4X_SCHEDULE(node-
> >schq);
> > +		cfg->regval[num_regs] = (node->schq -
> > +					 node->parent->prio_anchor) << 24;
> > +		num_regs++;
> > +
> > +		/* configure PIR */
> > +		maxrate = (node->rate > node->ceil) ? node->rate : node-
> >ceil;
> > +		cfg->reg[num_regs] = NIX_AF_TL4X_PIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> > +		num_regs++;
> > +
> > +		/* configure CIR */
> > +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> > +			/* Don't configure CIR when both CIR+PIR not
> supported
> > +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes
> deadlock
> > +			 */
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		cfg->reg[num_regs] = NIX_AF_TL4X_CIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, node->rate,
> 65536);
> > +		num_regs++;
> > +	} else if (level == NIX_TXSCH_LVL_TL3) {
> > +		/* configure parent txschq */
> > +		cfg->reg[num_regs] = NIX_AF_TL3X_PARENT(node->schq);
> > +		cfg->regval[num_regs] = node->parent->schq << 16;
> > +		num_regs++;
> > +
> > +		/* configure link cfg */
> > +		if (level == pfvf->qos.link_cfg_lvl) {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
> > +			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
> > +			num_regs++;
> > +		}
> > +
> > +		/* return if not htb node */
> > +		if (node->qid == OTX2_QOS_QID_NONE) {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL3X_SCHEDULE(node->schq);
> > +			cfg->regval[num_regs] =  node->prio << 24 |
> > +						 mtu_to_dwrr_weight(pfvf,
> > +								    pfvf-
> >tx_max_pktlen);
> > +			num_regs++;
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		/* configure priority */
> > +		cfg->reg[num_regs] = NIX_AF_TL3X_SCHEDULE(node-
> >schq);
> > +		cfg->regval[num_regs] = (node->schq -
> > +					 node->parent->prio_anchor) << 24;
> > +		num_regs++;
> > +
> > +		/* configure PIR */
> > +		maxrate = (node->rate > node->ceil) ? node->rate : node-
> >ceil;
> > +		cfg->reg[num_regs] = NIX_AF_TL3X_PIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> > +		num_regs++;
> > +
> > +		/* configure CIR */
> > +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> > +			/* Don't configure CIR when both CIR+PIR not
> supported
> > +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes
> deadlock
> > +			 */
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		cfg->reg[num_regs] = NIX_AF_TL3X_CIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, node->rate,
> 65536);
> > +		num_regs++;
> > +	} else if (level == NIX_TXSCH_LVL_TL2) {
> > +		/* configure parent txschq */
> > +		cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq);
> > +		cfg->regval[num_regs] = hw->tx_link << 16;
> > +		num_regs++;
> > +
> > +		/* configure link cfg */
> > +		if (level == pfvf->qos.link_cfg_lvl) {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
> > +			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
> > +			num_regs++;
> > +		}
> > +
> > +		/* return if not htb node */
> > +		if (node->qid == OTX2_QOS_QID_NONE) {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL2X_SCHEDULE(node->schq);
> > +			cfg->regval[num_regs] =  node->prio << 24 |
> > +						 mtu_to_dwrr_weight(pfvf,
> > +								    pfvf-
> >tx_max_pktlen);
> > +			num_regs++;
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		/* check if node is root */
> > +		if (node->qid == OTX2_QOS_QID_INNER && !node->parent)
> {
> > +			cfg->reg[num_regs] =
> NIX_AF_TL2X_SCHEDULE(node->schq);
> > +			cfg->regval[num_regs] =  TXSCH_TL1_DFLT_RR_PRIO
> << 24 |
> > +						 mtu_to_dwrr_weight(pfvf,
> > +								    pfvf-
> >tx_max_pktlen);
> > +			num_regs++;
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		/* configure priority/quantum */
> > +		cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node-
> >schq);
> > +		cfg->regval[num_regs] = (node->schq -
> > +					 node->parent->prio_anchor) << 24;
> > +		num_regs++;
> > +
> > +		/* configure PIR */
> > +		maxrate = (node->rate > node->ceil) ? node->rate : node-
> >ceil;
> > +		cfg->reg[num_regs] = NIX_AF_TL2X_PIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
> > +		num_regs++;
> > +
> > +		/* configure CIR */
> > +		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
> > +			/* Don't configure CIR when both CIR+PIR not
> supported
> > +			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes
> deadlock
> > +			 */
> > +			goto txschq_cfg_out;
> > +		}
> > +
> > +		cfg->reg[num_regs] = NIX_AF_TL2X_CIR(node->schq);
> > +		cfg->regval[num_regs] =
> > +			otx2_get_txschq_rate_regval(pfvf, node->rate,
> 65536);
> > +		num_regs++;
> > +	}
> > +
> > +txschq_cfg_out:
> > +	cfg->num_regs = num_regs;
> > +}
> > +
> > +static int otx2_qos_txschq_set_parent_topology(struct otx2_nic *pfvf,
> > +					       struct otx2_qos_node *parent)
> > +{
> > +	struct mbox *mbox = &pfvf->mbox;
> > +	struct nix_txschq_config *cfg;
> > +	int rc;
> > +
> > +	if (parent->level == NIX_TXSCH_LVL_MDQ)
> > +		return 0;
> > +
> > +	mutex_lock(&mbox->lock);
> > +
> > +	cfg = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
> > +	if (!cfg) {
> > +		mutex_unlock(&mbox->lock);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	cfg->lvl = parent->level;
> > +
> > +	if (parent->level == NIX_TXSCH_LVL_TL4)
> > +		cfg->reg[0] = NIX_AF_TL4X_TOPOLOGY(parent->schq);
> > +	else if (parent->level == NIX_TXSCH_LVL_TL3)
> > +		cfg->reg[0] = NIX_AF_TL3X_TOPOLOGY(parent->schq);
> > +	else if (parent->level == NIX_TXSCH_LVL_TL2)
> > +		cfg->reg[0] = NIX_AF_TL2X_TOPOLOGY(parent->schq);
> > +	else if (parent->level == NIX_TXSCH_LVL_TL1)
> > +		cfg->reg[0] = NIX_AF_TL1X_TOPOLOGY(parent->schq);
> > +
> > +	cfg->regval[0] = (u64)parent->prio_anchor << 32;
> > +	if (parent->level == NIX_TXSCH_LVL_TL1)
> > +		cfg->regval[0] |= (u64)TXSCH_TL1_DFLT_RR_PRIO << 1;
> > +
> > +	cfg->num_regs++;
> > +
> > +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> > +
> > +	mutex_unlock(&mbox->lock);
> > +
> > +	return rc;
> > +}
> > +
> > +static void otx2_qos_free_hw_node_schq(struct otx2_nic *pfvf,
> > +				       struct otx2_qos_node *parent)
> > +{
> > +	struct otx2_qos_node *node;
> > +
> > +	list_for_each_entry_reverse(node, &parent->child_schq_list, list)
> > +		otx2_txschq_free_one(pfvf, node->level, node->schq);
> > +}
> > +
> > +static void otx2_qos_free_hw_node(struct otx2_nic *pfvf,
> > +				  struct otx2_qos_node *parent)
> > +{
> > +	struct otx2_qos_node *node, *tmp;
> > +
> > +	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
> > +		otx2_qos_free_hw_node(pfvf, node);
> > +		otx2_qos_free_hw_node_schq(pfvf, node);
> > +		otx2_txschq_free_one(pfvf, node->level, node->schq);
> > +	}
> > +}
> > +
> > +static void otx2_qos_free_hw_cfg(struct otx2_nic *pfvf,
> > +				 struct otx2_qos_node *node)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +
> > +	/* free child node hw mappings */
> > +	otx2_qos_free_hw_node(pfvf, node);
> > +	otx2_qos_free_hw_node_schq(pfvf, node);
> > +
> > +	/* free node hw mappings */
> > +	otx2_txschq_free_one(pfvf, node->level, node->schq);
> > +
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static void otx2_qos_sw_node_delete(struct otx2_nic *pfvf,
> > +				    struct otx2_qos_node *node)
> > +{
> > +	hash_del(&node->hlist);
> > +
> > +	if (node->qid != OTX2_QOS_QID_INNER && node->qid !=
> OTX2_QOS_QID_NONE) {
> > +		__clear_bit(node->qid, pfvf->qos.qos_sq_bmap);
> > +		otx2_qos_update_tx_netdev_queues(pfvf);
> > +	}
> > +
> > +	list_del(&node->list);
> > +	kfree(node);
> > +}
> > +
> > +static void otx2_qos_free_sw_node_schq(struct otx2_nic *pfvf,
> > +				       struct otx2_qos_node *parent)
> > +{
> > +	struct otx2_qos_node *node, *tmp;
> > +
> > +	list_for_each_entry_safe(node, tmp, &parent->child_schq_list, list) {
> > +		list_del(&node->list);
> > +		kfree(node);
> > +	}
> > +}
> > +
> > +static void __otx2_qos_free_sw_node(struct otx2_nic *pfvf,
> > +				    struct otx2_qos_node *parent)
> > +{
> > +	struct otx2_qos_node *node, *tmp;
> > +
> > +	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
> > +		__otx2_qos_free_sw_node(pfvf, node);
> > +		otx2_qos_free_sw_node_schq(pfvf, node);
> > +		otx2_qos_sw_node_delete(pfvf, node);
> > +	}
> > +}
> > +
> > +static void otx2_qos_free_sw_node(struct otx2_nic *pfvf,
> > +				  struct otx2_qos_node *node)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +
> > +	__otx2_qos_free_sw_node(pfvf, node);
> > +	otx2_qos_free_sw_node_schq(pfvf, node);
> > +	otx2_qos_sw_node_delete(pfvf, node);
> > +
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static void otx2_qos_destroy_node(struct otx2_nic *pfvf,
> > +				  struct otx2_qos_node *node)
> > +{
> > +	otx2_qos_free_hw_cfg(pfvf, node);
> > +	otx2_qos_free_sw_node(pfvf, node);
> > +}
> > +
> > +static void otx2_qos_fill_cfg_schq(struct otx2_qos_node *parent,
> > +				   struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *node;
> > +
> > +	list_for_each_entry(node, &parent->child_schq_list, list)
> > +		cfg->schq[node->level]++;
> > +}
> > +
> > +static void otx2_qos_fill_cfg_tl(struct otx2_qos_node *parent,
> > +				 struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *node;
> > +
> > +	list_for_each_entry(node, &parent->child_list, list) {
> > +		otx2_qos_fill_cfg_tl(node, cfg);
> > +		cfg->schq_contig[node->level]++;
> > +		otx2_qos_fill_cfg_schq(node, cfg);
> > +	}
> > +}
> > +
> > +static void otx2_qos_prepare_txschq_cfg(struct otx2_nic *pfvf,
> > +					struct otx2_qos_node *parent,
> > +					struct otx2_qos_cfg *cfg)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	otx2_qos_fill_cfg_tl(parent, cfg);
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static void otx2_qos_read_txschq_cfg_schq(struct otx2_qos_node
> *parent,
> > +					  struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *node;
> > +	int cnt;
> > +
> > +	list_for_each_entry(node, &parent->child_schq_list, list) {
> > +		cnt = cfg->dwrr_node_pos[node->level];
> > +		cfg->schq_list[node->level][cnt] = node->schq;
> > +		cfg->schq[node->level]++;
> > +		cfg->dwrr_node_pos[node->level]++;
> > +	}
> > +}
> > +
> > +static void otx2_qos_read_txschq_cfg_tl(struct otx2_qos_node *parent,
> > +					struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *node;
> > +	int cnt;
> > +
> > +	list_for_each_entry(node, &parent->child_list, list) {
> > +		otx2_qos_read_txschq_cfg_tl(node, cfg);
> > +		cnt = cfg->static_node_pos[node->level];
> > +		cfg->schq_contig_list[node->level][cnt] = node->schq;
> > +		cfg->schq_contig[node->level]++;
> > +		cfg->static_node_pos[node->level]++;
> > +		otx2_qos_read_txschq_cfg_schq(node, cfg);
> > +	}
> > +}
> > +
> > +static void otx2_qos_read_txschq_cfg(struct otx2_nic *pfvf,
> > +				     struct otx2_qos_node *node,
> > +				     struct otx2_qos_cfg *cfg)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	otx2_qos_read_txschq_cfg_tl(node, cfg);
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static struct otx2_qos_node *
> > +otx2_qos_alloc_root(struct otx2_nic *pfvf)
> > +{
> > +	struct otx2_qos_node *node;
> > +
> > +	node = kzalloc(sizeof(*node), GFP_KERNEL);
> > +	if (!node)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	node->parent = NULL;
> > +	if (!is_otx2_vf(pfvf->pcifunc))
> > +		node->level = NIX_TXSCH_LVL_TL1;
> > +	else
> > +		node->level = NIX_TXSCH_LVL_TL2;
> > +
> > +	node->qid = OTX2_QOS_QID_INNER;
> > +	node->classid = OTX2_QOS_ROOT_CLASSID;
> > +
> > +	hash_add(pfvf->qos.qos_hlist, &node->hlist, node->classid);
> > +	list_add_tail(&node->list, &pfvf->qos.qos_tree);
> > +	INIT_LIST_HEAD(&node->child_list);
> > +	INIT_LIST_HEAD(&node->child_schq_list);
> > +
> > +	return node;
> > +}
> > +
> > +static int otx2_qos_add_child_node(struct otx2_qos_node *parent,
> > +				   struct otx2_qos_node *node)
> > +{
> > +	struct list_head *head = &parent->child_list;
> > +	struct otx2_qos_node *tmp_node;
> > +	struct list_head *tmp;
> > +
> > +	for (tmp = head->next; tmp != head; tmp = tmp->next) {
> > +		tmp_node = list_entry(tmp, struct otx2_qos_node, list);
> > +		if (tmp_node->prio == node->prio)
> > +			return -EEXIST;
> > +		if (tmp_node->prio > node->prio) {
> > +			list_add_tail(&node->list, tmp);
> > +			return 0;
> > +		}
> > +	}
> > +
> > +	list_add_tail(&node->list, head);
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_alloc_txschq_node(struct otx2_nic *pfvf,
> > +				      struct otx2_qos_node *node)
> > +{
> > +	struct otx2_qos_node *txschq_node, *parent, *tmp;
> > +	int lvl;
> > +
> > +	parent = node;
> > +	for (lvl = node->level - 1; lvl >= NIX_TXSCH_LVL_MDQ; lvl--) {
> > +		txschq_node = kzalloc(sizeof(*txschq_node), GFP_KERNEL);
> > +		if (!txschq_node)
> > +			goto err_out;
> > +
> > +		txschq_node->parent = parent;
> > +		txschq_node->level = lvl;
> > +		txschq_node->classid = OTX2_QOS_CLASS_NONE;
> > +		txschq_node->qid = OTX2_QOS_QID_NONE;
> > +		txschq_node->rate = 0;
> > +		txschq_node->ceil = 0;
> > +		txschq_node->prio = 0;
> > +
> > +		mutex_lock(&pfvf->qos.qos_lock);
> > +		list_add_tail(&txschq_node->list, &node->child_schq_list);
> > +		mutex_unlock(&pfvf->qos.qos_lock);
> > +
> > +		INIT_LIST_HEAD(&txschq_node->child_list);
> > +		INIT_LIST_HEAD(&txschq_node->child_schq_list);
> > +		parent = txschq_node;
> > +	}
> > +
> > +	return 0;
> > +
> > +err_out:
> > +	list_for_each_entry_safe(txschq_node, tmp, &node-
> >child_schq_list,
> > +				 list) {
> > +		list_del(&txschq_node->list);
> > +		kfree(txschq_node);
> > +	}
> > +	return -ENOMEM;
> > +}
> > +
> > +static struct otx2_qos_node *
> > +otx2_qos_sw_create_leaf_node(struct otx2_nic *pfvf,
> > +			     struct otx2_qos_node *parent,
> > +			     u16 classid, u32 prio, u64 rate, u64 ceil,
> > +			     u16 qid)
> > +{
> > +	struct otx2_qos_node *node;
> > +	int err;
> > +
> > +	node = kzalloc(sizeof(*node), GFP_KERNEL);
> > +	if (!node)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	node->parent = parent;
> > +	node->level = parent->level - 1;
> > +	node->classid = classid;
> > +	node->qid = qid;
> > +	node->rate = otx2_qos_convert_rate(rate);
> > +	node->ceil = otx2_qos_convert_rate(ceil);
> > +	node->prio = prio;
> > +
> > +	__set_bit(qid, pfvf->qos.qos_sq_bmap);
> > +
> > +	hash_add(pfvf->qos.qos_hlist, &node->hlist, classid);
> > +
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	err = otx2_qos_add_child_node(parent, node);
> > +	if (err) {
> > +		mutex_unlock(&pfvf->qos.qos_lock);
> > +		return ERR_PTR(err);
> > +	}
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +
> > +	INIT_LIST_HEAD(&node->child_list);
> > +	INIT_LIST_HEAD(&node->child_schq_list);
> 
> Looks suspicious that some fields of node are initialized after
> otx2_qos_add_child_node is called.

" otx2_qos_add_child_node" tries to rearrange the node in linked list based on node priority.
Here on success case we are calling INIT_LIST_HEAD. 
Will move this logic inside the function.
> 
> > +
> > +	err = otx2_qos_alloc_txschq_node(pfvf, node);
> > +	if (err) {
> > +		otx2_qos_sw_node_delete(pfvf, node);
> > +		return ERR_PTR(-ENOMEM);
> > +	}
> > +
> > +	return node;
> > +}
> > +
> > +static struct otx2_qos_node *
> > +otx2_sw_node_find(struct otx2_nic *pfvf, u32 classid)
> > +{
> > +	struct otx2_qos_node *node = NULL;
> > +
> > +	hash_for_each_possible(pfvf->qos.qos_hlist, node, hlist, classid) {
> 
> This loop may be called from ndo_select_queue, while another thread may
> modify qos_hlist. We use RCU in mlx5e to protect this structure. What
> protects it in your driver?
> 
We did not came across this case, as we normally wont delete the classes on the fly.
Will test the scenario and update the patch.
> > +		if (node->classid == classid)
> > +			break;
> > +	}
> > +
> > +	return node;
> > +}
> > +
> > +int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid)
> > +{
> > +	struct otx2_qos_node *node;
> > +	u16 qid;
> > +	int res;
> > +
> > +	node = otx2_sw_node_find(pfvf, classid);
> > +	if (IS_ERR(node)) {
> > +		res = -ENOENT;
> > +		goto out;
> > +	}
> > +	qid = READ_ONCE(node->qid);
> > +	if (qid == OTX2_QOS_QID_INNER) {
> > +		res = -EINVAL;
> > +		goto out;
> > +	}
> > +	res = pfvf->hw.tx_queues + qid;
> > +out:
> > +	return res;
> > +}
> > +
> > +static int
> > +otx2_qos_txschq_config(struct otx2_nic *pfvf, struct otx2_qos_node
> *node)
> > +{
> > +	struct mbox *mbox = &pfvf->mbox;
> > +	struct nix_txschq_config *req;
> > +	int rc;
> > +
> > +	mutex_lock(&mbox->lock);
> > +
> > +	req = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
> > +	if (!req) {
> > +		mutex_unlock(&mbox->lock);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	req->lvl = node->level;
> > +	__otx2_qos_txschq_cfg(pfvf, node, req);
> > +
> > +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> > +
> > +	mutex_unlock(&mbox->lock);
> > +
> > +	return rc;
> > +}
> > +
> > +static int otx2_qos_txschq_alloc(struct otx2_nic *pfvf,
> > +				 struct otx2_qos_cfg *cfg)
> > +{
> > +	struct nix_txsch_alloc_req *req;
> > +	struct nix_txsch_alloc_rsp *rsp;
> > +	struct mbox *mbox = &pfvf->mbox;
> > +	int lvl, rc, schq;
> > +
> > +	mutex_lock(&mbox->lock);
> > +	req = otx2_mbox_alloc_msg_nix_txsch_alloc(&pfvf->mbox);
> > +	if (!req) {
> > +		mutex_unlock(&mbox->lock);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> > +		req->schq[lvl] = cfg->schq[lvl];
> > +		req->schq_contig[lvl] = cfg->schq_contig[lvl];
> > +	}
> > +
> > +	rc = otx2_sync_mbox_msg(&pfvf->mbox);
> > +	if (rc) {
> > +		mutex_unlock(&mbox->lock);
> > +		return rc;
> > +	}
> > +
> > +	rsp = (struct nix_txsch_alloc_rsp *)
> > +	      otx2_mbox_get_rsp(&pfvf->mbox.mbox, 0, &req->hdr);
> > +
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> > +		for (schq = 0; schq < rsp->schq_contig[lvl]; schq++) {
> > +			cfg->schq_contig_list[lvl][schq] =
> > +				rsp->schq_contig_list[lvl][schq];
> > +		}
> > +	}
> > +
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> > +		for (schq = 0; schq < rsp->schq[lvl]; schq++) {
> > +			cfg->schq_list[lvl][schq] =
> > +				rsp->schq_list[lvl][schq];
> > +		}
> > +	}
> > +
> > +	pfvf->qos.link_cfg_lvl = rsp->link_cfg_lvl;
> > +
> > +	mutex_unlock(&mbox->lock);
> > +
> > +	return rc;
> > +}
> > +
> > +static void otx2_qos_txschq_fill_cfg_schq(struct otx2_nic *pfvf,
> > +					  struct otx2_qos_node *node,
> > +					  struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +	int cnt;
> > +
> > +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> > +		cnt = cfg->dwrr_node_pos[tmp->level];
> > +		tmp->schq = cfg->schq_list[tmp->level][cnt];
> > +		cfg->dwrr_node_pos[tmp->level]++;
> > +	}
> > +}
> > +
> > +static void otx2_qos_txschq_fill_cfg_tl(struct otx2_nic *pfvf,
> > +					struct otx2_qos_node *node,
> > +					struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +	int cnt;
> > +
> > +	list_for_each_entry(tmp, &node->child_list, list) {
> > +		otx2_qos_txschq_fill_cfg_tl(pfvf, tmp, cfg);
> > +		cnt = cfg->static_node_pos[tmp->level];
> > +		tmp->schq = cfg->schq_contig_list[tmp->level][cnt];
> > +		if (cnt == 0)
> > +			node->prio_anchor = tmp->schq;
> > +		cfg->static_node_pos[tmp->level]++;
> > +		otx2_qos_txschq_fill_cfg_schq(pfvf, tmp, cfg);
> > +	}
> > +}
> > +
> > +static void otx2_qos_txschq_fill_cfg(struct otx2_nic *pfvf,
> > +				     struct otx2_qos_node *node,
> > +				     struct otx2_qos_cfg *cfg)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	otx2_qos_txschq_fill_cfg_tl(pfvf, node, cfg);
> > +	otx2_qos_txschq_fill_cfg_schq(pfvf, node, cfg);
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static int otx2_qos_txschq_push_cfg_schq(struct otx2_nic *pfvf,
> > +					 struct otx2_qos_node *node,
> > +					 struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +	int ret = 0;
> > +
> > +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> > +		ret = otx2_qos_txschq_config(pfvf, tmp);
> > +		if (ret)
> > +			return -EIO;
> > +		ret = otx2_qos_txschq_set_parent_topology(pfvf, tmp-
> >parent);
> > +		if (ret)
> > +			return -EIO;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_txschq_push_cfg_tl(struct otx2_nic *pfvf,
> > +				       struct otx2_qos_node *node,
> > +				       struct otx2_qos_cfg *cfg)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +	int ret;
> > +
> > +	list_for_each_entry(tmp, &node->child_list, list) {
> > +		ret = otx2_qos_txschq_push_cfg_tl(pfvf, tmp, cfg);
> > +		if (ret)
> > +			return -EIO;
> > +		ret = otx2_qos_txschq_config(pfvf, tmp);
> > +		if (ret)
> > +			return -EIO;
> > +		ret = otx2_qos_txschq_push_cfg_schq(pfvf, tmp, cfg);
> > +		if (ret)
> > +			return -EIO;
> > +	}
> > +
> > +	ret = otx2_qos_txschq_set_parent_topology(pfvf, node);
> > +	if (ret)
> > +		return -EIO;
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_txschq_push_cfg(struct otx2_nic *pfvf,
> > +				    struct otx2_qos_node *node,
> > +				    struct otx2_qos_cfg *cfg)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	ret = otx2_qos_txschq_push_cfg_tl(pfvf, node, cfg);
> > +	if (ret)
> > +		goto out;
> > +	ret = otx2_qos_txschq_push_cfg_schq(pfvf, node, cfg);
> > +out:
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +	return ret;
> > +}
> > +
> > +static int otx2_qos_txschq_update_config(struct otx2_nic *pfvf,
> > +					 struct otx2_qos_node *node,
> > +					 struct otx2_qos_cfg *cfg)
> > +{
> > +	otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
> > +
> > +	return otx2_qos_txschq_push_cfg(pfvf, node, cfg);
> > +}
> > +
> > +static int otx2_qos_txschq_update_root_cfg(struct otx2_nic *pfvf,
> > +					   struct otx2_qos_node *root,
> > +					   struct otx2_qos_cfg *cfg)
> > +{
> > +	root->schq = cfg->schq_list[root->level][0];
> > +	return otx2_qos_txschq_config(pfvf, root);
> > +}
> > +
> > +static void otx2_qos_free_cfg(struct otx2_nic *pfvf, struct otx2_qos_cfg
> *cfg)
> > +{
> > +	int lvl, idx, schq;
> > +
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> > +		for (idx = 0; idx < cfg->schq[lvl]; idx++) {
> > +			schq = cfg->schq_list[lvl][idx];
> > +			otx2_txschq_free_one(pfvf, lvl, schq);
> > +		}
> > +	}
> > +
> > +	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
> > +		for (idx = 0; idx < cfg->schq_contig[lvl]; idx++) {
> > +			schq = cfg->schq_contig_list[lvl][idx];
> > +			otx2_txschq_free_one(pfvf, lvl, schq);
> > +		}
> > +	}
> > +}
> > +
> > +static void otx2_qos_enadis_sq(struct otx2_nic *pfvf,
> > +			       struct otx2_qos_node *node,
> > +			       u16 qid)
> > +{
> > +	if (pfvf->qos.qid_to_sqmap[qid] != OTX2_QOS_INVALID_SQ)
> > +		otx2_qos_disable_sq(pfvf, qid);
> > +
> > +	pfvf->qos.qid_to_sqmap[qid] = node->schq;
> > +	otx2_qos_enable_sq(pfvf, qid);
> > +}
> > +
> > +static void otx2_qos_update_smq_schq(struct otx2_nic *pfvf,
> > +				     struct otx2_qos_node *node,
> > +				     bool action)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +
> > +	if (node->qid == OTX2_QOS_QID_INNER)
> > +		return;
> > +
> > +	list_for_each_entry(tmp, &node->child_schq_list, list) {
> > +		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
> > +			if (action == QOS_SMQ_FLUSH)
> > +				otx2_smq_flush(pfvf, tmp->schq);
> > +			else
> > +				otx2_qos_enadis_sq(pfvf, tmp, node->qid);
> > +		}
> > +	}
> > +}
> > +
> > +static void __otx2_qos_update_smq(struct otx2_nic *pfvf,
> > +				  struct otx2_qos_node *node,
> > +				  bool action)
> > +{
> > +	struct otx2_qos_node *tmp;
> > +
> > +	list_for_each_entry(tmp, &node->child_list, list) {
> > +		__otx2_qos_update_smq(pfvf, tmp, action);
> > +		if (tmp->qid == OTX2_QOS_QID_INNER)
> > +			continue;
> > +		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
> > +			if (action == QOS_SMQ_FLUSH)
> > +				otx2_smq_flush(pfvf, tmp->schq);
> > +			else
> > +				otx2_qos_enadis_sq(pfvf, tmp, tmp->qid);
> > +		} else {
> > +			otx2_qos_update_smq_schq(pfvf, tmp, action);
> > +		}
> > +	}
> > +}
> > +
> > +static void otx2_qos_update_smq(struct otx2_nic *pfvf,
> > +				struct otx2_qos_node *node,
> > +				bool action)
> > +{
> > +	mutex_lock(&pfvf->qos.qos_lock);
> > +	__otx2_qos_update_smq(pfvf, node, action);
> > +	otx2_qos_update_smq_schq(pfvf, node, action);
> > +	mutex_unlock(&pfvf->qos.qos_lock);
> > +}
> > +
> > +static int otx2_qos_push_txschq_cfg(struct otx2_nic *pfvf,
> > +				    struct otx2_qos_node *node,
> > +				    struct otx2_qos_cfg *cfg)
> > +{
> > +	int ret = 0;
> > +
> > +	ret = otx2_qos_txschq_alloc(pfvf, cfg);
> > +	if (ret)
> > +		return -ENOSPC;
> > +
> > +	if (!(pfvf->netdev->flags & IFF_UP)) {
> > +		otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
> > +		return 0;
> > +	}
> > +
> > +	ret = otx2_qos_txschq_update_config(pfvf, node, cfg);
> > +	if (ret) {
> > +		otx2_qos_free_cfg(pfvf, cfg);
> > +		return -EIO;
> > +	}
> > +
> > +	otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_update_tree(struct otx2_nic *pfvf,
> > +				struct otx2_qos_node *node,
> > +				struct otx2_qos_cfg *cfg)
> > +{
> > +	otx2_qos_prepare_txschq_cfg(pfvf, node->parent, cfg);
> > +	return otx2_qos_push_txschq_cfg(pfvf, node->parent, cfg);
> > +}
> > +
> > +static int otx2_qos_root_add(struct otx2_nic *pfvf, u16 htb_maj_id, u16
> htb_defcls,
> > +			     struct netlink_ext_ack *extack)
> > +{
> > +	struct otx2_qos_cfg *new_cfg;
> > +	struct otx2_qos_node *root;
> > +	int err;
> > +
> > +	netdev_dbg(pfvf->netdev,
> > +		   "TC_HTB_CREATE: handle=0x%x defcls=0x%x\n",
> > +		   htb_maj_id, htb_defcls);
> > +
> > +	INIT_LIST_HEAD(&pfvf->qos.qos_tree);
> > +	mutex_init(&pfvf->qos.qos_lock);
> > +
> > +	root = otx2_qos_alloc_root(pfvf);
> > +	if (IS_ERR(root)) {
> > +		mutex_destroy(&pfvf->qos.qos_lock);
> > +		err = PTR_ERR(root);
> > +		return err;
> > +	}
> > +
> > +	/* allocate txschq queue */
> > +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> > +	if (!new_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		mutex_destroy(&pfvf->qos.qos_lock);
> > +		return -ENOMEM;
> > +	}
> > +	/* allocate htb root node */
> > +	new_cfg->schq[root->level] = 1;
> > +	err = otx2_qos_txschq_alloc(pfvf, new_cfg);
> > +	if (err) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Error allocating txschq");
> > +		goto free_root_node;
> > +	}
> > +
> > +	if (!(pfvf->netdev->flags & IFF_UP) ||
> > +	    root->level == NIX_TXSCH_LVL_TL1) {
> > +		root->schq = new_cfg->schq_list[root->level][0];
> > +		goto out;
> > +	}
> > +
> > +	/* update the txschq configuration in hw */
> > +	err = otx2_qos_txschq_update_root_cfg(pfvf, root, new_cfg);
> > +	if (err) {
> > +		NL_SET_ERR_MSG_MOD(extack,
> > +				   "Error updating txschq configuration");
> > +		goto txschq_free;
> > +	}
> > +
> > +out:
> > +	WRITE_ONCE(pfvf->qos.defcls, htb_defcls);
> > +	smp_store_release(&pfvf->qos.maj_id, htb_maj_id); /* barrier */
> > +	kfree(new_cfg);
> > +	return 0;
> > +
> > +txschq_free:
> > +	otx2_qos_free_cfg(pfvf, new_cfg);
> > +free_root_node:
> > +	kfree(new_cfg);
> > +	otx2_qos_sw_node_delete(pfvf, root);
> > +	mutex_destroy(&pfvf->qos.qos_lock);
> > +	return err;
> > +}
> > +
> > +static int otx2_qos_root_destroy(struct otx2_nic *pfvf)
> > +{
> > +	struct otx2_qos_node *root;
> > +
> > +	netdev_dbg(pfvf->netdev, "TC_HTB_DESTROY\n");
> > +
> > +	/* find root node */
> > +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> > +	if (IS_ERR(root))
> > +		return -ENOENT;
> > +
> > +	/* free the hw mappings */
> > +	otx2_qos_destroy_node(pfvf, root);
> > +	mutex_destroy(&pfvf->qos.qos_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_validate_configuration(struct otx2_qos_node
> *parent,
> > +					   struct netlink_ext_ack *extack,
> > +					   struct otx2_nic *pfvf,
> > +					   u64 prio)
> > +{
> > +	if (test_bit(prio, parent->prio_bmap)) {
> > +		NL_SET_ERR_MSG_MOD(extack,
> > +				   "Static priority child with same priority
> exists");
> > +		return -EEXIST;
> > +	}
> > +
> > +	if (prio == TXSCH_TL1_DFLT_RR_PRIO) {
> > +		NL_SET_ERR_MSG_MOD(extack,
> > +				   "Priority is reserved for Round Robin");
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_leaf_alloc_queue(struct otx2_nic *pfvf, u16 classid,
> > +				     u32 parent_classid, u64 rate, u64 ceil,
> > +				     u64 prio, struct netlink_ext_ack *extack)
> > +{
> > +	struct otx2_qos_cfg *old_cfg, *new_cfg;
> > +	struct otx2_qos_node *node, *parent;
> > +	int qid, ret, err;
> > +
> > +	netdev_dbg(pfvf->netdev,
> > +		   "TC_HTB_LEAF_ALLOC_QUEUE: classid=0x%x
> parent_classid=0x%x rate=%lld ceil=%lld prio=%lld\n",
> > +		   classid, parent_classid, rate, ceil, prio);
> > +
> > +	if (prio > OTX2_QOS_MAX_PRIO) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to
> 7");
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +
> > +	/* get parent node */
> > +	parent = otx2_sw_node_find(pfvf, parent_classid);
> > +	if (IS_ERR(parent)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
> > +		ret = -ENOENT;
> > +		goto out;
> > +	}
> > +	if (parent->level == NIX_TXSCH_LVL_MDQ) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB qos max levels
> reached");
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +
> > +	ret = otx2_qos_validate_configuration(parent, extack, pfvf, prio);
> > +	if (ret)
> > +		goto out;
> > +
> > +	set_bit(prio, parent->prio_bmap);
> > +
> > +	/* read current txschq configuration */
> > +	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
> > +	if (!old_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	otx2_qos_read_txschq_cfg(pfvf, parent, old_cfg);
> > +
> > +	/* allocate a new sq */
> > +	qid = otx2_qos_get_qid(pfvf);
> > +	if (qid < 0) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Reached max supported
> QOS SQ's");
> > +		ret = -ENOMEM;
> > +		goto free_old_cfg;
> > +	}
> > +
> > +	/* Actual SQ mapping will be updated after SMQ alloc */
> > +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> > +
> > +	/* allocate and initialize a new child node */
> > +	node = otx2_qos_sw_create_leaf_node(pfvf, parent, classid, prio,
> rate,
> > +					    ceil, qid);
> > +	if (IS_ERR(node)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf
> node");
> > +		ret = PTR_ERR(node);
> > +		goto free_old_cfg;
> > +	}
> > +
> > +	/* push new txschq config to hw */
> > +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> > +	if (!new_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		ret = -ENOMEM;
> > +		goto free_node;
> > +	}
> > +	ret = otx2_qos_update_tree(pfvf, node, new_cfg);
> > +	if (ret) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration
> error");
> > +		kfree(new_cfg);
> > +		otx2_qos_sw_node_delete(pfvf, node);
> > +		/* restore the old qos tree */
> > +		err = otx2_qos_txschq_update_config(pfvf, parent, old_cfg);
> > +		if (err) {
> > +			netdev_err(pfvf->netdev,
> > +				   "Failed to restore txcshq configuration");
> > +			goto free_old_cfg;
> > +		}
> > +
> > +		otx2_qos_update_smq(pfvf, parent, QOS_CFG_SQ);
> > +		goto free_old_cfg;
> > +	}
> > +
> > +	/* update tx_real_queues */
> > +	otx2_qos_update_tx_netdev_queues(pfvf);
> > +
> > +	/* free new txschq config */
> > +	kfree(new_cfg);
> > +
> > +	/* free old txschq config */
> > +	otx2_qos_free_cfg(pfvf, old_cfg);
> > +	kfree(old_cfg);
> > +
> > +	return pfvf->hw.tx_queues + qid;
> > +
> > +free_node:
> > +	otx2_qos_sw_node_delete(pfvf, node);
> > +free_old_cfg:
> > +	kfree(old_cfg);
> > +out:
> > +	return ret;
> > +}
> > +
> > +static int otx2_qos_leaf_to_inner(struct otx2_nic *pfvf, u16 classid,
> > +				  u16 child_classid, u64 rate, u64 ceil, u64 prio,
> > +				  struct netlink_ext_ack *extack)
> > +{
> > +	struct otx2_qos_cfg *old_cfg, *new_cfg;
> > +	struct otx2_qos_node *node, *child;
> > +	int ret, err;
> > +	u16 qid;
> > +
> > +	netdev_dbg(pfvf->netdev,
> > +		   "TC_HTB_LEAF_TO_INNER classid %04x, child %04x, rate
> %llu, ceil %llu\n",
> > +		   classid, child_classid, rate, ceil);
> > +
> > +	if (prio > OTX2_QOS_MAX_PRIO) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to
> 7");
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +
> > +	/* find node related to classid */
> > +	node = otx2_sw_node_find(pfvf, classid);
> > +	if (IS_ERR(node)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> > +		ret = -ENOENT;
> > +		goto out;
> > +	}
> > +	/* check max qos txschq level */
> > +	if (node->level == NIX_TXSCH_LVL_MDQ) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB qos level not
> supported");
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +
> > +	set_bit(prio, node->prio_bmap);
> > +
> > +	/* store the qid to assign to leaf node */
> > +	qid = node->qid;
> > +
> > +	/* read current txschq configuration */
> > +	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
> > +	if (!old_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	otx2_qos_read_txschq_cfg(pfvf, node, old_cfg);
> > +
> > +	/* delete the txschq nodes allocated for this node */
> > +	otx2_qos_free_sw_node_schq(pfvf, node);
> > +
> > +	/* mark this node as htb inner node */
> > +	node->qid = OTX2_QOS_QID_INNER;
> 
> As you can concurrently read node->qid from the datapath
> (ndo_select_queue), you should use READ_ONCE/WRITE_ONCE to
> guarantee
> that the value will not be torn; you already use READ_ONCE, but it
> doesn't pair with a WRITE_ONCE here.
> 
ACK, will update the suggested change.

Thanks,
Hariprasad k
> > +
> > +	/* allocate and initialize a new child node */
> > +	child = otx2_qos_sw_create_leaf_node(pfvf, node, child_classid,
> > +					     prio, rate, ceil, qid);
> > +	if (IS_ERR(child)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf
> node");
> > +		ret = PTR_ERR(child);
> > +		goto free_old_cfg;
> > +	}
> > +
> > +	/* push new txschq config to hw */
> > +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> > +	if (!new_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		ret = -ENOMEM;
> > +		goto free_node;
> > +	}
> > +	ret = otx2_qos_update_tree(pfvf, child, new_cfg);
> > +	if (ret) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration
> error");
> > +		kfree(new_cfg);
> > +		otx2_qos_sw_node_delete(pfvf, child);
> > +		/* restore the old qos tree */
> > +		node->qid = qid;
> 
> Same here; might be somewhere else as well.
> 
> > +		err = otx2_qos_alloc_txschq_node(pfvf, node);
> > +		if (err) {
> > +			netdev_err(pfvf->netdev,
> > +				   "Failed to restore old leaf node");
> > +			goto free_old_cfg;
> > +		}
> > +		err = otx2_qos_txschq_update_config(pfvf, node, old_cfg);
> > +		if (err) {
> > +			netdev_err(pfvf->netdev,
> > +				   "Failed to restore txcshq configuration");
> > +			goto free_old_cfg;
> > +		}
> > +		otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
> > +		goto free_old_cfg;
> > +	}
> > +
> > +	/* free new txschq config */
> > +	kfree(new_cfg);
> > +
> > +	/* free old txschq config */
> > +	otx2_qos_free_cfg(pfvf, old_cfg);
> > +	kfree(old_cfg);
> > +
> > +	return 0;
> > +
> > +free_node:
> > +	otx2_qos_sw_node_delete(pfvf, child);
> > +free_old_cfg:
> > +	kfree(old_cfg);
> > +out:
> > +	return ret;
> > +}
> > +
> > +static int otx2_qos_leaf_del(struct otx2_nic *pfvf, u16 *classid,
> > +			     struct netlink_ext_ack *extack)
> > +{
> > +	struct otx2_qos_node *node, *parent;
> > +	u64 prio;
> > +	u16 qid;
> > +
> > +	netdev_dbg(pfvf->netdev, "TC_HTB_LEAF_DEL classid %04x\n",
> *classid);
> > +
> > +	/* find node related to classid */
> > +	node = otx2_sw_node_find(pfvf, *classid);
> > +	if (IS_ERR(node)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> > +		return -ENOENT;
> > +	}
> > +	parent = node->parent;
> > +	prio   = node->prio;
> > +	qid    = node->qid;
> > +
> > +	otx2_qos_disable_sq(pfvf, node->qid);
> > +
> > +	otx2_qos_destroy_node(pfvf, node);
> > +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> > +
> > +	clear_bit(prio, parent->prio_bmap);
> > +
> > +	return 0;
> > +}
> > +
> > +static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool
> force,
> > +				  struct netlink_ext_ack *extack)
> > +{
> > +	struct otx2_qos_node *node, *parent;
> > +	struct otx2_qos_cfg *new_cfg;
> > +	u64 prio;
> > +	int err;
> > +	u16 qid;
> > +
> > +	netdev_dbg(pfvf->netdev,
> > +		   "TC_HTB_LEAF_DEL_LAST classid %04x\n", classid);
> > +
> > +	/* find node related to classid */
> > +	node = otx2_sw_node_find(pfvf, classid);
> > +	if (IS_ERR(node)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* save qid for use by parent */
> > +	qid = node->qid;
> > +	prio = node->prio;
> > +
> > +	parent = otx2_sw_node_find(pfvf, node->parent->classid);
> > +	if (IS_ERR(parent)) {
> > +		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* destroy the leaf node */
> > +	otx2_qos_destroy_node(pfvf, node);
> > +	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
> > +
> > +	clear_bit(prio, parent->prio_bmap);
> > +
> > +	/* create downstream txschq entries to parent */
> > +	err = otx2_qos_alloc_txschq_node(pfvf, parent);
> > +	if (err) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB failed to create txsch
> configuration");
> > +		return err;
> > +	}
> > +	parent->qid = qid;
> > +	__set_bit(qid, pfvf->qos.qos_sq_bmap);
> > +
> > +	/* push new txschq config to hw */
> > +	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
> > +	if (!new_cfg) {
> > +		NL_SET_ERR_MSG_MOD(extack, "Memory allocation
> error");
> > +		return -ENOMEM;
> > +	}
> > +	/* fill txschq cfg and push txschq cfg to hw */
> > +	otx2_qos_fill_cfg_schq(parent, new_cfg);
> > +	err = otx2_qos_push_txschq_cfg(pfvf, parent, new_cfg);
> > +	if (err) {
> > +		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration
> error");
> > +		kfree(new_cfg);
> > +		return err;
> > +	}
> > +	kfree(new_cfg);
> > +
> > +	/* update tx_real_queues */
> > +	otx2_qos_update_tx_netdev_queues(pfvf);
> > +
> > +	return 0;
> > +}
> > +
> > +void otx2_clean_qos_queues(struct otx2_nic *pfvf)
> > +{
> > +	struct otx2_qos_node *root;
> > +
> > +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> > +	if (IS_ERR(root))
> > +		return;
> > +
> > +	otx2_qos_update_smq(pfvf, root, QOS_SMQ_FLUSH);
> > +}
> > +
> > +void otx2_qos_config_txschq(struct otx2_nic *pfvf)
> > +{
> > +	struct otx2_qos_node *root;
> > +	int err;
> > +
> > +	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
> > +	if (IS_ERR(root))
> > +		return;
> > +
> > +	err = otx2_qos_txschq_config(pfvf, root);
> > +	if (err) {
> > +		netdev_err(pfvf->netdev, "Error update txschq
> configuration\n");
> > +		goto root_destroy;
> > +	}
> > +
> > +	err = otx2_qos_txschq_push_cfg_tl(pfvf, root, NULL);
> > +	if (err) {
> > +		netdev_err(pfvf->netdev, "Error update txschq
> configuration\n");
> > +		goto root_destroy;
> > +	}
> > +
> > +	otx2_qos_update_smq(pfvf, root, QOS_CFG_SQ);
> > +	return;
> > +
> > +root_destroy:
> > +	netdev_err(pfvf->netdev, "Failed to update Scheduler/Shaping
> config in Hardware\n");
> > +	/* Free resources allocated */
> > +	otx2_qos_root_destroy(pfvf);
> > +}
> > +
> > +int otx2_setup_tc_htb(struct net_device *ndev, struct
> tc_htb_qopt_offload *htb)
> > +{
> > +	struct otx2_nic *pfvf = netdev_priv(ndev);
> > +	int res;
> > +
> > +	switch (htb->command) {
> > +	case TC_HTB_CREATE:
> > +		return otx2_qos_root_add(pfvf, htb->parent_classid,
> > +					 htb->classid, htb->extack);
> > +	case TC_HTB_DESTROY:
> > +		return otx2_qos_root_destroy(pfvf);
> > +	case TC_HTB_LEAF_ALLOC_QUEUE:
> > +		res = otx2_qos_leaf_alloc_queue(pfvf, htb->classid,
> > +						htb->parent_classid,
> > +						htb->rate, htb->ceil,
> > +						htb->prio, htb->extack);
> > +		if (res < 0)
> > +			return res;
> > +		htb->qid = res;
> > +		return 0;
> > +	case TC_HTB_LEAF_TO_INNER:
> > +		return otx2_qos_leaf_to_inner(pfvf, htb->parent_classid,
> > +					      htb->classid, htb->rate,
> > +					      htb->ceil, htb->prio,
> > +					      htb->extack);
> > +	case TC_HTB_LEAF_DEL:
> > +		return otx2_qos_leaf_del(pfvf, &htb->classid, htb->extack);
> > +	case TC_HTB_LEAF_DEL_LAST:
> > +	case TC_HTB_LEAF_DEL_LAST_FORCE:
> > +		return otx2_qos_leaf_del_last(pfvf, htb->classid,
> > +				htb->command ==
> TC_HTB_LEAF_DEL_LAST_FORCE,
> > +					      htb->extack);
> > +	case TC_HTB_LEAF_QUERY_QUEUE:
> > +		res = otx2_get_txq_by_classid(pfvf, htb->classid);
> > +		htb->qid = res;
> > +		return 0;
> > +	case TC_HTB_NODE_MODIFY:
> > +		fallthrough;
> > +	default:
> > +		return -EOPNOTSUPP;
> > +	}
> > +}
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> > index 73a62d092e99..26de1af2aa57 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
> > @@ -7,13 +7,63 @@
> >  #ifndef OTX2_QOS_H
> >  #define OTX2_QOS_H
> >
> > +#include <linux/types.h>
> > +#include <linux/netdevice.h>
> > +#include <linux/rhashtable.h>
> > +
> > +#define OTX2_QOS_MAX_LVL		4
> > +#define OTX2_QOS_MAX_PRIO		7
> >  #define OTX2_QOS_MAX_LEAF_NODES                16
> >
> > -int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq);
> > -void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq);
> > +enum qos_smq_operations {
> > +	QOS_CFG_SQ,
> > +	QOS_SMQ_FLUSH,
> > +};
> > +
> > +u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic, u64 maxrate, u32
> burst);
> > +
> > +int otx2_setup_tc_htb(struct net_device *ndev, struct
> tc_htb_qopt_offload *htb);
> > +int otx2_qos_get_qid(struct otx2_nic *pfvf);
> > +void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx);
> > +int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx);
> > +void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx);
> > +
> > +struct otx2_qos_cfg {
> > +	u16 schq[NIX_TXSCH_LVL_CNT];
> > +	u16 schq_contig[NIX_TXSCH_LVL_CNT];
> > +	int static_node_pos[NIX_TXSCH_LVL_CNT];
> > +	int dwrr_node_pos[NIX_TXSCH_LVL_CNT];
> > +	u16
> schq_contig_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
> > +	u16 schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
> > +};
> >
> >  struct otx2_qos {
> > -	       u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
> > -	};
> > +	DECLARE_HASHTABLE(qos_hlist,
> order_base_2(OTX2_QOS_MAX_LEAF_NODES));
> > +	struct mutex qos_lock; /* child list lock */
> > +	u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
> > +	struct list_head qos_tree;
> > +	DECLARE_BITMAP(qos_sq_bmap, OTX2_QOS_MAX_LEAF_NODES);
> > +	u16 maj_id;
> > +	u16 defcls;
> > +	u8  link_cfg_lvl; /* LINKX_CFG CSRs mapped to TL3 or TL2's index ? */
> > +};
> > +
> > +struct otx2_qos_node {
> > +	struct list_head list; /* list managment */
> > +	struct list_head child_list;
> > +	struct list_head child_schq_list;
> > +	struct hlist_node hlist;
> > +	DECLARE_BITMAP(prio_bmap, OTX2_QOS_MAX_PRIO + 1);
> > +	struct otx2_qos_node *parent;	/* parent qos node */
> > +	u64 rate; /* htb params */
> > +	u64 ceil;
> > +	u32 classid;
> > +	u32 prio;
> > +	u16 schq; /* hw txschq */
> > +	u16 qid;
> > +	u16 prio_anchor;
> > +	u8 level;
> > +};
> > +
> >
> >  #endif
> > diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> > index 1c77f024c360..8a1e89668a1b 100644
> > --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> > +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
> > @@ -225,7 +225,22 @@ static int otx2_qos_ctx_disable(struct otx2_nic
> *pfvf, u16 qidx, int aura_id)
> >  	return otx2_sync_mbox_msg(&pfvf->mbox);
> >  }
> >
> > -int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
> > +int otx2_qos_get_qid(struct otx2_nic *pfvf)
> > +{
> > +	int qidx;
> > +
> > +	qidx = find_first_zero_bit(pfvf->qos.qos_sq_bmap,
> > +				   pfvf->hw.tc_tx_queues);
> > +
> > +	return qidx == pfvf->hw.tc_tx_queues ? -ENOSPC : qidx;
> > +}
> > +
> > +void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx)
> > +{
> > +	clear_bit(qidx, pfvf->qos.qos_sq_bmap);
> > +}
> > +
> > +int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx)
> >  {
> >  	struct otx2_hw *hw = &pfvf->hw;
> >  	int pool_id, sq_idx, err;
> > @@ -241,7 +256,6 @@ int otx2_qos_enable_sq(struct otx2_nic *pfvf, int
> qidx, u16 smq)
> >  		goto out;
> >
> >  	pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx);
> > -	pfvf->qos.qid_to_sqmap[qidx] = smq;
> >  	err = otx2_sq_init(pfvf, sq_idx, pool_id);
> >  	if (err)
> >  		goto out;
> > @@ -250,7 +264,7 @@ int otx2_qos_enable_sq(struct otx2_nic *pfvf, int
> qidx, u16 smq)
> >  	return err;
> >  }
> >
> > -void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq)
> > +void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx)
> >  {
> >  	struct otx2_qset *qset = &pfvf->qset;
> >  	struct otx2_hw *hw = &pfvf->hw;
> > --
> > 2.17.1
> >
diff mbox series

Patch

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h
index 8931864ee110..f5bf719a6ccf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
@@ -142,7 +142,7 @@  enum nix_scheduler {
 
 #define TXSCH_RR_QTM_MAX		((1 << 24) - 1)
 #define TXSCH_TL1_DFLT_RR_QTM		TXSCH_RR_QTM_MAX
-#define TXSCH_TL1_DFLT_RR_PRIO		(0x1ull)
+#define TXSCH_TL1_DFLT_RR_PRIO		(0x7ull)
 #define CN10K_MAX_DWRR_WEIGHT          16384 /* Weight is 14bit on CN10K */
 
 /* Min/Max packet sizes, excluding FCS */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
index 3d31ddf7c652..5664f768cb0c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
@@ -8,7 +8,7 @@  obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o
 
 rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
                otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
-               otx2_devlink.o qos_sq.o
+               otx2_devlink.o qos_sq.o qos.o
 rvu_nicvf-y := otx2_vf.o otx2_devlink.o
 
 rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index 32c02a2d3554..b4542a801291 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -89,6 +89,11 @@  int otx2_update_sq_stats(struct otx2_nic *pfvf, int qidx)
 	if (!pfvf->qset.sq)
 		return 0;
 
+	if (qidx >= pfvf->hw.non_qos_queues) {
+		if (!test_bit(qidx - pfvf->hw.non_qos_queues, pfvf->qos.qos_sq_bmap))
+			return 0;
+	}
+
 	otx2_nix_sq_op_stats(&sq->stats, pfvf, qidx);
 	return 1;
 }
@@ -747,29 +752,47 @@  int otx2_txsch_alloc(struct otx2_nic *pfvf)
 	return 0;
 }
 
-int otx2_txschq_stop(struct otx2_nic *pfvf)
+void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq)
 {
 	struct nix_txsch_free_req *free_req;
-	int lvl, schq, err;
+	int err;
 
 	mutex_lock(&pfvf->mbox.lock);
-	/* Free the transmit schedulers */
+
 	free_req = otx2_mbox_alloc_msg_nix_txsch_free(&pfvf->mbox);
 	if (!free_req) {
 		mutex_unlock(&pfvf->mbox.lock);
-		return -ENOMEM;
+		netdev_err(pfvf->netdev,
+			   "Failed alloc txschq free req\n");
+		return;
 	}
 
-	free_req->flags = TXSCHQ_FREE_ALL;
+	free_req->schq_lvl = lvl;
+	free_req->schq = schq;
+
 	err = otx2_sync_mbox_msg(&pfvf->mbox);
+	if (err) {
+		netdev_err(pfvf->netdev,
+			   "Failed stop txschq %d at level %d\n", lvl, schq);
+	}
+
 	mutex_unlock(&pfvf->mbox.lock);
+}
+
+void otx2_txschq_stop(struct otx2_nic *pfvf)
+{
+	int lvl, schq;
+
+	/* free non QOS TLx nodes */
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++)
+		otx2_txschq_free_one(pfvf, lvl,
+				     pfvf->hw.txschq_list[lvl][0]);
 
 	/* Clear the txschq list */
 	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
 		for (schq = 0; schq < MAX_TXSCHQ_PER_FUNC; schq++)
 			pfvf->hw.txschq_list[lvl][schq] = 0;
 	}
-	return err;
 }
 
 void otx2_sqb_flush(struct otx2_nic *pfvf)
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 3834cc447426..4b219e8e5b32 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -252,6 +252,7 @@  struct otx2_hw {
 #define CN10K_RPM		3
 #define CN10K_PTP_ONESTEP	4
 #define CN10K_HW_MACSEC		5
+#define QOS_CIR_PIR_SUPPORT	6
 	unsigned long		cap_flag;
 
 #define LMT_LINE_SIZE		128
@@ -586,6 +587,7 @@  static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf)
 		__set_bit(CN10K_LMTST, &hw->cap_flag);
 		__set_bit(CN10K_RPM, &hw->cap_flag);
 		__set_bit(CN10K_PTP_ONESTEP, &hw->cap_flag);
+		__set_bit(QOS_CIR_PIR_SUPPORT, &hw->cap_flag);
 	}
 
 	if (is_dev_cn10kb(pfvf->pdev))
@@ -935,7 +937,7 @@  int otx2_config_nix(struct otx2_nic *pfvf);
 int otx2_config_nix_queues(struct otx2_nic *pfvf);
 int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en);
 int otx2_txsch_alloc(struct otx2_nic *pfvf);
-int otx2_txschq_stop(struct otx2_nic *pfvf);
+void otx2_txschq_stop(struct otx2_nic *pfvf);
 void otx2_sqb_flush(struct otx2_nic *pfvf);
 int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
 		    dma_addr_t *dma);
@@ -953,6 +955,7 @@  int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
 		   int stack_pages, int numptrs, int buf_size);
 int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
 		   int pool_id, int numptrs);
+void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
 
 /* RSS configuration APIs*/
 int otx2_rss_init(struct otx2_nic *pfvf);
@@ -1064,4 +1067,7 @@  static inline void cn10k_handle_mcs_event(struct otx2_nic *pfvf,
 void otx2_qos_sq_setup(struct otx2_nic *pfvf, int qos_txqs);
 u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
 		      struct net_device *sb_dev);
+int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid);
+void otx2_qos_config_txschq(struct otx2_nic *pfvf);
+void otx2_clean_qos_queues(struct otx2_nic *pfvf);
 #endif /* OTX2_COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
index 0f8d1a69139f..e8722a4f4cc6 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
@@ -92,10 +92,16 @@  static void otx2_get_qset_strings(struct otx2_nic *pfvf, u8 **data, int qset)
 			*data += ETH_GSTRING_LEN;
 		}
 	}
-	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
+
+	for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
 		for (stats = 0; stats < otx2_n_queue_stats; stats++) {
-			sprintf(*data, "txq%d: %s", qidx + start_qidx,
-				otx2_queue_stats[stats].name);
+			if (qidx >= pfvf->hw.non_qos_queues)
+				sprintf(*data, "txq_qos%d: %s",
+					qidx + start_qidx - pfvf->hw.non_qos_queues,
+					otx2_queue_stats[stats].name);
+			else
+				sprintf(*data, "txq%d: %s", qidx + start_qidx,
+					otx2_queue_stats[stats].name);
 			*data += ETH_GSTRING_LEN;
 		}
 	}
@@ -159,7 +165,7 @@  static void otx2_get_qset_stats(struct otx2_nic *pfvf,
 				[otx2_queue_stats[stat].index];
 	}
 
-	for (qidx = 0; qidx < pfvf->hw.tx_queues; qidx++) {
+	for (qidx = 0; qidx <  otx2_get_total_tx_queues(pfvf); qidx++) {
 		if (!otx2_update_sq_stats(pfvf, qidx)) {
 			for (stat = 0; stat < otx2_n_queue_stats; stat++)
 				*((*data)++) = 0;
@@ -254,7 +260,8 @@  static int otx2_get_sset_count(struct net_device *netdev, int sset)
 		return -EINVAL;
 
 	qstats_count = otx2_n_queue_stats *
-		       (pfvf->hw.rx_queues + pfvf->hw.tx_queues);
+		       (pfvf->hw.rx_queues + pfvf->hw.non_qos_queues +
+			pfvf->hw.tc_tx_queues);
 	if (!test_bit(CN10K_RPM, &pfvf->hw.cap_flag))
 		mac_stats = CGX_RX_STATS_COUNT + CGX_TX_STATS_COUNT;
 	otx2_update_lmac_fec_stats(pfvf);
@@ -282,7 +289,7 @@  static int otx2_set_channels(struct net_device *dev,
 {
 	struct otx2_nic *pfvf = netdev_priv(dev);
 	bool if_up = netif_running(dev);
-	int err = 0;
+	int err, qos_txqs;
 
 	if (!channel->rx_count || !channel->tx_count)
 		return -EINVAL;
@@ -296,14 +303,19 @@  static int otx2_set_channels(struct net_device *dev,
 	if (if_up)
 		dev->netdev_ops->ndo_stop(dev);
 
-	err = otx2_set_real_num_queues(dev, channel->tx_count,
+	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
+				 OTX2_QOS_MAX_LEAF_NODES);
+
+	err = otx2_set_real_num_queues(dev, channel->tx_count + qos_txqs,
 				       channel->rx_count);
 	if (err)
 		return err;
 
 	pfvf->hw.rx_queues = channel->rx_count;
 	pfvf->hw.tx_queues = channel->tx_count;
-	pfvf->qset.cq_cnt = pfvf->hw.tx_queues +  pfvf->hw.rx_queues;
+	if (pfvf->xdp_prog)
+		pfvf->hw.xdp_queues = channel->rx_count;
+	pfvf->hw.non_qos_queues =  pfvf->hw.tx_queues + pfvf->hw.xdp_queues;
 
 	if (if_up)
 		err = dev->netdev_ops->ndo_open(dev);
@@ -1405,7 +1417,8 @@  static int otx2vf_get_sset_count(struct net_device *netdev, int sset)
 		return -EINVAL;
 
 	qstats_count = otx2_n_queue_stats *
-		       (vf->hw.rx_queues + vf->hw.tx_queues);
+		       (vf->hw.rx_queues + vf->hw.tx_queues +
+			vf->hw.tc_tx_queues);
 
 	return otx2_n_dev_stats + otx2_n_drv_stats + qstats_count + 1;
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index a32f0cb89fc4..d0192f9089ee 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -1387,6 +1387,9 @@  static void otx2_free_sq_res(struct otx2_nic *pf)
 	otx2_sq_free_sqbs(pf);
 	for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) {
 		sq = &qset->sq[qidx];
+		/* Skip freeing Qos queues if they are not initialized */
+		if (!sq->sqb_count)
+			continue;
 		qmem_free(pf->dev, sq->sqe);
 		qmem_free(pf->dev, sq->tso_hdrs);
 		kfree(sq->sg);
@@ -1518,8 +1521,7 @@  static int otx2_init_hw_resources(struct otx2_nic *pf)
 	otx2_free_cq_res(pf);
 	otx2_ctx_disable(mbox, NIX_AQ_CTYPE_RQ, false);
 err_free_txsch:
-	if (otx2_txschq_stop(pf))
-		dev_err(pf->dev, "%s failed to stop TX schedulers\n", __func__);
+	otx2_txschq_stop(pf);
 err_free_sq_ptrs:
 	otx2_sq_free_sqbs(pf);
 err_free_rq_ptrs:
@@ -1554,21 +1556,21 @@  static void otx2_free_hw_resources(struct otx2_nic *pf)
 	struct mbox *mbox = &pf->mbox;
 	struct otx2_cq_queue *cq;
 	struct msg_req *req;
-	int qidx, err;
+	int qidx;
 
 	/* Ensure all SQE are processed */
 	otx2_sqb_flush(pf);
 
 	/* Stop transmission */
-	err = otx2_txschq_stop(pf);
-	if (err)
-		dev_err(pf->dev, "RVUPF: Failed to stop/free TX schedulers\n");
+	otx2_txschq_stop(pf);
 
 #ifdef CONFIG_DCB
 	if (pf->pfc_en)
 		otx2_pfc_txschq_stop(pf);
 #endif
 
+	otx2_clean_qos_queues(pf);
+
 	mutex_lock(&mbox->lock);
 	/* Disable backpressure */
 	if (!(pf->pcifunc & RVU_PFVF_FUNC_MASK))
@@ -1836,6 +1838,9 @@  int otx2_open(struct net_device *netdev)
 	/* 'intf_down' may be checked on any cpu */
 	smp_wmb();
 
+	/* Enable QoS configuration before starting tx queues */
+	otx2_qos_config_txschq(pf);
+
 	/* we have already received link status notification */
 	if (pf->linfo.link_up && !(pf->pcifunc & RVU_PFVF_FUNC_MASK))
 		otx2_handle_link_event(pf);
@@ -1980,14 +1985,45 @@  static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 }
 
+static int otx2_qos_select_htb_queue(struct otx2_nic *pf, struct sk_buff *skb,
+				     u16 htb_maj_id)
+{
+	u16 classid;
+
+	if ((TC_H_MAJ(skb->priority) >> 16) == htb_maj_id)
+		classid = TC_H_MIN(skb->priority);
+	else
+		classid = READ_ONCE(pf->qos.defcls);
+
+	if (!classid)
+		return 0;
+
+	return otx2_get_txq_by_classid(pf, classid);
+}
+
 u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
 		      struct net_device *sb_dev)
 {
-#ifdef CONFIG_DCB
 	struct otx2_nic *pf = netdev_priv(netdev);
+	bool qos_enabled;
+#ifdef CONFIG_DCB
 	u8 vlan_prio;
 #endif
+	int txq;
 
+	qos_enabled = (netdev->real_num_tx_queues > pf->hw.tx_queues) ? true : false;
+	if (unlikely(qos_enabled)) {
+		u16 htb_maj_id = smp_load_acquire(&pf->qos.maj_id); /* barrier */
+
+		if (unlikely(htb_maj_id)) {
+			txq = otx2_qos_select_htb_queue(pf, skb, htb_maj_id);
+			if (txq > 0)
+				return txq;
+			goto process_pfc;
+		}
+	}
+
+process_pfc:
 #ifdef CONFIG_DCB
 	if (!skb_vlan_tag_present(skb))
 		goto pick_tx;
@@ -2001,7 +2037,11 @@  u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
 
 pick_tx:
 #endif
-	return netdev_pick_tx(netdev, skb, NULL);
+	txq = netdev_pick_tx(netdev, skb, NULL);
+	if (unlikely(qos_enabled))
+		return txq % pf->hw.tx_queues;
+
+	return txq;
 }
 EXPORT_SYMBOL(otx2_select_queue);
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
index 1b967eaf948b..45a32e4b49d1 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
@@ -145,12 +145,25 @@ 
 #define NIX_AF_TL1X_TOPOLOGY(a)		(0xC80 | (a) << 16)
 #define NIX_AF_TL2X_PARENT(a)		(0xE88 | (a) << 16)
 #define NIX_AF_TL2X_SCHEDULE(a)		(0xE00 | (a) << 16)
+#define NIX_AF_TL2X_TOPOLOGY(a)		(0xE80 | (a) << 16)
+#define NIX_AF_TL2X_CIR(a)              (0xE20 | (a) << 16)
+#define NIX_AF_TL2X_PIR(a)              (0xE30 | (a) << 16)
 #define NIX_AF_TL3X_PARENT(a)		(0x1088 | (a) << 16)
 #define NIX_AF_TL3X_SCHEDULE(a)		(0x1000 | (a) << 16)
+#define NIX_AF_TL3X_SHAPE(a)		(0x1010 | (a) << 16)
+#define NIX_AF_TL3X_CIR(a)		(0x1020 | (a) << 16)
+#define NIX_AF_TL3X_PIR(a)		(0x1030 | (a) << 16)
+#define NIX_AF_TL3X_TOPOLOGY(a)		(0x1080 | (a) << 16)
 #define NIX_AF_TL4X_PARENT(a)		(0x1288 | (a) << 16)
 #define NIX_AF_TL4X_SCHEDULE(a)		(0x1200 | (a) << 16)
+#define NIX_AF_TL4X_SHAPE(a)		(0x1210 | (a) << 16)
+#define NIX_AF_TL4X_CIR(a)		(0x1220 | (a) << 16)
 #define NIX_AF_TL4X_PIR(a)		(0x1230 | (a) << 16)
+#define NIX_AF_TL4X_TOPOLOGY(a)		(0x1280 | (a) << 16)
 #define NIX_AF_MDQX_SCHEDULE(a)		(0x1400 | (a) << 16)
+#define NIX_AF_MDQX_SHAPE(a)		(0x1410 | (a) << 16)
+#define NIX_AF_MDQX_CIR(a)		(0x1420 | (a) << 16)
+#define NIX_AF_MDQX_PIR(a)		(0x1430 | (a) << 16)
 #define NIX_AF_MDQX_PARENT(a)		(0x1480 | (a) << 16)
 #define NIX_AF_TL3_TL2X_LINKX_CFG(a, b)	(0x1700 | (a) << 16 | (b) << 3)
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 044cc211424e..42c49249f4e7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -19,6 +19,7 @@ 
 
 #include "cn10k.h"
 #include "otx2_common.h"
+#include "qos.h"
 
 /* Egress rate limiting definitions */
 #define MAX_BURST_EXPONENT		0x0FULL
@@ -147,8 +148,8 @@  static void otx2_get_egress_rate_cfg(u64 maxrate, u32 *exp,
 	}
 }
 
-static u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
-				       u64 maxrate, u32 burst)
+u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
+				u64 maxrate, u32 burst)
 {
 	u32 burst_exp, burst_mantissa;
 	u32 exp, mantissa, div_exp;
@@ -1127,6 +1128,8 @@  int otx2_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 	switch (type) {
 	case TC_SETUP_BLOCK:
 		return otx2_setup_tc_block(netdev, type_data);
+	case TC_SETUP_QDISC_HTB:
+		return otx2_setup_tc_htb(netdev, type_data);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
new file mode 100644
index 000000000000..22c5b6a2871a
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
@@ -0,0 +1,1460 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell RVU Ethernet driver
+ *
+ * Copyright (C) 2023 Marvell.
+ *
+ */
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/bitfield.h>
+
+#include "otx2_common.h"
+#include "cn10k.h"
+#include "qos.h"
+
+#define OTX2_QOS_QID_INNER		0xFFFFU
+#define OTX2_QOS_QID_NONE		0xFFFEU
+#define OTX2_QOS_ROOT_CLASSID		0xFFFFFFFF
+#define OTX2_QOS_CLASS_NONE		0
+#define OTX2_QOS_DEFAULT_PRIO		0xF
+#define OTX2_QOS_INVALID_SQ		0xFFFF
+
+/* Egress rate limiting definitions */
+#define MAX_BURST_EXPONENT		0x0FULL
+#define MAX_BURST_MANTISSA		0xFFULL
+#define MAX_BURST_SIZE			130816ULL
+#define MAX_RATE_DIVIDER_EXPONENT	12ULL
+#define MAX_RATE_EXPONENT		0x0FULL
+#define MAX_RATE_MANTISSA		0xFFULL
+
+/* Bitfields in NIX_TLX_PIR register */
+#define TLX_RATE_MANTISSA		GENMASK_ULL(8, 1)
+#define TLX_RATE_EXPONENT		GENMASK_ULL(12, 9)
+#define TLX_RATE_DIVIDER_EXPONENT	GENMASK_ULL(16, 13)
+#define TLX_BURST_MANTISSA		GENMASK_ULL(36, 29)
+#define TLX_BURST_EXPONENT		GENMASK_ULL(40, 37)
+
+static int otx2_qos_update_tx_netdev_queues(struct otx2_nic *pfvf)
+{
+	int tx_queues, qos_txqs, err;
+	struct otx2_hw *hw = &pfvf->hw;
+
+	qos_txqs = bitmap_weight(pfvf->qos.qos_sq_bmap,
+				 OTX2_QOS_MAX_LEAF_NODES);
+
+	tx_queues = hw->tx_queues + qos_txqs;
+
+	err = netif_set_real_num_tx_queues(pfvf->netdev, tx_queues);
+	if (err) {
+		netdev_err(pfvf->netdev,
+			   "Failed to set no of Tx queues: %d\n", tx_queues);
+		return err;
+	}
+
+	return 0;
+}
+
+static u64 otx2_qos_convert_rate(u64 rate)
+{
+	u64 converted_rate;
+
+	/* convert bytes per second to Mbps */
+	converted_rate = rate * 8;
+	converted_rate = max_t(u64, converted_rate / 1000000, 1);
+
+	return converted_rate;
+}
+
+static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf,
+				  struct otx2_qos_node *node,
+				  struct nix_txschq_config *cfg)
+{
+	struct otx2_hw *hw = &pfvf->hw;
+	int num_regs = 0;
+	u64 maxrate;
+	u8 level;
+
+	level = node->level;
+
+	/* program txschq registers */
+	if (level == NIX_TXSCH_LVL_SMQ) {
+		cfg->reg[num_regs] = NIX_AF_SMQX_CFG(node->schq);
+		cfg->regval[num_regs] = ((u64)pfvf->tx_max_pktlen << 8) |
+					OTX2_MIN_MTU;
+		cfg->regval[num_regs] |= (0x20ULL << 51) | (0x80ULL << 39) |
+					 (0x2ULL << 36);
+		num_regs++;
+
+		/* configure parent txschq */
+		cfg->reg[num_regs] = NIX_AF_MDQX_PARENT(node->schq);
+		cfg->regval[num_regs] = node->parent->schq << 16;
+		num_regs++;
+
+		/* configure prio/quantum */
+		if (node->qid == OTX2_QOS_QID_NONE) {
+			cfg->reg[num_regs] = NIX_AF_MDQX_SCHEDULE(node->schq);
+			cfg->regval[num_regs] =  node->prio << 24 |
+						 mtu_to_dwrr_weight(pfvf,
+								    pfvf->tx_max_pktlen);
+			num_regs++;
+			goto txschq_cfg_out;
+		}
+
+		/* configure prio */
+		cfg->reg[num_regs] = NIX_AF_MDQX_SCHEDULE(node->schq);
+		cfg->regval[num_regs] = (node->schq -
+					 node->parent->prio_anchor) << 24;
+		num_regs++;
+
+		/* configure PIR */
+		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
+
+		cfg->reg[num_regs] = NIX_AF_MDQX_PIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
+		num_regs++;
+
+		/* configure CIR */
+		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
+			/* Don't configure CIR when both CIR+PIR not supported
+			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
+			 */
+			goto txschq_cfg_out;
+		}
+
+		cfg->reg[num_regs] = NIX_AF_MDQX_CIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
+		num_regs++;
+	} else if (level == NIX_TXSCH_LVL_TL4) {
+		/* configure parent txschq */
+		cfg->reg[num_regs] = NIX_AF_TL4X_PARENT(node->schq);
+		cfg->regval[num_regs] = node->parent->schq << 16;
+		num_regs++;
+
+		/* return if not htb node */
+		if (node->qid == OTX2_QOS_QID_NONE) {
+			cfg->reg[num_regs] = NIX_AF_TL4X_SCHEDULE(node->schq);
+			cfg->regval[num_regs] =  node->prio << 24 |
+						 mtu_to_dwrr_weight(pfvf,
+								    pfvf->tx_max_pktlen);
+			num_regs++;
+			goto txschq_cfg_out;
+		}
+
+		/* configure priority */
+		cfg->reg[num_regs] = NIX_AF_TL4X_SCHEDULE(node->schq);
+		cfg->regval[num_regs] = (node->schq -
+					 node->parent->prio_anchor) << 24;
+		num_regs++;
+
+		/* configure PIR */
+		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
+		cfg->reg[num_regs] = NIX_AF_TL4X_PIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
+		num_regs++;
+
+		/* configure CIR */
+		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
+			/* Don't configure CIR when both CIR+PIR not supported
+			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
+			 */
+			goto txschq_cfg_out;
+		}
+
+		cfg->reg[num_regs] = NIX_AF_TL4X_CIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
+		num_regs++;
+	} else if (level == NIX_TXSCH_LVL_TL3) {
+		/* configure parent txschq */
+		cfg->reg[num_regs] = NIX_AF_TL3X_PARENT(node->schq);
+		cfg->regval[num_regs] = node->parent->schq << 16;
+		num_regs++;
+
+		/* configure link cfg */
+		if (level == pfvf->qos.link_cfg_lvl) {
+			cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
+			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
+			num_regs++;
+		}
+
+		/* return if not htb node */
+		if (node->qid == OTX2_QOS_QID_NONE) {
+			cfg->reg[num_regs] = NIX_AF_TL3X_SCHEDULE(node->schq);
+			cfg->regval[num_regs] =  node->prio << 24 |
+						 mtu_to_dwrr_weight(pfvf,
+								    pfvf->tx_max_pktlen);
+			num_regs++;
+			goto txschq_cfg_out;
+		}
+
+		/* configure priority */
+		cfg->reg[num_regs] = NIX_AF_TL3X_SCHEDULE(node->schq);
+		cfg->regval[num_regs] = (node->schq -
+					 node->parent->prio_anchor) << 24;
+		num_regs++;
+
+		/* configure PIR */
+		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
+		cfg->reg[num_regs] = NIX_AF_TL3X_PIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
+		num_regs++;
+
+		/* configure CIR */
+		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
+			/* Don't configure CIR when both CIR+PIR not supported
+			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
+			 */
+			goto txschq_cfg_out;
+		}
+
+		cfg->reg[num_regs] = NIX_AF_TL3X_CIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
+		num_regs++;
+	} else if (level == NIX_TXSCH_LVL_TL2) {
+		/* configure parent txschq */
+		cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq);
+		cfg->regval[num_regs] = hw->tx_link << 16;
+		num_regs++;
+
+		/* configure link cfg */
+		if (level == pfvf->qos.link_cfg_lvl) {
+			cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
+			cfg->regval[num_regs] = BIT_ULL(13) | BIT_ULL(12);
+			num_regs++;
+		}
+
+		/* return if not htb node */
+		if (node->qid == OTX2_QOS_QID_NONE) {
+			cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
+			cfg->regval[num_regs] =  node->prio << 24 |
+						 mtu_to_dwrr_weight(pfvf,
+								    pfvf->tx_max_pktlen);
+			num_regs++;
+			goto txschq_cfg_out;
+		}
+
+		/* check if node is root */
+		if (node->qid == OTX2_QOS_QID_INNER && !node->parent) {
+			cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
+			cfg->regval[num_regs] =  TXSCH_TL1_DFLT_RR_PRIO << 24 |
+						 mtu_to_dwrr_weight(pfvf,
+								    pfvf->tx_max_pktlen);
+			num_regs++;
+			goto txschq_cfg_out;
+		}
+
+		/* configure priority/quantum */
+		cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq);
+		cfg->regval[num_regs] = (node->schq -
+					 node->parent->prio_anchor) << 24;
+		num_regs++;
+
+		/* configure PIR */
+		maxrate = (node->rate > node->ceil) ? node->rate : node->ceil;
+		cfg->reg[num_regs] = NIX_AF_TL2X_PIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, maxrate, 65536);
+		num_regs++;
+
+		/* configure CIR */
+		if (!test_bit(QOS_CIR_PIR_SUPPORT, &pfvf->hw.cap_flag)) {
+			/* Don't configure CIR when both CIR+PIR not supported
+			 * On 96xx, CIR + PIR + RED_ALGO=STALL causes deadlock
+			 */
+			goto txschq_cfg_out;
+		}
+
+		cfg->reg[num_regs] = NIX_AF_TL2X_CIR(node->schq);
+		cfg->regval[num_regs] =
+			otx2_get_txschq_rate_regval(pfvf, node->rate, 65536);
+		num_regs++;
+	}
+
+txschq_cfg_out:
+	cfg->num_regs = num_regs;
+}
+
+static int otx2_qos_txschq_set_parent_topology(struct otx2_nic *pfvf,
+					       struct otx2_qos_node *parent)
+{
+	struct mbox *mbox = &pfvf->mbox;
+	struct nix_txschq_config *cfg;
+	int rc;
+
+	if (parent->level == NIX_TXSCH_LVL_MDQ)
+		return 0;
+
+	mutex_lock(&mbox->lock);
+
+	cfg = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
+	if (!cfg) {
+		mutex_unlock(&mbox->lock);
+		return -ENOMEM;
+	}
+
+	cfg->lvl = parent->level;
+
+	if (parent->level == NIX_TXSCH_LVL_TL4)
+		cfg->reg[0] = NIX_AF_TL4X_TOPOLOGY(parent->schq);
+	else if (parent->level == NIX_TXSCH_LVL_TL3)
+		cfg->reg[0] = NIX_AF_TL3X_TOPOLOGY(parent->schq);
+	else if (parent->level == NIX_TXSCH_LVL_TL2)
+		cfg->reg[0] = NIX_AF_TL2X_TOPOLOGY(parent->schq);
+	else if (parent->level == NIX_TXSCH_LVL_TL1)
+		cfg->reg[0] = NIX_AF_TL1X_TOPOLOGY(parent->schq);
+
+	cfg->regval[0] = (u64)parent->prio_anchor << 32;
+	if (parent->level == NIX_TXSCH_LVL_TL1)
+		cfg->regval[0] |= (u64)TXSCH_TL1_DFLT_RR_PRIO << 1;
+
+	cfg->num_regs++;
+
+	rc = otx2_sync_mbox_msg(&pfvf->mbox);
+
+	mutex_unlock(&mbox->lock);
+
+	return rc;
+}
+
+static void otx2_qos_free_hw_node_schq(struct otx2_nic *pfvf,
+				       struct otx2_qos_node *parent)
+{
+	struct otx2_qos_node *node;
+
+	list_for_each_entry_reverse(node, &parent->child_schq_list, list)
+		otx2_txschq_free_one(pfvf, node->level, node->schq);
+}
+
+static void otx2_qos_free_hw_node(struct otx2_nic *pfvf,
+				  struct otx2_qos_node *parent)
+{
+	struct otx2_qos_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
+		otx2_qos_free_hw_node(pfvf, node);
+		otx2_qos_free_hw_node_schq(pfvf, node);
+		otx2_txschq_free_one(pfvf, node->level, node->schq);
+	}
+}
+
+static void otx2_qos_free_hw_cfg(struct otx2_nic *pfvf,
+				 struct otx2_qos_node *node)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+
+	/* free child node hw mappings */
+	otx2_qos_free_hw_node(pfvf, node);
+	otx2_qos_free_hw_node_schq(pfvf, node);
+
+	/* free node hw mappings */
+	otx2_txschq_free_one(pfvf, node->level, node->schq);
+
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static void otx2_qos_sw_node_delete(struct otx2_nic *pfvf,
+				    struct otx2_qos_node *node)
+{
+	hash_del(&node->hlist);
+
+	if (node->qid != OTX2_QOS_QID_INNER && node->qid != OTX2_QOS_QID_NONE) {
+		__clear_bit(node->qid, pfvf->qos.qos_sq_bmap);
+		otx2_qos_update_tx_netdev_queues(pfvf);
+	}
+
+	list_del(&node->list);
+	kfree(node);
+}
+
+static void otx2_qos_free_sw_node_schq(struct otx2_nic *pfvf,
+				       struct otx2_qos_node *parent)
+{
+	struct otx2_qos_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, &parent->child_schq_list, list) {
+		list_del(&node->list);
+		kfree(node);
+	}
+}
+
+static void __otx2_qos_free_sw_node(struct otx2_nic *pfvf,
+				    struct otx2_qos_node *parent)
+{
+	struct otx2_qos_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, &parent->child_list, list) {
+		__otx2_qos_free_sw_node(pfvf, node);
+		otx2_qos_free_sw_node_schq(pfvf, node);
+		otx2_qos_sw_node_delete(pfvf, node);
+	}
+}
+
+static void otx2_qos_free_sw_node(struct otx2_nic *pfvf,
+				  struct otx2_qos_node *node)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+
+	__otx2_qos_free_sw_node(pfvf, node);
+	otx2_qos_free_sw_node_schq(pfvf, node);
+	otx2_qos_sw_node_delete(pfvf, node);
+
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static void otx2_qos_destroy_node(struct otx2_nic *pfvf,
+				  struct otx2_qos_node *node)
+{
+	otx2_qos_free_hw_cfg(pfvf, node);
+	otx2_qos_free_sw_node(pfvf, node);
+}
+
+static void otx2_qos_fill_cfg_schq(struct otx2_qos_node *parent,
+				   struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *node;
+
+	list_for_each_entry(node, &parent->child_schq_list, list)
+		cfg->schq[node->level]++;
+}
+
+static void otx2_qos_fill_cfg_tl(struct otx2_qos_node *parent,
+				 struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *node;
+
+	list_for_each_entry(node, &parent->child_list, list) {
+		otx2_qos_fill_cfg_tl(node, cfg);
+		cfg->schq_contig[node->level]++;
+		otx2_qos_fill_cfg_schq(node, cfg);
+	}
+}
+
+static void otx2_qos_prepare_txschq_cfg(struct otx2_nic *pfvf,
+					struct otx2_qos_node *parent,
+					struct otx2_qos_cfg *cfg)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+	otx2_qos_fill_cfg_tl(parent, cfg);
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static void otx2_qos_read_txschq_cfg_schq(struct otx2_qos_node *parent,
+					  struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *node;
+	int cnt;
+
+	list_for_each_entry(node, &parent->child_schq_list, list) {
+		cnt = cfg->dwrr_node_pos[node->level];
+		cfg->schq_list[node->level][cnt] = node->schq;
+		cfg->schq[node->level]++;
+		cfg->dwrr_node_pos[node->level]++;
+	}
+}
+
+static void otx2_qos_read_txschq_cfg_tl(struct otx2_qos_node *parent,
+					struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *node;
+	int cnt;
+
+	list_for_each_entry(node, &parent->child_list, list) {
+		otx2_qos_read_txschq_cfg_tl(node, cfg);
+		cnt = cfg->static_node_pos[node->level];
+		cfg->schq_contig_list[node->level][cnt] = node->schq;
+		cfg->schq_contig[node->level]++;
+		cfg->static_node_pos[node->level]++;
+		otx2_qos_read_txschq_cfg_schq(node, cfg);
+	}
+}
+
+static void otx2_qos_read_txschq_cfg(struct otx2_nic *pfvf,
+				     struct otx2_qos_node *node,
+				     struct otx2_qos_cfg *cfg)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+	otx2_qos_read_txschq_cfg_tl(node, cfg);
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static struct otx2_qos_node *
+otx2_qos_alloc_root(struct otx2_nic *pfvf)
+{
+	struct otx2_qos_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	node->parent = NULL;
+	if (!is_otx2_vf(pfvf->pcifunc))
+		node->level = NIX_TXSCH_LVL_TL1;
+	else
+		node->level = NIX_TXSCH_LVL_TL2;
+
+	node->qid = OTX2_QOS_QID_INNER;
+	node->classid = OTX2_QOS_ROOT_CLASSID;
+
+	hash_add(pfvf->qos.qos_hlist, &node->hlist, node->classid);
+	list_add_tail(&node->list, &pfvf->qos.qos_tree);
+	INIT_LIST_HEAD(&node->child_list);
+	INIT_LIST_HEAD(&node->child_schq_list);
+
+	return node;
+}
+
+static int otx2_qos_add_child_node(struct otx2_qos_node *parent,
+				   struct otx2_qos_node *node)
+{
+	struct list_head *head = &parent->child_list;
+	struct otx2_qos_node *tmp_node;
+	struct list_head *tmp;
+
+	for (tmp = head->next; tmp != head; tmp = tmp->next) {
+		tmp_node = list_entry(tmp, struct otx2_qos_node, list);
+		if (tmp_node->prio == node->prio)
+			return -EEXIST;
+		if (tmp_node->prio > node->prio) {
+			list_add_tail(&node->list, tmp);
+			return 0;
+		}
+	}
+
+	list_add_tail(&node->list, head);
+	return 0;
+}
+
+static int otx2_qos_alloc_txschq_node(struct otx2_nic *pfvf,
+				      struct otx2_qos_node *node)
+{
+	struct otx2_qos_node *txschq_node, *parent, *tmp;
+	int lvl;
+
+	parent = node;
+	for (lvl = node->level - 1; lvl >= NIX_TXSCH_LVL_MDQ; lvl--) {
+		txschq_node = kzalloc(sizeof(*txschq_node), GFP_KERNEL);
+		if (!txschq_node)
+			goto err_out;
+
+		txschq_node->parent = parent;
+		txschq_node->level = lvl;
+		txschq_node->classid = OTX2_QOS_CLASS_NONE;
+		txschq_node->qid = OTX2_QOS_QID_NONE;
+		txschq_node->rate = 0;
+		txschq_node->ceil = 0;
+		txschq_node->prio = 0;
+
+		mutex_lock(&pfvf->qos.qos_lock);
+		list_add_tail(&txschq_node->list, &node->child_schq_list);
+		mutex_unlock(&pfvf->qos.qos_lock);
+
+		INIT_LIST_HEAD(&txschq_node->child_list);
+		INIT_LIST_HEAD(&txschq_node->child_schq_list);
+		parent = txschq_node;
+	}
+
+	return 0;
+
+err_out:
+	list_for_each_entry_safe(txschq_node, tmp, &node->child_schq_list,
+				 list) {
+		list_del(&txschq_node->list);
+		kfree(txschq_node);
+	}
+	return -ENOMEM;
+}
+
+static struct otx2_qos_node *
+otx2_qos_sw_create_leaf_node(struct otx2_nic *pfvf,
+			     struct otx2_qos_node *parent,
+			     u16 classid, u32 prio, u64 rate, u64 ceil,
+			     u16 qid)
+{
+	struct otx2_qos_node *node;
+	int err;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	node->parent = parent;
+	node->level = parent->level - 1;
+	node->classid = classid;
+	node->qid = qid;
+	node->rate = otx2_qos_convert_rate(rate);
+	node->ceil = otx2_qos_convert_rate(ceil);
+	node->prio = prio;
+
+	__set_bit(qid, pfvf->qos.qos_sq_bmap);
+
+	hash_add(pfvf->qos.qos_hlist, &node->hlist, classid);
+
+	mutex_lock(&pfvf->qos.qos_lock);
+	err = otx2_qos_add_child_node(parent, node);
+	if (err) {
+		mutex_unlock(&pfvf->qos.qos_lock);
+		return ERR_PTR(err);
+	}
+	mutex_unlock(&pfvf->qos.qos_lock);
+
+	INIT_LIST_HEAD(&node->child_list);
+	INIT_LIST_HEAD(&node->child_schq_list);
+
+	err = otx2_qos_alloc_txschq_node(pfvf, node);
+	if (err) {
+		otx2_qos_sw_node_delete(pfvf, node);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return node;
+}
+
+static struct otx2_qos_node *
+otx2_sw_node_find(struct otx2_nic *pfvf, u32 classid)
+{
+	struct otx2_qos_node *node = NULL;
+
+	hash_for_each_possible(pfvf->qos.qos_hlist, node, hlist, classid) {
+		if (node->classid == classid)
+			break;
+	}
+
+	return node;
+}
+
+int otx2_get_txq_by_classid(struct otx2_nic *pfvf, u16 classid)
+{
+	struct otx2_qos_node *node;
+	u16 qid;
+	int res;
+
+	node = otx2_sw_node_find(pfvf, classid);
+	if (IS_ERR(node)) {
+		res = -ENOENT;
+		goto out;
+	}
+	qid = READ_ONCE(node->qid);
+	if (qid == OTX2_QOS_QID_INNER) {
+		res = -EINVAL;
+		goto out;
+	}
+	res = pfvf->hw.tx_queues + qid;
+out:
+	return res;
+}
+
+static int
+otx2_qos_txschq_config(struct otx2_nic *pfvf, struct otx2_qos_node *node)
+{
+	struct mbox *mbox = &pfvf->mbox;
+	struct nix_txschq_config *req;
+	int rc;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_nix_txschq_cfg(&pfvf->mbox);
+	if (!req) {
+		mutex_unlock(&mbox->lock);
+		return -ENOMEM;
+	}
+
+	req->lvl = node->level;
+	__otx2_qos_txschq_cfg(pfvf, node, req);
+
+	rc = otx2_sync_mbox_msg(&pfvf->mbox);
+
+	mutex_unlock(&mbox->lock);
+
+	return rc;
+}
+
+static int otx2_qos_txschq_alloc(struct otx2_nic *pfvf,
+				 struct otx2_qos_cfg *cfg)
+{
+	struct nix_txsch_alloc_req *req;
+	struct nix_txsch_alloc_rsp *rsp;
+	struct mbox *mbox = &pfvf->mbox;
+	int lvl, rc, schq;
+
+	mutex_lock(&mbox->lock);
+	req = otx2_mbox_alloc_msg_nix_txsch_alloc(&pfvf->mbox);
+	if (!req) {
+		mutex_unlock(&mbox->lock);
+		return -ENOMEM;
+	}
+
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
+		req->schq[lvl] = cfg->schq[lvl];
+		req->schq_contig[lvl] = cfg->schq_contig[lvl];
+	}
+
+	rc = otx2_sync_mbox_msg(&pfvf->mbox);
+	if (rc) {
+		mutex_unlock(&mbox->lock);
+		return rc;
+	}
+
+	rsp = (struct nix_txsch_alloc_rsp *)
+	      otx2_mbox_get_rsp(&pfvf->mbox.mbox, 0, &req->hdr);
+
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
+		for (schq = 0; schq < rsp->schq_contig[lvl]; schq++) {
+			cfg->schq_contig_list[lvl][schq] =
+				rsp->schq_contig_list[lvl][schq];
+		}
+	}
+
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
+		for (schq = 0; schq < rsp->schq[lvl]; schq++) {
+			cfg->schq_list[lvl][schq] =
+				rsp->schq_list[lvl][schq];
+		}
+	}
+
+	pfvf->qos.link_cfg_lvl = rsp->link_cfg_lvl;
+
+	mutex_unlock(&mbox->lock);
+
+	return rc;
+}
+
+static void otx2_qos_txschq_fill_cfg_schq(struct otx2_nic *pfvf,
+					  struct otx2_qos_node *node,
+					  struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *tmp;
+	int cnt;
+
+	list_for_each_entry(tmp, &node->child_schq_list, list) {
+		cnt = cfg->dwrr_node_pos[tmp->level];
+		tmp->schq = cfg->schq_list[tmp->level][cnt];
+		cfg->dwrr_node_pos[tmp->level]++;
+	}
+}
+
+static void otx2_qos_txschq_fill_cfg_tl(struct otx2_nic *pfvf,
+					struct otx2_qos_node *node,
+					struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *tmp;
+	int cnt;
+
+	list_for_each_entry(tmp, &node->child_list, list) {
+		otx2_qos_txschq_fill_cfg_tl(pfvf, tmp, cfg);
+		cnt = cfg->static_node_pos[tmp->level];
+		tmp->schq = cfg->schq_contig_list[tmp->level][cnt];
+		if (cnt == 0)
+			node->prio_anchor = tmp->schq;
+		cfg->static_node_pos[tmp->level]++;
+		otx2_qos_txschq_fill_cfg_schq(pfvf, tmp, cfg);
+	}
+}
+
+static void otx2_qos_txschq_fill_cfg(struct otx2_nic *pfvf,
+				     struct otx2_qos_node *node,
+				     struct otx2_qos_cfg *cfg)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+	otx2_qos_txschq_fill_cfg_tl(pfvf, node, cfg);
+	otx2_qos_txschq_fill_cfg_schq(pfvf, node, cfg);
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static int otx2_qos_txschq_push_cfg_schq(struct otx2_nic *pfvf,
+					 struct otx2_qos_node *node,
+					 struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *tmp;
+	int ret = 0;
+
+	list_for_each_entry(tmp, &node->child_schq_list, list) {
+		ret = otx2_qos_txschq_config(pfvf, tmp);
+		if (ret)
+			return -EIO;
+		ret = otx2_qos_txschq_set_parent_topology(pfvf, tmp->parent);
+		if (ret)
+			return -EIO;
+	}
+
+	return 0;
+}
+
+static int otx2_qos_txschq_push_cfg_tl(struct otx2_nic *pfvf,
+				       struct otx2_qos_node *node,
+				       struct otx2_qos_cfg *cfg)
+{
+	struct otx2_qos_node *tmp;
+	int ret;
+
+	list_for_each_entry(tmp, &node->child_list, list) {
+		ret = otx2_qos_txschq_push_cfg_tl(pfvf, tmp, cfg);
+		if (ret)
+			return -EIO;
+		ret = otx2_qos_txschq_config(pfvf, tmp);
+		if (ret)
+			return -EIO;
+		ret = otx2_qos_txschq_push_cfg_schq(pfvf, tmp, cfg);
+		if (ret)
+			return -EIO;
+	}
+
+	ret = otx2_qos_txschq_set_parent_topology(pfvf, node);
+	if (ret)
+		return -EIO;
+
+	return 0;
+}
+
+static int otx2_qos_txschq_push_cfg(struct otx2_nic *pfvf,
+				    struct otx2_qos_node *node,
+				    struct otx2_qos_cfg *cfg)
+{
+	int ret;
+
+	mutex_lock(&pfvf->qos.qos_lock);
+	ret = otx2_qos_txschq_push_cfg_tl(pfvf, node, cfg);
+	if (ret)
+		goto out;
+	ret = otx2_qos_txschq_push_cfg_schq(pfvf, node, cfg);
+out:
+	mutex_unlock(&pfvf->qos.qos_lock);
+	return ret;
+}
+
+static int otx2_qos_txschq_update_config(struct otx2_nic *pfvf,
+					 struct otx2_qos_node *node,
+					 struct otx2_qos_cfg *cfg)
+{
+	otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
+
+	return otx2_qos_txschq_push_cfg(pfvf, node, cfg);
+}
+
+static int otx2_qos_txschq_update_root_cfg(struct otx2_nic *pfvf,
+					   struct otx2_qos_node *root,
+					   struct otx2_qos_cfg *cfg)
+{
+	root->schq = cfg->schq_list[root->level][0];
+	return otx2_qos_txschq_config(pfvf, root);
+}
+
+static void otx2_qos_free_cfg(struct otx2_nic *pfvf, struct otx2_qos_cfg *cfg)
+{
+	int lvl, idx, schq;
+
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
+		for (idx = 0; idx < cfg->schq[lvl]; idx++) {
+			schq = cfg->schq_list[lvl][idx];
+			otx2_txschq_free_one(pfvf, lvl, schq);
+		}
+	}
+
+	for (lvl = 0; lvl < NIX_TXSCH_LVL_CNT; lvl++) {
+		for (idx = 0; idx < cfg->schq_contig[lvl]; idx++) {
+			schq = cfg->schq_contig_list[lvl][idx];
+			otx2_txschq_free_one(pfvf, lvl, schq);
+		}
+	}
+}
+
+static void otx2_qos_enadis_sq(struct otx2_nic *pfvf,
+			       struct otx2_qos_node *node,
+			       u16 qid)
+{
+	if (pfvf->qos.qid_to_sqmap[qid] != OTX2_QOS_INVALID_SQ)
+		otx2_qos_disable_sq(pfvf, qid);
+
+	pfvf->qos.qid_to_sqmap[qid] = node->schq;
+	otx2_qos_enable_sq(pfvf, qid);
+}
+
+static void otx2_qos_update_smq_schq(struct otx2_nic *pfvf,
+				     struct otx2_qos_node *node,
+				     bool action)
+{
+	struct otx2_qos_node *tmp;
+
+	if (node->qid == OTX2_QOS_QID_INNER)
+		return;
+
+	list_for_each_entry(tmp, &node->child_schq_list, list) {
+		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
+			if (action == QOS_SMQ_FLUSH)
+				otx2_smq_flush(pfvf, tmp->schq);
+			else
+				otx2_qos_enadis_sq(pfvf, tmp, node->qid);
+		}
+	}
+}
+
+static void __otx2_qos_update_smq(struct otx2_nic *pfvf,
+				  struct otx2_qos_node *node,
+				  bool action)
+{
+	struct otx2_qos_node *tmp;
+
+	list_for_each_entry(tmp, &node->child_list, list) {
+		__otx2_qos_update_smq(pfvf, tmp, action);
+		if (tmp->qid == OTX2_QOS_QID_INNER)
+			continue;
+		if (tmp->level == NIX_TXSCH_LVL_MDQ) {
+			if (action == QOS_SMQ_FLUSH)
+				otx2_smq_flush(pfvf, tmp->schq);
+			else
+				otx2_qos_enadis_sq(pfvf, tmp, tmp->qid);
+		} else {
+			otx2_qos_update_smq_schq(pfvf, tmp, action);
+		}
+	}
+}
+
+static void otx2_qos_update_smq(struct otx2_nic *pfvf,
+				struct otx2_qos_node *node,
+				bool action)
+{
+	mutex_lock(&pfvf->qos.qos_lock);
+	__otx2_qos_update_smq(pfvf, node, action);
+	otx2_qos_update_smq_schq(pfvf, node, action);
+	mutex_unlock(&pfvf->qos.qos_lock);
+}
+
+static int otx2_qos_push_txschq_cfg(struct otx2_nic *pfvf,
+				    struct otx2_qos_node *node,
+				    struct otx2_qos_cfg *cfg)
+{
+	int ret = 0;
+
+	ret = otx2_qos_txschq_alloc(pfvf, cfg);
+	if (ret)
+		return -ENOSPC;
+
+	if (!(pfvf->netdev->flags & IFF_UP)) {
+		otx2_qos_txschq_fill_cfg(pfvf, node, cfg);
+		return 0;
+	}
+
+	ret = otx2_qos_txschq_update_config(pfvf, node, cfg);
+	if (ret) {
+		otx2_qos_free_cfg(pfvf, cfg);
+		return -EIO;
+	}
+
+	otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
+
+	return 0;
+}
+
+static int otx2_qos_update_tree(struct otx2_nic *pfvf,
+				struct otx2_qos_node *node,
+				struct otx2_qos_cfg *cfg)
+{
+	otx2_qos_prepare_txschq_cfg(pfvf, node->parent, cfg);
+	return otx2_qos_push_txschq_cfg(pfvf, node->parent, cfg);
+}
+
+static int otx2_qos_root_add(struct otx2_nic *pfvf, u16 htb_maj_id, u16 htb_defcls,
+			     struct netlink_ext_ack *extack)
+{
+	struct otx2_qos_cfg *new_cfg;
+	struct otx2_qos_node *root;
+	int err;
+
+	netdev_dbg(pfvf->netdev,
+		   "TC_HTB_CREATE: handle=0x%x defcls=0x%x\n",
+		   htb_maj_id, htb_defcls);
+
+	INIT_LIST_HEAD(&pfvf->qos.qos_tree);
+	mutex_init(&pfvf->qos.qos_lock);
+
+	root = otx2_qos_alloc_root(pfvf);
+	if (IS_ERR(root)) {
+		mutex_destroy(&pfvf->qos.qos_lock);
+		err = PTR_ERR(root);
+		return err;
+	}
+
+	/* allocate txschq queue */
+	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
+	if (!new_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		mutex_destroy(&pfvf->qos.qos_lock);
+		return -ENOMEM;
+	}
+	/* allocate htb root node */
+	new_cfg->schq[root->level] = 1;
+	err = otx2_qos_txschq_alloc(pfvf, new_cfg);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Error allocating txschq");
+		goto free_root_node;
+	}
+
+	if (!(pfvf->netdev->flags & IFF_UP) ||
+	    root->level == NIX_TXSCH_LVL_TL1) {
+		root->schq = new_cfg->schq_list[root->level][0];
+		goto out;
+	}
+
+	/* update the txschq configuration in hw */
+	err = otx2_qos_txschq_update_root_cfg(pfvf, root, new_cfg);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Error updating txschq configuration");
+		goto txschq_free;
+	}
+
+out:
+	WRITE_ONCE(pfvf->qos.defcls, htb_defcls);
+	smp_store_release(&pfvf->qos.maj_id, htb_maj_id); /* barrier */
+	kfree(new_cfg);
+	return 0;
+
+txschq_free:
+	otx2_qos_free_cfg(pfvf, new_cfg);
+free_root_node:
+	kfree(new_cfg);
+	otx2_qos_sw_node_delete(pfvf, root);
+	mutex_destroy(&pfvf->qos.qos_lock);
+	return err;
+}
+
+static int otx2_qos_root_destroy(struct otx2_nic *pfvf)
+{
+	struct otx2_qos_node *root;
+
+	netdev_dbg(pfvf->netdev, "TC_HTB_DESTROY\n");
+
+	/* find root node */
+	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
+	if (IS_ERR(root))
+		return -ENOENT;
+
+	/* free the hw mappings */
+	otx2_qos_destroy_node(pfvf, root);
+	mutex_destroy(&pfvf->qos.qos_lock);
+
+	return 0;
+}
+
+static int otx2_qos_validate_configuration(struct otx2_qos_node *parent,
+					   struct netlink_ext_ack *extack,
+					   struct otx2_nic *pfvf,
+					   u64 prio)
+{
+	if (test_bit(prio, parent->prio_bmap)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Static priority child with same priority exists");
+		return -EEXIST;
+	}
+
+	if (prio == TXSCH_TL1_DFLT_RR_PRIO) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Priority is reserved for Round Robin");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int otx2_qos_leaf_alloc_queue(struct otx2_nic *pfvf, u16 classid,
+				     u32 parent_classid, u64 rate, u64 ceil,
+				     u64 prio, struct netlink_ext_ack *extack)
+{
+	struct otx2_qos_cfg *old_cfg, *new_cfg;
+	struct otx2_qos_node *node, *parent;
+	int qid, ret, err;
+
+	netdev_dbg(pfvf->netdev,
+		   "TC_HTB_LEAF_ALLOC_QUEUE: classid=0x%x parent_classid=0x%x rate=%lld ceil=%lld prio=%lld\n",
+		   classid, parent_classid, rate, ceil, prio);
+
+	if (prio > OTX2_QOS_MAX_PRIO) {
+		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to 7");
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* get parent node */
+	parent = otx2_sw_node_find(pfvf, parent_classid);
+	if (IS_ERR(parent)) {
+		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
+		ret = -ENOENT;
+		goto out;
+	}
+	if (parent->level == NIX_TXSCH_LVL_MDQ) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB qos max levels reached");
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	ret = otx2_qos_validate_configuration(parent, extack, pfvf, prio);
+	if (ret)
+		goto out;
+
+	set_bit(prio, parent->prio_bmap);
+
+	/* read current txschq configuration */
+	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
+	if (!old_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		ret = -ENOMEM;
+		goto out;
+	}
+	otx2_qos_read_txschq_cfg(pfvf, parent, old_cfg);
+
+	/* allocate a new sq */
+	qid = otx2_qos_get_qid(pfvf);
+	if (qid < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Reached max supported QOS SQ's");
+		ret = -ENOMEM;
+		goto free_old_cfg;
+	}
+
+	/* Actual SQ mapping will be updated after SMQ alloc */
+	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
+
+	/* allocate and initialize a new child node */
+	node = otx2_qos_sw_create_leaf_node(pfvf, parent, classid, prio, rate,
+					    ceil, qid);
+	if (IS_ERR(node)) {
+		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf node");
+		ret = PTR_ERR(node);
+		goto free_old_cfg;
+	}
+
+	/* push new txschq config to hw */
+	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
+	if (!new_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		ret = -ENOMEM;
+		goto free_node;
+	}
+	ret = otx2_qos_update_tree(pfvf, node, new_cfg);
+	if (ret) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
+		kfree(new_cfg);
+		otx2_qos_sw_node_delete(pfvf, node);
+		/* restore the old qos tree */
+		err = otx2_qos_txschq_update_config(pfvf, parent, old_cfg);
+		if (err) {
+			netdev_err(pfvf->netdev,
+				   "Failed to restore txcshq configuration");
+			goto free_old_cfg;
+		}
+
+		otx2_qos_update_smq(pfvf, parent, QOS_CFG_SQ);
+		goto free_old_cfg;
+	}
+
+	/* update tx_real_queues */
+	otx2_qos_update_tx_netdev_queues(pfvf);
+
+	/* free new txschq config */
+	kfree(new_cfg);
+
+	/* free old txschq config */
+	otx2_qos_free_cfg(pfvf, old_cfg);
+	kfree(old_cfg);
+
+	return pfvf->hw.tx_queues + qid;
+
+free_node:
+	otx2_qos_sw_node_delete(pfvf, node);
+free_old_cfg:
+	kfree(old_cfg);
+out:
+	return ret;
+}
+
+static int otx2_qos_leaf_to_inner(struct otx2_nic *pfvf, u16 classid,
+				  u16 child_classid, u64 rate, u64 ceil, u64 prio,
+				  struct netlink_ext_ack *extack)
+{
+	struct otx2_qos_cfg *old_cfg, *new_cfg;
+	struct otx2_qos_node *node, *child;
+	int ret, err;
+	u16 qid;
+
+	netdev_dbg(pfvf->netdev,
+		   "TC_HTB_LEAF_TO_INNER classid %04x, child %04x, rate %llu, ceil %llu\n",
+		   classid, child_classid, rate, ceil);
+
+	if (prio > OTX2_QOS_MAX_PRIO) {
+		NL_SET_ERR_MSG_MOD(extack, "Valid priority range 0 to 7");
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* find node related to classid */
+	node = otx2_sw_node_find(pfvf, classid);
+	if (IS_ERR(node)) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
+		ret = -ENOENT;
+		goto out;
+	}
+	/* check max qos txschq level */
+	if (node->level == NIX_TXSCH_LVL_MDQ) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB qos level not supported");
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	set_bit(prio, node->prio_bmap);
+
+	/* store the qid to assign to leaf node */
+	qid = node->qid;
+
+	/* read current txschq configuration */
+	old_cfg = kzalloc(sizeof(*old_cfg), GFP_KERNEL);
+	if (!old_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		ret = -ENOMEM;
+		goto out;
+	}
+	otx2_qos_read_txschq_cfg(pfvf, node, old_cfg);
+
+	/* delete the txschq nodes allocated for this node */
+	otx2_qos_free_sw_node_schq(pfvf, node);
+
+	/* mark this node as htb inner node */
+	node->qid = OTX2_QOS_QID_INNER;
+
+	/* allocate and initialize a new child node */
+	child = otx2_qos_sw_create_leaf_node(pfvf, node, child_classid,
+					     prio, rate, ceil, qid);
+	if (IS_ERR(child)) {
+		NL_SET_ERR_MSG_MOD(extack, "Unable to allocate leaf node");
+		ret = PTR_ERR(child);
+		goto free_old_cfg;
+	}
+
+	/* push new txschq config to hw */
+	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
+	if (!new_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		ret = -ENOMEM;
+		goto free_node;
+	}
+	ret = otx2_qos_update_tree(pfvf, child, new_cfg);
+	if (ret) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
+		kfree(new_cfg);
+		otx2_qos_sw_node_delete(pfvf, child);
+		/* restore the old qos tree */
+		node->qid = qid;
+		err = otx2_qos_alloc_txschq_node(pfvf, node);
+		if (err) {
+			netdev_err(pfvf->netdev,
+				   "Failed to restore old leaf node");
+			goto free_old_cfg;
+		}
+		err = otx2_qos_txschq_update_config(pfvf, node, old_cfg);
+		if (err) {
+			netdev_err(pfvf->netdev,
+				   "Failed to restore txcshq configuration");
+			goto free_old_cfg;
+		}
+		otx2_qos_update_smq(pfvf, node, QOS_CFG_SQ);
+		goto free_old_cfg;
+	}
+
+	/* free new txschq config */
+	kfree(new_cfg);
+
+	/* free old txschq config */
+	otx2_qos_free_cfg(pfvf, old_cfg);
+	kfree(old_cfg);
+
+	return 0;
+
+free_node:
+	otx2_qos_sw_node_delete(pfvf, child);
+free_old_cfg:
+	kfree(old_cfg);
+out:
+	return ret;
+}
+
+static int otx2_qos_leaf_del(struct otx2_nic *pfvf, u16 *classid,
+			     struct netlink_ext_ack *extack)
+{
+	struct otx2_qos_node *node, *parent;
+	u64 prio;
+	u16 qid;
+
+	netdev_dbg(pfvf->netdev, "TC_HTB_LEAF_DEL classid %04x\n", *classid);
+
+	/* find node related to classid */
+	node = otx2_sw_node_find(pfvf, *classid);
+	if (IS_ERR(node)) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
+		return -ENOENT;
+	}
+	parent = node->parent;
+	prio   = node->prio;
+	qid    = node->qid;
+
+	otx2_qos_disable_sq(pfvf, node->qid);
+
+	otx2_qos_destroy_node(pfvf, node);
+	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
+
+	clear_bit(prio, parent->prio_bmap);
+
+	return 0;
+}
+
+static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force,
+				  struct netlink_ext_ack *extack)
+{
+	struct otx2_qos_node *node, *parent;
+	struct otx2_qos_cfg *new_cfg;
+	u64 prio;
+	int err;
+	u16 qid;
+
+	netdev_dbg(pfvf->netdev,
+		   "TC_HTB_LEAF_DEL_LAST classid %04x\n", classid);
+
+	/* find node related to classid */
+	node = otx2_sw_node_find(pfvf, classid);
+	if (IS_ERR(node)) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB node not found");
+		return -ENOENT;
+	}
+
+	/* save qid for use by parent */
+	qid = node->qid;
+	prio = node->prio;
+
+	parent = otx2_sw_node_find(pfvf, node->parent->classid);
+	if (IS_ERR(parent)) {
+		NL_SET_ERR_MSG_MOD(extack, "parent node not found");
+		return -ENOENT;
+	}
+
+	/* destroy the leaf node */
+	otx2_qos_destroy_node(pfvf, node);
+	pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ;
+
+	clear_bit(prio, parent->prio_bmap);
+
+	/* create downstream txschq entries to parent */
+	err = otx2_qos_alloc_txschq_node(pfvf, parent);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB failed to create txsch configuration");
+		return err;
+	}
+	parent->qid = qid;
+	__set_bit(qid, pfvf->qos.qos_sq_bmap);
+
+	/* push new txschq config to hw */
+	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
+	if (!new_cfg) {
+		NL_SET_ERR_MSG_MOD(extack, "Memory allocation error");
+		return -ENOMEM;
+	}
+	/* fill txschq cfg and push txschq cfg to hw */
+	otx2_qos_fill_cfg_schq(parent, new_cfg);
+	err = otx2_qos_push_txschq_cfg(pfvf, parent, new_cfg);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "HTB HW configuration error");
+		kfree(new_cfg);
+		return err;
+	}
+	kfree(new_cfg);
+
+	/* update tx_real_queues */
+	otx2_qos_update_tx_netdev_queues(pfvf);
+
+	return 0;
+}
+
+void otx2_clean_qos_queues(struct otx2_nic *pfvf)
+{
+	struct otx2_qos_node *root;
+
+	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
+	if (IS_ERR(root))
+		return;
+
+	otx2_qos_update_smq(pfvf, root, QOS_SMQ_FLUSH);
+}
+
+void otx2_qos_config_txschq(struct otx2_nic *pfvf)
+{
+	struct otx2_qos_node *root;
+	int err;
+
+	root = otx2_sw_node_find(pfvf, OTX2_QOS_ROOT_CLASSID);
+	if (IS_ERR(root))
+		return;
+
+	err = otx2_qos_txschq_config(pfvf, root);
+	if (err) {
+		netdev_err(pfvf->netdev, "Error update txschq configuration\n");
+		goto root_destroy;
+	}
+
+	err = otx2_qos_txschq_push_cfg_tl(pfvf, root, NULL);
+	if (err) {
+		netdev_err(pfvf->netdev, "Error update txschq configuration\n");
+		goto root_destroy;
+	}
+
+	otx2_qos_update_smq(pfvf, root, QOS_CFG_SQ);
+	return;
+
+root_destroy:
+	netdev_err(pfvf->netdev, "Failed to update Scheduler/Shaping config in Hardware\n");
+	/* Free resources allocated */
+	otx2_qos_root_destroy(pfvf);
+}
+
+int otx2_setup_tc_htb(struct net_device *ndev, struct tc_htb_qopt_offload *htb)
+{
+	struct otx2_nic *pfvf = netdev_priv(ndev);
+	int res;
+
+	switch (htb->command) {
+	case TC_HTB_CREATE:
+		return otx2_qos_root_add(pfvf, htb->parent_classid,
+					 htb->classid, htb->extack);
+	case TC_HTB_DESTROY:
+		return otx2_qos_root_destroy(pfvf);
+	case TC_HTB_LEAF_ALLOC_QUEUE:
+		res = otx2_qos_leaf_alloc_queue(pfvf, htb->classid,
+						htb->parent_classid,
+						htb->rate, htb->ceil,
+						htb->prio, htb->extack);
+		if (res < 0)
+			return res;
+		htb->qid = res;
+		return 0;
+	case TC_HTB_LEAF_TO_INNER:
+		return otx2_qos_leaf_to_inner(pfvf, htb->parent_classid,
+					      htb->classid, htb->rate,
+					      htb->ceil, htb->prio,
+					      htb->extack);
+	case TC_HTB_LEAF_DEL:
+		return otx2_qos_leaf_del(pfvf, &htb->classid, htb->extack);
+	case TC_HTB_LEAF_DEL_LAST:
+	case TC_HTB_LEAF_DEL_LAST_FORCE:
+		return otx2_qos_leaf_del_last(pfvf, htb->classid,
+				htb->command == TC_HTB_LEAF_DEL_LAST_FORCE,
+					      htb->extack);
+	case TC_HTB_LEAF_QUERY_QUEUE:
+		res = otx2_get_txq_by_classid(pfvf, htb->classid);
+		htb->qid = res;
+		return 0;
+	case TC_HTB_NODE_MODIFY:
+		fallthrough;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
index 73a62d092e99..26de1af2aa57 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
@@ -7,13 +7,63 @@ 
 #ifndef OTX2_QOS_H
 #define OTX2_QOS_H
 
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/rhashtable.h>
+
+#define OTX2_QOS_MAX_LVL		4
+#define OTX2_QOS_MAX_PRIO		7
 #define OTX2_QOS_MAX_LEAF_NODES                16
 
-int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq);
-void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq);
+enum qos_smq_operations {
+	QOS_CFG_SQ,
+	QOS_SMQ_FLUSH,
+};
+
+u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic, u64 maxrate, u32 burst);
+
+int otx2_setup_tc_htb(struct net_device *ndev, struct tc_htb_qopt_offload *htb);
+int otx2_qos_get_qid(struct otx2_nic *pfvf);
+void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx);
+int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx);
+void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx);
+
+struct otx2_qos_cfg {
+	u16 schq[NIX_TXSCH_LVL_CNT];
+	u16 schq_contig[NIX_TXSCH_LVL_CNT];
+	int static_node_pos[NIX_TXSCH_LVL_CNT];
+	int dwrr_node_pos[NIX_TXSCH_LVL_CNT];
+	u16 schq_contig_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
+	u16 schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
+};
 
 struct otx2_qos {
-	       u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
-	};
+	DECLARE_HASHTABLE(qos_hlist, order_base_2(OTX2_QOS_MAX_LEAF_NODES));
+	struct mutex qos_lock; /* child list lock */
+	u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
+	struct list_head qos_tree;
+	DECLARE_BITMAP(qos_sq_bmap, OTX2_QOS_MAX_LEAF_NODES);
+	u16 maj_id;
+	u16 defcls;
+	u8  link_cfg_lvl; /* LINKX_CFG CSRs mapped to TL3 or TL2's index ? */
+};
+
+struct otx2_qos_node {
+	struct list_head list; /* list managment */
+	struct list_head child_list;
+	struct list_head child_schq_list;
+	struct hlist_node hlist;
+	DECLARE_BITMAP(prio_bmap, OTX2_QOS_MAX_PRIO + 1);
+	struct otx2_qos_node *parent;	/* parent qos node */
+	u64 rate; /* htb params */
+	u64 ceil;
+	u32 classid;
+	u32 prio;
+	u16 schq; /* hw txschq */
+	u16 qid;
+	u16 prio_anchor;
+	u8 level;
+};
+
 
 #endif
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
index 1c77f024c360..8a1e89668a1b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
@@ -225,7 +225,22 @@  static int otx2_qos_ctx_disable(struct otx2_nic *pfvf, u16 qidx, int aura_id)
 	return otx2_sync_mbox_msg(&pfvf->mbox);
 }
 
-int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
+int otx2_qos_get_qid(struct otx2_nic *pfvf)
+{
+	int qidx;
+
+	qidx = find_first_zero_bit(pfvf->qos.qos_sq_bmap,
+				   pfvf->hw.tc_tx_queues);
+
+	return qidx == pfvf->hw.tc_tx_queues ? -ENOSPC : qidx;
+}
+
+void otx2_qos_free_qid(struct otx2_nic *pfvf, int qidx)
+{
+	clear_bit(qidx, pfvf->qos.qos_sq_bmap);
+}
+
+int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx)
 {
 	struct otx2_hw *hw = &pfvf->hw;
 	int pool_id, sq_idx, err;
@@ -241,7 +256,6 @@  int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
 		goto out;
 
 	pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx);
-	pfvf->qos.qid_to_sqmap[qidx] = smq;
 	err = otx2_sq_init(pfvf, sq_idx, pool_id);
 	if (err)
 		goto out;
@@ -250,7 +264,7 @@  int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
 	return err;
 }
 
-void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq)
+void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx)
 {
 	struct otx2_qset *qset = &pfvf->qset;
 	struct otx2_hw *hw = &pfvf->hw;