diff mbox series

[RFC,net-next,v3,2/2] net/mlx5e: Add per queue netdev-genl stats

Message ID 20240529031628.324117-3-jdamato@fastly.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series mlx5: Add netdev-genl queue stats | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 902 this patch: 902
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 908 this patch: 906
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 906 this patch: 906
netdev/checkpatch warning WARNING: braces {} are not necessary for any arm of this statement WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Joe Damato May 29, 2024, 3:16 a.m. UTC
Add functions to support the netdev-genl per queue stats API.

./cli.py --spec netlink/specs/netdev.yaml \
         --dump qstats-get --json '{"scope": "queue"}'

...snip

 {'ifindex': 7,
  'queue-id': 62,
  'queue-type': 'rx',
  'rx-alloc-fail': 0,
  'rx-bytes': 105965251,
  'rx-packets': 179790},
 {'ifindex': 7,
  'queue-id': 0,
  'queue-type': 'tx',
  'tx-bytes': 9402665,
  'tx-packets': 17551},

...snip

Also tested with the script tools/testing/selftests/drivers/net/stats.py
in several scenarios to ensure stats tallying was correct:

- on boot (default queue counts)
- adjusting queue count up or down (ethtool -L eth0 combined ...)
- adding mqprio TCs

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)

Comments

Jacob Keller May 30, 2024, 9:16 p.m. UTC | #1
On 5/28/2024 8:16 PM, Joe Damato wrote:
> Add functions to support the netdev-genl per queue stats API.
> 
> ./cli.py --spec netlink/specs/netdev.yaml \
>          --dump qstats-get --json '{"scope": "queue"}'
> 
> ...snip
> 
>  {'ifindex': 7,
>   'queue-id': 62,
>   'queue-type': 'rx',
>   'rx-alloc-fail': 0,
>   'rx-bytes': 105965251,
>   'rx-packets': 179790},
>  {'ifindex': 7,
>   'queue-id': 0,
>   'queue-type': 'tx',
>   'tx-bytes': 9402665,
>   'tx-packets': 17551},
> 
> ...snip
> 
> Also tested with the script tools/testing/selftests/drivers/net/stats.py
> in several scenarios to ensure stats tallying was correct:
> 
> - on boot (default queue counts)
> - adjusting queue count up or down (ethtool -L eth0 combined ...)
> - adding mqprio TCs
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Tariq Toukan June 2, 2024, 9:14 a.m. UTC | #2
On 29/05/2024 6:16, Joe Damato wrote:
> Add functions to support the netdev-genl per queue stats API.
> 
> ./cli.py --spec netlink/specs/netdev.yaml \
>           --dump qstats-get --json '{"scope": "queue"}'
> 
> ...snip
> 
>   {'ifindex': 7,
>    'queue-id': 62,
>    'queue-type': 'rx',
>    'rx-alloc-fail': 0,
>    'rx-bytes': 105965251,
>    'rx-packets': 179790},
>   {'ifindex': 7,
>    'queue-id': 0,
>    'queue-type': 'tx',
>    'tx-bytes': 9402665,
>    'tx-packets': 17551},
> 
> ...snip
> 
> Also tested with the script tools/testing/selftests/drivers/net/stats.py
> in several scenarios to ensure stats tallying was correct:
> 
> - on boot (default queue counts)
> - adjusting queue count up or down (ethtool -L eth0 combined ...)
> - adding mqprio TCs

Please test also with interface down.

> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>   .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
>   1 file changed, 132 insertions(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index ce15805ad55a..515c16a88a6c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -39,6 +39,7 @@
>   #include <linux/debugfs.h>
>   #include <linux/if_bridge.h>
>   #include <linux/filter.h>
> +#include <net/netdev_queues.h>
>   #include <net/page_pool/types.h>
>   #include <net/pkt_sched.h>
>   #include <net/xdp_sock_drv.h>
> @@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
>   	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
>   }
>   
> +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
> +				     struct netdev_queue_stats_rx *stats)
> +{
> +	struct mlx5e_priv *priv = netdev_priv(dev);
> +	struct mlx5e_channel_stats *channel_stats;
> +	struct mlx5e_rq_stats *xskrq_stats;
> +	struct mlx5e_rq_stats *rq_stats;
> +
> +	if (mlx5e_is_uplink_rep(priv))
> +		return;
> +
> +	channel_stats = priv->channel_stats[i];
> +	xskrq_stats = &channel_stats->xskrq;
> +	rq_stats = &channel_stats->rq;
> +
> +	stats->packets = rq_stats->packets + xskrq_stats->packets;
> +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
> +	stats->alloc_fail = rq_stats->buff_alloc_err +
> +			    xskrq_stats->buff_alloc_err;
> +}
> +
> +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
> +				     struct netdev_queue_stats_tx *stats)
> +{
> +	struct mlx5e_priv *priv = netdev_priv(dev);
> +	struct mlx5e_channel_stats *channel_stats;
> +	struct mlx5e_sq_stats *sq_stats;
> +	int ch_ix, tc_ix;
> +
> +	mutex_lock(&priv->state_lock);
> +	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
> +	mutex_unlock(&priv->state_lock);
> +
> +	channel_stats = priv->channel_stats[ch_ix];
> +	sq_stats = &channel_stats->sq[tc_ix];
> +
> +	stats->packets = sq_stats->packets;
> +	stats->bytes = sq_stats->bytes;
> +}
> +
> +static void mlx5e_get_base_stats(struct net_device *dev,
> +				 struct netdev_queue_stats_rx *rx,
> +				 struct netdev_queue_stats_tx *tx)
> +{
> +	struct mlx5e_priv *priv = netdev_priv(dev);
> +	int i, j;
> +
> +	if (!mlx5e_is_uplink_rep(priv)) {
> +		rx->packets = 0;
> +		rx->bytes = 0;
> +		rx->alloc_fail = 0;
> +
> +		/* compute stats for deactivated RX queues
> +		 *
> +		 * if priv->channels.num == 0 the device is down, so compute
> +		 * stats for every queue.
> +		 *
> +		 * otherwise, compute only the queues which have been deactivated.
> +		 */
> +		mutex_lock(&priv->state_lock);
> +		if (priv->channels.num == 0)
> +			i = 0;

This is not consistent with the above implementation of 
mlx5e_get_queue_stats_rx(), which always returns the stats even if the 
channel is down.
This way, you'll double count the down channels.

I think you should always start from priv->channels.params.num_channels.

> +		else
> +			i = priv->channels.params.num_channels;
> +		mutex_unlock(&priv->state_lock);

I understand that you're following the guidelines by taking the lock 
here, I just don't think this improves anything... If channels can be 
modified in between calls to mlx5e_get_base_stats / 
mlx5e_get_queue_stats_rx, then wrapping the priv->channels access with a 
lock can help protect each single deref, but not necessarily in giving a 
consistent "screenshot" of the stats.

The rtnl_lock should take care of that, as the driver holds it when 
changing the number of channels and updating the real_num_rx/tx_queues.

This said, I would carefully say you can drop the mutex once following 
the requested changes above.

> +
> +		for (; i < priv->stats_nch; i++) {
> +			struct netdev_queue_stats_rx rx_i = {0};
> +
> +			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
> +
> +			rx->packets += rx_i.packets;
> +			rx->bytes += rx_i.bytes;
> +			rx->alloc_fail += rx_i.alloc_fail;
> +		}
> +
> +		if (priv->rx_ptp_opened) {
> +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
> +
> +			rx->packets += rq_stats->packets;
> +			rx->bytes += rq_stats->bytes;
> +		}
> +	}
> +
> +	tx->packets = 0;
> +	tx->bytes = 0;
> +
> +	mutex_lock(&priv->state_lock);
> +	for (i = 0; i < priv->stats_nch; i++) {
> +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
> +
> +		/* while iterating through all channels [0, stats_nch], there
> +		 * are two cases to handle:
> +		 *
> +		 *  1. the channel is available, so sum only the unavailable TCs
> +		 *     [mlx5e_get_dcb_num_tc, max_opened_tc).
> +		 *
> +		 *  2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
> +		 */

I wonder why not call the local var 'tc'?

> +		if (i < priv->channels.params.num_channels) {
> +			j = mlx5e_get_dcb_num_tc(&priv->channels.params);
> +		} else {
> +			j = 0;
> +		}

Remove parentheses, or use ternary op.

> +
> +		for (; j < priv->max_opened_tc; j++) {
> +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
> +
> +			tx->packets += sq_stats->packets;
> +			tx->bytes += sq_stats->bytes;
> +		}
> +	}
> +	mutex_unlock(&priv->state_lock);
> +

Same comment regarding dropping the mutex.

> +	if (priv->tx_ptp_opened) {
> +		for (j = 0; j < priv->max_opened_tc; j++) {
> +			struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[j];
> +
> +			tx->packets    += sq_stats->packets;
> +			tx->bytes      += sq_stats->bytes;
> +		}
> +	}
> +}
> +
> +static const struct netdev_stat_ops mlx5e_stat_ops = {
> +	.get_queue_stats_rx     = mlx5e_get_queue_stats_rx,
> +	.get_queue_stats_tx     = mlx5e_get_queue_stats_tx,
> +	.get_base_stats         = mlx5e_get_base_stats,
> +};
> +
>   static void mlx5e_build_nic_netdev(struct net_device *netdev)
>   {
>   	struct mlx5e_priv *priv = netdev_priv(netdev);
> @@ -5310,6 +5441,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
>   
>   	netdev->watchdog_timeo    = 15 * HZ;
>   
> +	netdev->stat_ops          = &mlx5e_stat_ops;
>   	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
>   
>   	netdev->vlan_features    |= NETIF_F_SG;
Joe Damato June 2, 2024, 7:22 p.m. UTC | #3
On Sun, Jun 02, 2024 at 12:14:21PM +0300, Tariq Toukan wrote:
> 
> 
> On 29/05/2024 6:16, Joe Damato wrote:
> > Add functions to support the netdev-genl per queue stats API.
> > 
> > ./cli.py --spec netlink/specs/netdev.yaml \
> >           --dump qstats-get --json '{"scope": "queue"}'
> > 
> > ...snip
> > 
> >   {'ifindex': 7,
> >    'queue-id': 62,
> >    'queue-type': 'rx',
> >    'rx-alloc-fail': 0,
> >    'rx-bytes': 105965251,
> >    'rx-packets': 179790},
> >   {'ifindex': 7,
> >    'queue-id': 0,
> >    'queue-type': 'tx',
> >    'tx-bytes': 9402665,
> >    'tx-packets': 17551},
> > 
> > ...snip
> > 
> > Also tested with the script tools/testing/selftests/drivers/net/stats.py
> > in several scenarios to ensure stats tallying was correct:
> > 
> > - on boot (default queue counts)
> > - adjusting queue count up or down (ethtool -L eth0 combined ...)
> > - adding mqprio TCs
> 
> Please test also with interface down.

OK. I'll test with the interface down.

Is there some publicly available Mellanox script I can run to test
all the different cases? That would make this much easier. Maybe
this is something to include in mlnx-tools on github?

The mlnx-tools scripts that includes some python scripts for setting
up QoS doesn't seem to work on my system, and outputs vague error
messages. I have no idea if I'm missing some kernel option, if the
device doesn't support it, or if I need some other dependency
installed.

I have been testing these patches on a:

Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
firmware-version: 16.29.2002 (MT_0000000013)

> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
> >   .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
> >   1 file changed, 132 insertions(+)
> > 
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > index ce15805ad55a..515c16a88a6c 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > @@ -39,6 +39,7 @@
> >   #include <linux/debugfs.h>
> >   #include <linux/if_bridge.h>
> >   #include <linux/filter.h>
> > +#include <net/netdev_queues.h>
> >   #include <net/page_pool/types.h>
> >   #include <net/pkt_sched.h>
> >   #include <net/xdp_sock_drv.h>
> > @@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
> >   	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
> >   }
> > +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
> > +				     struct netdev_queue_stats_rx *stats)
> > +{
> > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > +	struct mlx5e_channel_stats *channel_stats;
> > +	struct mlx5e_rq_stats *xskrq_stats;
> > +	struct mlx5e_rq_stats *rq_stats;
> > +
> > +	if (mlx5e_is_uplink_rep(priv))
> > +		return;
> > +
> > +	channel_stats = priv->channel_stats[i];
> > +	xskrq_stats = &channel_stats->xskrq;
> > +	rq_stats = &channel_stats->rq;
> > +
> > +	stats->packets = rq_stats->packets + xskrq_stats->packets;
> > +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
> > +	stats->alloc_fail = rq_stats->buff_alloc_err +
> > +			    xskrq_stats->buff_alloc_err;
> > +}
> > +
> > +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
> > +				     struct netdev_queue_stats_tx *stats)
> > +{
> > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > +	struct mlx5e_channel_stats *channel_stats;
> > +	struct mlx5e_sq_stats *sq_stats;
> > +	int ch_ix, tc_ix;
> > +
> > +	mutex_lock(&priv->state_lock);
> > +	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
> > +	mutex_unlock(&priv->state_lock);
> > +
> > +	channel_stats = priv->channel_stats[ch_ix];
> > +	sq_stats = &channel_stats->sq[tc_ix];
> > +
> > +	stats->packets = sq_stats->packets;
> > +	stats->bytes = sq_stats->bytes;
> > +}
> > +
> > +static void mlx5e_get_base_stats(struct net_device *dev,
> > +				 struct netdev_queue_stats_rx *rx,
> > +				 struct netdev_queue_stats_tx *tx)
> > +{
> > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > +	int i, j;
> > +
> > +	if (!mlx5e_is_uplink_rep(priv)) {
> > +		rx->packets = 0;
> > +		rx->bytes = 0;
> > +		rx->alloc_fail = 0;
> > +
> > +		/* compute stats for deactivated RX queues
> > +		 *
> > +		 * if priv->channels.num == 0 the device is down, so compute
> > +		 * stats for every queue.
> > +		 *
> > +		 * otherwise, compute only the queues which have been deactivated.
> > +		 */
> > +		mutex_lock(&priv->state_lock);
> > +		if (priv->channels.num == 0)
> > +			i = 0;
> 
> This is not consistent with the above implementation of
> mlx5e_get_queue_stats_rx(), which always returns the stats even if the
> channel is down.
> This way, you'll double count the down channels.
> 
> I think you should always start from priv->channels.params.num_channels.

OK, I'll do that.

> > +		else
> > +			i = priv->channels.params.num_channels;
> > +		mutex_unlock(&priv->state_lock);
> 
> I understand that you're following the guidelines by taking the lock here, I
> just don't think this improves anything... If channels can be modified in
> between calls to mlx5e_get_base_stats / mlx5e_get_queue_stats_rx, then
> wrapping the priv->channels access with a lock can help protect each single
> deref, but not necessarily in giving a consistent "screenshot" of the stats.
> 
> The rtnl_lock should take care of that, as the driver holds it when changing
> the number of channels and updating the real_num_rx/tx_queues.
> 
> This said, I would carefully say you can drop the mutex once following the
> requested changes above.

OK, that makes sense to me.

So then I assume I can drop the mutex in mlx5e_get_queue_stats_tx
above, as well, for the same reasons?

Does this mean then that you are in favor of the implementation for
tx stats provided in this RFC and that I've implemented option 1 as
you described in the previous thread correctly?

> > +
> > +		for (; i < priv->stats_nch; i++) {
> > +			struct netdev_queue_stats_rx rx_i = {0};
> > +
> > +			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
> > +
> > +			rx->packets += rx_i.packets;
> > +			rx->bytes += rx_i.bytes;
> > +			rx->alloc_fail += rx_i.alloc_fail;
> > +		}
> > +
> > +		if (priv->rx_ptp_opened) {
> > +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
> > +
> > +			rx->packets += rq_stats->packets;
> > +			rx->bytes += rq_stats->bytes;
> > +		}
> > +	}
> > +
> > +	tx->packets = 0;
> > +	tx->bytes = 0;
> > +
> > +	mutex_lock(&priv->state_lock);
> > +	for (i = 0; i < priv->stats_nch; i++) {
> > +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
> > +
> > +		/* while iterating through all channels [0, stats_nch], there
> > +		 * are two cases to handle:
> > +		 *
> > +		 *  1. the channel is available, so sum only the unavailable TCs
> > +		 *     [mlx5e_get_dcb_num_tc, max_opened_tc).
> > +		 *
> > +		 *  2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
> > +		 */
> 
> I wonder why not call the local var 'tc'?

OK.

> > +		if (i < priv->channels.params.num_channels) {
> > +			j = mlx5e_get_dcb_num_tc(&priv->channels.params);
> > +		} else {
> > +			j = 0;
> > +		}
> 
> Remove parenthesis, or use ternary op.

I'll remove the parenthesis; I didn't run checkpatch.pl on this RFC
(which catches this), but I should have.

> > +
> > +		for (; j < priv->max_opened_tc; j++) {
> > +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
> > +
> > +			tx->packets += sq_stats->packets;
> > +			tx->bytes += sq_stats->bytes;
> > +		}
> > +	}
> > +	mutex_unlock(&priv->state_lock);
> > +
> 
> Same comment regarding dropping the mutex.

OK.
Tariq Toukan June 3, 2024, 11:11 a.m. UTC | #4
On 02/06/2024 22:22, Joe Damato wrote:
> On Sun, Jun 02, 2024 at 12:14:21PM +0300, Tariq Toukan wrote:
>>
>>
>> On 29/05/2024 6:16, Joe Damato wrote:
>>> Add functions to support the netdev-genl per queue stats API.
>>>
>>> ./cli.py --spec netlink/specs/netdev.yaml \
>>>            --dump qstats-get --json '{"scope": "queue"}'
>>>
>>> ...snip
>>>
>>>    {'ifindex': 7,
>>>     'queue-id': 62,
>>>     'queue-type': 'rx',
>>>     'rx-alloc-fail': 0,
>>>     'rx-bytes': 105965251,
>>>     'rx-packets': 179790},
>>>    {'ifindex': 7,
>>>     'queue-id': 0,
>>>     'queue-type': 'tx',
>>>     'tx-bytes': 9402665,
>>>     'tx-packets': 17551},
>>>
>>> ...snip
>>>
>>> Also tested with the script tools/testing/selftests/drivers/net/stats.py
>>> in several scenarios to ensure stats tallying was correct:
>>>
>>> - on boot (default queue counts)
>>> - adjusting queue count up or down (ethtool -L eth0 combined ...)
>>> - adding mqprio TCs
>>
>> Please test also with interface down.
> 
> OK. I'll test with the interface down.
> 
> Is there some publicly available Mellanox script I can run to test
> all the different cases? That would make this much easier. Maybe
> this is something to include in mlnx-tools on github?
> 

You're testing some new functionality. We don't have something for it.


> The mlnx-tools scripts that includes some python scripts for setting
> up QoS doesn't seem to work on my system, and outputs vague error
> messages. I have no idea if I'm missing some kernel option, if the
> device doesn't support it, or if I need some other dependency
> installed.
> 

Can you share the command you use, and the output?

> I have been testing these patches on a:
> 
> Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
> firmware-version: 16.29.2002 (MT_0000000013)
> 
>>>
>>> Signed-off-by: Joe Damato <jdamato@fastly.com>
>>> ---
>>>    .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
>>>    1 file changed, 132 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> index ce15805ad55a..515c16a88a6c 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> @@ -39,6 +39,7 @@
>>>    #include <linux/debugfs.h>
>>>    #include <linux/if_bridge.h>
>>>    #include <linux/filter.h>
>>> +#include <net/netdev_queues.h>
>>>    #include <net/page_pool/types.h>
>>>    #include <net/pkt_sched.h>
>>>    #include <net/xdp_sock_drv.h>
>>> @@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
>>>    	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
>>>    }
>>> +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_rx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct mlx5e_channel_stats *channel_stats;
>>> +	struct mlx5e_rq_stats *xskrq_stats;
>>> +	struct mlx5e_rq_stats *rq_stats;
>>> +
>>> +	if (mlx5e_is_uplink_rep(priv))
>>> +		return;
>>> +
>>> +	channel_stats = priv->channel_stats[i];
>>> +	xskrq_stats = &channel_stats->xskrq;
>>> +	rq_stats = &channel_stats->rq;
>>> +
>>> +	stats->packets = rq_stats->packets + xskrq_stats->packets;
>>> +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
>>> +	stats->alloc_fail = rq_stats->buff_alloc_err +
>>> +			    xskrq_stats->buff_alloc_err;
>>> +}
>>> +
>>> +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_tx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct mlx5e_channel_stats *channel_stats;
>>> +	struct mlx5e_sq_stats *sq_stats;
>>> +	int ch_ix, tc_ix;
>>> +
>>> +	mutex_lock(&priv->state_lock);
>>> +	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
>>> +	mutex_unlock(&priv->state_lock);
>>> +
>>> +	channel_stats = priv->channel_stats[ch_ix];
>>> +	sq_stats = &channel_stats->sq[tc_ix];
>>> +
>>> +	stats->packets = sq_stats->packets;
>>> +	stats->bytes = sq_stats->bytes;
>>> +}
>>> +
>>> +static void mlx5e_get_base_stats(struct net_device *dev,
>>> +				 struct netdev_queue_stats_rx *rx,
>>> +				 struct netdev_queue_stats_tx *tx)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	int i, j;
>>> +
>>> +	if (!mlx5e_is_uplink_rep(priv)) {
>>> +		rx->packets = 0;
>>> +		rx->bytes = 0;
>>> +		rx->alloc_fail = 0;
>>> +
>>> +		/* compute stats for deactivated RX queues
>>> +		 *
>>> +		 * if priv->channels.num == 0 the device is down, so compute
>>> +		 * stats for every queue.
>>> +		 *
>>> +		 * otherwise, compute only the queues which have been deactivated.
>>> +		 */
>>> +		mutex_lock(&priv->state_lock);
>>> +		if (priv->channels.num == 0)
>>> +			i = 0;
>>
>> This is not consistent with the above implementation of
>> mlx5e_get_queue_stats_rx(), which always returns the stats even if the
>> channel is down.
>> This way, you'll double count the down channels.
>>
>> I think you should always start from priv->channels.params.num_channels.
> 
> OK, I'll do that.
> 
>>> +		else
>>> +			i = priv->channels.params.num_channels;
>>> +		mutex_unlock(&priv->state_lock);
>>
>> I understand that you're following the guidelines by taking the lock here, I
>> just don't think this improves anything... If channels can be modified in
>> between calls to mlx5e_get_base_stats / mlx5e_get_queue_stats_rx, then
>> wrapping the priv->channels access with a lock can help protect each single
>> deref, but not necessarily in giving a consistent "screenshot" of the stats.
>>
>> The rtnl_lock should take care of that, as the driver holds it when changing
>> the number of channels and updating the real_num_rx/tx_queues.
>>
>> This said, I would carefully say you can drop the mutex once following the
>> requested changes above.

I still don't really like this design, so I gave some more thought on 
this...

I think we should come up with a new mapping array under priv, that maps 
i (from real_num_tx_queues) to the matching sq_stats struct.
This array would be maintained in the channels open/close functions, 
similarly to priv->txq2sq.

Then, we would not calculate the mapping per call, but just get the 
proper pointer from the array. This eases the handling of htb and ptp 
queues, which were missed in your txq_ix_to_chtc_ix().

This handles mapped SQs.

Now, regarding unmapped ones, they must be handled in the "base" 
function call.
We'd still need to access channels->params, to:
1. read params.num_channels to iterate until priv->stats_nch, and
2. read mlx5e_get_dcb_num_tc(params) to iterate until priv->max_opened_tc.

I think we can live with this without holding the mutex, given that this 
runs under the rtnl lock.
We can add ASSERT_RTNL() to verify the assumption.


> 
> OK, that makes sense to me.
> 
> So then I assume I can drop the mutex in mlx5e_get_queue_stats_tx
> above, as well, for the same reasons?
> 
> Does this mean then that you are in favor of the implementation for
> tx stats provided in this RFC and that I've implemented option 1 as
> you described in the previous thread correctly?
> 

Yes, but I wasn't happy enough with the design.
Thanks for your contribution.

>>> +
>>> +		for (; i < priv->stats_nch; i++) {
>>> +			struct netdev_queue_stats_rx rx_i = {0};
>>> +
>>> +			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
>>> +
>>> +			rx->packets += rx_i.packets;
>>> +			rx->bytes += rx_i.bytes;
>>> +			rx->alloc_fail += rx_i.alloc_fail;
>>> +		}
>>> +
>>> +		if (priv->rx_ptp_opened) {
>>> +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
>>> +
>>> +			rx->packets += rq_stats->packets;
>>> +			rx->bytes += rq_stats->bytes;
>>> +		}
>>> +	}
>>> +
>>> +	tx->packets = 0;
>>> +	tx->bytes = 0;
>>> +
>>> +	mutex_lock(&priv->state_lock);
>>> +	for (i = 0; i < priv->stats_nch; i++) {
>>> +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +
>>> +		/* while iterating through all channels [0, stats_nch], there
>>> +		 * are two cases to handle:
>>> +		 *
>>> +		 *  1. the channel is available, so sum only the unavailable TCs
>>> +		 *     [mlx5e_get_dcb_num_tc, max_opened_tc).
>>> +		 *
>>> +		 *  2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
>>> +		 */
>>
>> I wonder why not call the local var 'tc'?
> 
> OK.
> 
>>> +		if (i < priv->channels.params.num_channels) {
>>> +			j = mlx5e_get_dcb_num_tc(&priv->channels.params);
>>> +		} else {
>>> +			j = 0;
>>> +		}
>>
>> Remove parenthesis, or use ternary op.
> 
> I'll remove the parenthesis; I didn't run checkpatch.pl on this RFC
> (which catches this), but I should have.
> 
>>> +
>>> +		for (; j < priv->max_opened_tc; j++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
>>> +
>>> +			tx->packets += sq_stats->packets;
>>> +			tx->bytes += sq_stats->bytes;
>>> +		}
>>> +	}
>>> +	mutex_unlock(&priv->state_lock);
>>> +
>>
>> Same comment regarding dropping the mutex.
> 
> OK.
Joe Damato June 3, 2024, 6:11 p.m. UTC | #5
On Mon, Jun 03, 2024 at 02:11:14PM +0300, Tariq Toukan wrote:
> 
> 
> On 02/06/2024 22:22, Joe Damato wrote:
> > On Sun, Jun 02, 2024 at 12:14:21PM +0300, Tariq Toukan wrote:
> > > 
> > > 
> > > On 29/05/2024 6:16, Joe Damato wrote:
> > > > Add functions to support the netdev-genl per queue stats API.
> > > > 
> > > > ./cli.py --spec netlink/specs/netdev.yaml \
> > > >            --dump qstats-get --json '{"scope": "queue"}'
> > > > 
> > > > ...snip
> > > > 
> > > >    {'ifindex': 7,
> > > >     'queue-id': 62,
> > > >     'queue-type': 'rx',
> > > >     'rx-alloc-fail': 0,
> > > >     'rx-bytes': 105965251,
> > > >     'rx-packets': 179790},
> > > >    {'ifindex': 7,
> > > >     'queue-id': 0,
> > > >     'queue-type': 'tx',
> > > >     'tx-bytes': 9402665,
> > > >     'tx-packets': 17551},
> > > > 
> > > > ...snip
> > > > 
> > > > Also tested with the script tools/testing/selftests/drivers/net/stats.py
> > > > in several scenarios to ensure stats tallying was correct:
> > > > 
> > > > - on boot (default queue counts)
> > > > - adjusting queue count up or down (ethtool -L eth0 combined ...)
> > > > - adding mqprio TCs
> > > 
> > > Please test also with interface down.
> > 
> > OK. I'll test with the interface down.
> > 
> > Is there some publicly available Mellanox script I can run to test
> > all the different cases? That would make this much easier. Maybe
> > this is something to include in mlnx-tools on github?
> > 
> 
> You're testing some new functionality. We don't have something for it.
> 
> 
> > The mlnx-tools scripts that includes some python scripts for setting
> > up QoS doesn't seem to work on my system, and outputs vague error
> > messages. I have no idea if I'm missing some kernel option, if the
> > device doesn't support it, or if I need some other dependency
> > installed.
> > 
> 
> Can you share the command you use, and the output?

Sure:

jdamato@test:~/mlnx-tools/python$ python --version
Python 3.12.3

jdamato@test:~/mlnx-tools/python$ ./mlnx_qos -i vlan401
Priority trust state is not supported on your system
Buffers commands are not supported on your system
Rate limit is not supported on your system!
ETS features are not supported on your system

jdamato@test:~/mlnx-tools/python$ echo $?
1

This is mlnx-tools at SHA 641718b13f71 ("Merge pull request #87 from
ahlabenadam/no_remove_folder")

You can feel free to follow up with me about this off-list if you
like.

> > I have been testing these patches on a:
> > 
> > Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
> > firmware-version: 16.29.2002 (MT_0000000013)
> > 
> > > > 
> > > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > > ---
> > > >    .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
> > > >    1 file changed, 132 insertions(+)
> > > > 
> > > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > index ce15805ad55a..515c16a88a6c 100644
> > > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > @@ -39,6 +39,7 @@
> > > >    #include <linux/debugfs.h>
> > > >    #include <linux/if_bridge.h>
> > > >    #include <linux/filter.h>
> > > > +#include <net/netdev_queues.h>
> > > >    #include <net/page_pool/types.h>
> > > >    #include <net/pkt_sched.h>
> > > >    #include <net/xdp_sock_drv.h>
> > > > @@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
> > > >    	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
> > > >    }
> > > > +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
> > > > +				     struct netdev_queue_stats_rx *stats)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	struct mlx5e_channel_stats *channel_stats;
> > > > +	struct mlx5e_rq_stats *xskrq_stats;
> > > > +	struct mlx5e_rq_stats *rq_stats;
> > > > +
> > > > +	if (mlx5e_is_uplink_rep(priv))
> > > > +		return;
> > > > +
> > > > +	channel_stats = priv->channel_stats[i];
> > > > +	xskrq_stats = &channel_stats->xskrq;
> > > > +	rq_stats = &channel_stats->rq;
> > > > +
> > > > +	stats->packets = rq_stats->packets + xskrq_stats->packets;
> > > > +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
> > > > +	stats->alloc_fail = rq_stats->buff_alloc_err +
> > > > +			    xskrq_stats->buff_alloc_err;
> > > > +}
> > > > +
> > > > +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
> > > > +				     struct netdev_queue_stats_tx *stats)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	struct mlx5e_channel_stats *channel_stats;
> > > > +	struct mlx5e_sq_stats *sq_stats;
> > > > +	int ch_ix, tc_ix;
> > > > +
> > > > +	mutex_lock(&priv->state_lock);
> > > > +	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
> > > > +	mutex_unlock(&priv->state_lock);
> > > > +
> > > > +	channel_stats = priv->channel_stats[ch_ix];
> > > > +	sq_stats = &channel_stats->sq[tc_ix];
> > > > +
> > > > +	stats->packets = sq_stats->packets;
> > > > +	stats->bytes = sq_stats->bytes;
> > > > +}
> > > > +
> > > > +static void mlx5e_get_base_stats(struct net_device *dev,
> > > > +				 struct netdev_queue_stats_rx *rx,
> > > > +				 struct netdev_queue_stats_tx *tx)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	int i, j;
> > > > +
> > > > +	if (!mlx5e_is_uplink_rep(priv)) {
> > > > +		rx->packets = 0;
> > > > +		rx->bytes = 0;
> > > > +		rx->alloc_fail = 0;
> > > > +
> > > > +		/* compute stats for deactivated RX queues
> > > > +		 *
> > > > +		 * if priv->channels.num == 0 the device is down, so compute
> > > > +		 * stats for every queue.
> > > > +		 *
> > > > +		 * otherwise, compute only the queues which have been deactivated.
> > > > +		 */
> > > > +		mutex_lock(&priv->state_lock);
> > > > +		if (priv->channels.num == 0)
> > > > +			i = 0;
> > > 
> > > This is not consistent with the above implementation of
> > > mlx5e_get_queue_stats_rx(), which always returns the stats even if the
> > > channel is down.
> > > This way, you'll double count the down channels.
> > > 
> > > I think you should always start from priv->channels.params.num_channels.
> > 
> > OK, I'll do that.
> > 
> > > > +		else
> > > > +			i = priv->channels.params.num_channels;
> > > > +		mutex_unlock(&priv->state_lock);
> > > 
> > > I understand that you're following the guidelines by taking the lock here, I
> > > just don't think this improves anything... If channels can be modified in
> > > between calls to mlx5e_get_base_stats / mlx5e_get_queue_stats_rx, then
> > > wrapping the priv->channels access with a lock can help protect each single
> > > deref, but not necessarily in giving a consistent "screenshot" of the stats.
> > > 
> > > The rtnl_lock should take care of that, as the driver holds it when changing
> > > the number of channels and updating the real_num_rx/tx_queues.
> > > 
> > > This said, I would carefully say you can drop the mutex once following the
> > > requested changes above.
> 
> I still don't really like this design, so I gave some more thought on
> this...

Thanks, again, for your careful thoughts and review on this. I do
appreciate it and this functionality will be extremely useful for me
(and I suspect many others).

> I think we should come up with a new mapping array under priv, that maps i
> (from real_num_tx_queues) to the matching sq_stats struct.
> This array would be maintained in the channels open/close functions,
> similarly to priv->txq2sq.
> 
> Then, we would not calculate the mapping per call, but just get the proper
> pointer from the array. This eases the handling of htb and ptp queues, which
> were missed in your txq_ix_to_chtc_ix().
> 
> This handles mapped SQs.

OK, the above makes sense. I'll give that a try.

> Now, regarding unmapped ones, they must be handled in the "base" function
> call.
> We'd still need to access channels->params, to:
> 1. read params.num_channels to iterate until priv->stats_nch, and
> 2. read mlx5e_get_dcb_num_tc(params) to iterate until priv->max_opened_tc.
> 
> I think we can live with this without holding the mutex, given that this
> runs under the rtnl lock.
> We can add ASSERT_RTNL() to verify the assumption.

OK. I'll try the above and propose an rfc v4.

> 
> > 
> > OK, that makes sense to me.
> > 
> > So then I assume I can drop the mutex in mlx5e_get_queue_stats_tx
> > above, as well, for the same reasons?
> > 
> > Does this mean then that you are in favor of the implementation for
> > tx stats provided in this RFC and that I've implemented option 1 as
> > you described in the previous thread correctly?
> > 
> 
> Yes, but I wasn't happy enough with the design.
> Thanks for your contribution.

Thanks for your review and patience.
Joe Damato June 3, 2024, 7:22 p.m. UTC | #6
On Mon, Jun 03, 2024 at 02:11:14PM +0300, Tariq Toukan wrote:
> 
> 
> On 02/06/2024 22:22, Joe Damato wrote:
> > On Sun, Jun 02, 2024 at 12:14:21PM +0300, Tariq Toukan wrote:
> > > 
> > > 
> > > On 29/05/2024 6:16, Joe Damato wrote:
> > > > Add functions to support the netdev-genl per queue stats API.
> > > > 
> > > > ./cli.py --spec netlink/specs/netdev.yaml \
> > > >            --dump qstats-get --json '{"scope": "queue"}'
> > > > 
> > > > ...snip
> > > > 
> > > >    {'ifindex': 7,
> > > >     'queue-id': 62,
> > > >     'queue-type': 'rx',
> > > >     'rx-alloc-fail': 0,
> > > >     'rx-bytes': 105965251,
> > > >     'rx-packets': 179790},
> > > >    {'ifindex': 7,
> > > >     'queue-id': 0,
> > > >     'queue-type': 'tx',
> > > >     'tx-bytes': 9402665,
> > > >     'tx-packets': 17551},
> > > > 
> > > > ...snip
> > > > 
> > > > Also tested with the script tools/testing/selftests/drivers/net/stats.py
> > > > in several scenarios to ensure stats tallying was correct:
> > > > 
> > > > - on boot (default queue counts)
> > > > - adjusting queue count up or down (ethtool -L eth0 combined ...)
> > > > - adding mqprio TCs
> > > 
> > > Please test also with interface down.
> > 
> > OK. I'll test with the interface down.
> > 
> > Is there some publicly available Mellanox script I can run to test
> > all the different cases? That would make this much easier. Maybe
> > this is something to include in mlnx-tools on github?
> > 
> 
> You're testing some new functionality. We don't have something for it.
> 
> 
> > The mlnx-tools scripts that includes some python scripts for setting
> > up QoS doesn't seem to work on my system, and outputs vague error
> > messages. I have no idea if I'm missing some kernel option, if the
> > device doesn't support it, or if I need some other dependency
> > installed.
> > 
> 
> Can you share the command you use, and the output?
> 
> > I have been testing these patches on a:
> > 
> > Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
> > firmware-version: 16.29.2002 (MT_0000000013)
> > 
> > > > 
> > > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > > ---
> > > >    .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
> > > >    1 file changed, 132 insertions(+)
> > > > 
> > > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > index ce15805ad55a..515c16a88a6c 100644
> > > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > > > @@ -39,6 +39,7 @@
> > > >    #include <linux/debugfs.h>
> > > >    #include <linux/if_bridge.h>
> > > >    #include <linux/filter.h>
> > > > +#include <net/netdev_queues.h>
> > > >    #include <net/page_pool/types.h>
> > > >    #include <net/pkt_sched.h>
> > > >    #include <net/xdp_sock_drv.h>
> > > > @@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
> > > >    	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
> > > >    }
> > > > +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
> > > > +				     struct netdev_queue_stats_rx *stats)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	struct mlx5e_channel_stats *channel_stats;
> > > > +	struct mlx5e_rq_stats *xskrq_stats;
> > > > +	struct mlx5e_rq_stats *rq_stats;
> > > > +
> > > > +	if (mlx5e_is_uplink_rep(priv))
> > > > +		return;
> > > > +
> > > > +	channel_stats = priv->channel_stats[i];
> > > > +	xskrq_stats = &channel_stats->xskrq;
> > > > +	rq_stats = &channel_stats->rq;
> > > > +
> > > > +	stats->packets = rq_stats->packets + xskrq_stats->packets;
> > > > +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
> > > > +	stats->alloc_fail = rq_stats->buff_alloc_err +
> > > > +			    xskrq_stats->buff_alloc_err;
> > > > +}
> > > > +
> > > > +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
> > > > +				     struct netdev_queue_stats_tx *stats)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	struct mlx5e_channel_stats *channel_stats;
> > > > +	struct mlx5e_sq_stats *sq_stats;
> > > > +	int ch_ix, tc_ix;
> > > > +
> > > > +	mutex_lock(&priv->state_lock);
> > > > +	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
> > > > +	mutex_unlock(&priv->state_lock);
> > > > +
> > > > +	channel_stats = priv->channel_stats[ch_ix];
> > > > +	sq_stats = &channel_stats->sq[tc_ix];
> > > > +
> > > > +	stats->packets = sq_stats->packets;
> > > > +	stats->bytes = sq_stats->bytes;
> > > > +}
> > > > +
> > > > +static void mlx5e_get_base_stats(struct net_device *dev,
> > > > +				 struct netdev_queue_stats_rx *rx,
> > > > +				 struct netdev_queue_stats_tx *tx)
> > > > +{
> > > > +	struct mlx5e_priv *priv = netdev_priv(dev);
> > > > +	int i, j;
> > > > +
> > > > +	if (!mlx5e_is_uplink_rep(priv)) {
> > > > +		rx->packets = 0;
> > > > +		rx->bytes = 0;
> > > > +		rx->alloc_fail = 0;
> > > > +
> > > > +		/* compute stats for deactivated RX queues
> > > > +		 *
> > > > +		 * if priv->channels.num == 0 the device is down, so compute
> > > > +		 * stats for every queue.
> > > > +		 *
> > > > +		 * otherwise, compute only the queues which have been deactivated.
> > > > +		 */
> > > > +		mutex_lock(&priv->state_lock);
> > > > +		if (priv->channels.num == 0)
> > > > +			i = 0;
> > > 
> > > This is not consistent with the above implementation of
> > > mlx5e_get_queue_stats_rx(), which always returns the stats even if the
> > > channel is down.
> > > This way, you'll double count the down channels.
> > > 
> > > I think you should always start from priv->channels.params.num_channels.
> > 
> > OK, I'll do that.
> > 
> > > > +		else
> > > > +			i = priv->channels.params.num_channels;
> > > > +		mutex_unlock(&priv->state_lock);
> > > 
> > > I understand that you're following the guidelines by taking the lock here, I
> > > just don't think this improves anything... If channels can be modified in
> > > between calls to mlx5e_get_base_stats / mlx5e_get_queue_stats_rx, then
> > > wrapping the priv->channels access with a lock can help protect each single
> > > deref, but not necessarily in giving a consistent "screenshot" of the stats.
> > > 
> > > The rtnl_lock should take care of that, as the driver holds it when changing
> > > the number of channels and updating the real_num_rx/tx_queues.
> > > 
> > > This said, I would carefully say you can drop the mutex once following the
> > > requested changes above.
> 
> I still don't really like this design, so I gave some more thought on
> this...
> 
> I think we should come up with a new mapping array under priv, that maps i
> (from real_num_tx_queues) to the matching sq_stats struct.
> This array would be maintained in the channels open/close functions,
> similarly to priv->txq2sq.
> 
> Then, we would not calculate the mapping per call, but just get the proper
> pointer from the array. This eases the handling of htb and ptp queues, which
> were missed in your txq_ix_to_chtc_ix().

Maybe I am just getting way off track here, but I noticed this in
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c in
set_pflag_tx_port_ts:

  if (!err)
    priv->tx_ptp_opened = true;

but what if enable is false, meaning the user is disabling ptp via
ethtool?

In that case, shouldn't this be:

  if (!err)
    priv->tx_ptp_opened = enabled;

Because it seems like the user could pass 0 to disable ptp, and with
the current code then trigger mlx5e_close_channels which would call
mlx5e_ptp_close, but priv->tx_ptp_opened might still be true even
though ptp has been closed. Just a guess as I really don't know much
about this code at all, but it seems that the MLX5E_PFLAG_TX_PORT_TS
will be set in set_pflag_tx_port_ts based on enable, which then
affects how mlx5e_open_channels behaves.

Likewise in mlx5e_ptp_close_queues, maybe
rx_ptp_opened and tx_ptp_opened should be set to false there?

It seems like the user could get the driver into an inconsistent
state by enabling then disabling ptp, but maybe I'm just reading
this all wrong.

Is that a bug? If so, I can submit:

1. ethtool tx_ptp_opened = false
2. rx_ptp_opened = tx_ptp_opened = false in mlx5e_ptp_close_queues

to net as a Fixes ?

I am asking about this from the perspective of stats, because in the
qos stuff, the txq2sq_stats mapping is created or removed (set to
NULL) in mlx5e_activate_qos_sq and mlx5e_deactivate_qos_sq /
mlx5e_qos_deactivate_queues, respectively.

This means that priv->txq2sq_stats could be a valid sq_stats or
invalid for deactivated TCs and qos. It seems like ptp should work
the same way, but I have no idea.

If ptp did work the same way, then in base the code would check if
ptp was disabled and obtain the stats from it, if there are any from
it being previously enabled.

That seems like more consistent behavior, but sorry if
I'm totally off on all of this.

> This handles mapped SQs.
> 
> Now, regarding unmapped ones, they must be handled in the "base" function
> call.
> We'd still need to access channels->params, to:
> 1. read params.num_channels to iterate until priv->stats_nch, and
> 2. read mlx5e_get_dcb_num_tc(params) to iterate until priv->max_opened_tc.
> 
> I think we can live with this without holding the mutex, given that this
> runs under the rtnl lock.
> We can add ASSERT_RTNL() to verify the assumption.
> 
> 
> > 
> > OK, that makes sense to me.
> > 
> > So then I assume I can drop the mutex in mlx5e_get_queue_stats_tx
> > above, as well, for the same reasons?
> > 
> > Does this mean then that you are in favor of the implementation for
> > tx stats provided in this RFC and that I've implemented option 1 as
> > you described in the previous thread correctly?
> > 
> 
> Yes, but I wasn't happy enough with the design.
> Thanks for your contribution.
> 
> > > > +
> > > > +		for (; i < priv->stats_nch; i++) {
> > > > +			struct netdev_queue_stats_rx rx_i = {0};
> > > > +
> > > > +			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
> > > > +
> > > > +			rx->packets += rx_i.packets;
> > > > +			rx->bytes += rx_i.bytes;
> > > > +			rx->alloc_fail += rx_i.alloc_fail;
> > > > +		}
> > > > +
> > > > +		if (priv->rx_ptp_opened) {
> > > > +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
> > > > +
> > > > +			rx->packets += rq_stats->packets;
> > > > +			rx->bytes += rq_stats->bytes;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	tx->packets = 0;
> > > > +	tx->bytes = 0;
> > > > +
> > > > +	mutex_lock(&priv->state_lock);
> > > > +	for (i = 0; i < priv->stats_nch; i++) {
> > > > +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
> > > > +
> > > > +		/* while iterating through all channels [0, stats_nch], there
> > > > +		 * are two cases to handle:
> > > > +		 *
> > > > +		 *  1. the channel is available, so sum only the unavailable TCs
> > > > +		 *     [mlx5e_get_dcb_num_tc, max_opened_tc).
> > > > +		 *
> > > > +		 *  2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
> > > > +		 */
> > > 
> > > I wonder why not call the local var 'tc'?
> > 
> > OK.
> > 
> > > > +		if (i < priv->channels.params.num_channels) {
> > > > +			j = mlx5e_get_dcb_num_tc(&priv->channels.params);
> > > > +		} else {
> > > > +			j = 0;
> > > > +		}
> > > 
> > > Remove parenthesis, or use ternary op.
> > 
> > I'll remove the parenthesis; I didn't run checkpatch.pl on this RFC
> > (which catches this), but I should have.
> > 
> > > > +
> > > > +		for (; j < priv->max_opened_tc; j++) {
> > > > +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
> > > > +
> > > > +			tx->packets += sq_stats->packets;
> > > > +			tx->bytes += sq_stats->bytes;
> > > > +		}
> > > > +	}
> > > > +	mutex_unlock(&priv->state_lock);
> > > > +
> > > 
> > > Same comment regarding dropping the mutex.
> > 
> > OK.
>
Tariq Toukan June 3, 2024, 9:36 p.m. UTC | #7
On 03/06/2024 22:22, Joe Damato wrote:
> On Mon, Jun 03, 2024 at 02:11:14PM +0300, Tariq Toukan wrote:
>>

...

>>
>> I still don't really like this design, so I gave some more thought on
>> this...
>>
>> I think we should come up with a new mapping array under priv, that maps i
>> (from real_num_tx_queues) to the matching sq_stats struct.
>> This array would be maintained in the channels open/close functions,
>> similarly to priv->txq2sq.
>>
>> Then, we would not calculate the mapping per call, but just get the proper
>> pointer from the array. This eases the handling of htb and ptp queues, which
>> were missed in your txq_ix_to_chtc_ix().
> 
> Maybe I am just getting way off track here, but I noticed this in
> drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c in
> set_pflag_tx_port_ts:
> 
>    if (!err)
>      priv->tx_ptp_opened = true;
> 
> but what if enable is false, meaning the user is disabling ptp via
> ethtool?
> 

This bool field under priv acts as a flag, means: ptp was ever opened.
It's the same idea as max_opened_tc, but with boolean instead of numeric.

> In that case, shouldn't this be:
> 
>    if (!err)
>      priv->tx_ptp_opened = enabled;
> 
> Because it seems like the user could pass 0 to disable ptp, and with
> the current code then trigger mlx5e_close_channels which would call
> mlx5e_ptp_close, but priv->tx_ptp_opened might still be true even
> though ptp has been closed. Just a guess as I really don't know much
> about this code at all, but it seems that the MLX5E_PFLAG_TX_PORT_TS
> will be set in set_pflag_tx_port_ts based on enable, which then
> affects how mlx5e_open_channels behaves.
> 
> Likewise in mlx5e_ptp_close_queues, maybe
> rx_ptp_opened and tx_ptp_opened should be set to false there?
> 
> It seems like the user could get the driver into an inconsistent
> state by enabling then disabling ptp, but maybe I'm just reading
> this all wrong.
> 
> Is that a bug? If so, I can submit:
> 
> 1. ethtool tx_ptp_opened = false
> 2. rx_ptp_opened = tx_ptp_opened = false in mlx5e_ptp_close_queues
> 
> to net as a Fixes ?
> 
> I am asking about this from the perspective of stats, because in the
> qos stuff, the txq2sq_stats mapping is created or removed (set to
> NULL) in mlx5e_activate_qos_sq and mlx5e_deactivate_qos_sq /
> mlx5e_qos_deactivate_queues, respectively.
> 
> This means that priv->txq2sq_stats could be a valid sq_stats or
> invalid for deactivated TCs and qos. It seems like ptp should work
> the same way, but I have no idea.
> 
> If ptp did work the same way, then in base the code would check if
> ptp was disabled and obtain the stats from it, if there are any from
> it being previously enabled.
> 
> That seems like more consistent behavior, but sorry if
> I'm totally off on all of this.
>
Joe Damato June 3, 2024, 9:50 p.m. UTC | #8
On Tue, Jun 04, 2024 at 12:36:16AM +0300, Tariq Toukan wrote:
> 
> 
> On 03/06/2024 22:22, Joe Damato wrote:
> > On Mon, Jun 03, 2024 at 02:11:14PM +0300, Tariq Toukan wrote:
> > > 
> 
> ...
> 
> > > 
> > > I still don't really like this design, so I gave some more thought on
> > > this...
> > > 
> > > I think we should come up with a new mapping array under priv, that maps i
> > > (from real_num_tx_queues) to the matching sq_stats struct.
> > > This array would be maintained in the channels open/close functions,
> > > similarly to priv->txq2sq.
> > > 
> > > Then, we would not calculate the mapping per call, but just get the proper
> > > pointer from the array. This eases the handling of htb and ptp queues, which
> > > were missed in your txq_ix_to_chtc_ix().
> > 
> > Maybe I am just getting way off track here, but I noticed this in
> > drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c in
> > set_pflag_tx_port_ts:
> > 
> >    if (!err)
> >      priv->tx_ptp_opened = true;
> > 
> > but what if enable is false, meaning the user is disabling ptp via
> > ethtool?
> > 
> 
> This bool field under priv acts as a flag, means: ptp was ever opened.
> It's the same idea as max_opened_tc, but with boolean instead of numeric.

OK, but then to avoid double counting ptp, I suppose we don't
output ptp stats in base and only when the correct txq_ix is passed
in to the tx stats function which refers to the PTP queue?

It's either that or we don't include ptp's sq_stats in the
txq2sq_stats mapping you've proposed, if I understand correctly.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ce15805ad55a..515c16a88a6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -39,6 +39,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/if_bridge.h>
 #include <linux/filter.h>
+#include <net/netdev_queues.h>
 #include <net/page_pool/types.h>
 #include <net/pkt_sched.h>
 #include <net/xdp_sock_drv.h>
@@ -5293,6 +5294,136 @@  static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
 	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
 }
 
+static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
+				     struct netdev_queue_stats_rx *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_channel_stats *channel_stats;
+	struct mlx5e_rq_stats *xskrq_stats;
+	struct mlx5e_rq_stats *rq_stats;
+
+	if (mlx5e_is_uplink_rep(priv))
+		return;
+
+	channel_stats = priv->channel_stats[i];
+	xskrq_stats = &channel_stats->xskrq;
+	rq_stats = &channel_stats->rq;
+
+	stats->packets = rq_stats->packets + xskrq_stats->packets;
+	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
+	stats->alloc_fail = rq_stats->buff_alloc_err +
+			    xskrq_stats->buff_alloc_err;
+}
+
+static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
+				     struct netdev_queue_stats_tx *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_channel_stats *channel_stats;
+	struct mlx5e_sq_stats *sq_stats;
+	int ch_ix, tc_ix;
+
+	mutex_lock(&priv->state_lock);
+	txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
+	mutex_unlock(&priv->state_lock);
+
+	channel_stats = priv->channel_stats[ch_ix];
+	sq_stats = &channel_stats->sq[tc_ix];
+
+	stats->packets = sq_stats->packets;
+	stats->bytes = sq_stats->bytes;
+}
+
+static void mlx5e_get_base_stats(struct net_device *dev,
+				 struct netdev_queue_stats_rx *rx,
+				 struct netdev_queue_stats_tx *tx)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int i, j;
+
+	if (!mlx5e_is_uplink_rep(priv)) {
+		rx->packets = 0;
+		rx->bytes = 0;
+		rx->alloc_fail = 0;
+
+		/* compute stats for deactivated RX queues
+		 *
+		 * if priv->channels.num == 0 the device is down, so compute
+		 * stats for every queue.
+		 *
+		 * otherwise, compute only the queues which have been deactivated.
+		 */
+		mutex_lock(&priv->state_lock);
+		if (priv->channels.num == 0)
+			i = 0;
+		else
+			i = priv->channels.params.num_channels;
+		mutex_unlock(&priv->state_lock);
+
+		for (; i < priv->stats_nch; i++) {
+			struct netdev_queue_stats_rx rx_i = {0};
+
+			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
+
+			rx->packets += rx_i.packets;
+			rx->bytes += rx_i.bytes;
+			rx->alloc_fail += rx_i.alloc_fail;
+		}
+
+		if (priv->rx_ptp_opened) {
+			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
+
+			rx->packets += rq_stats->packets;
+			rx->bytes += rq_stats->bytes;
+		}
+	}
+
+	tx->packets = 0;
+	tx->bytes = 0;
+
+	mutex_lock(&priv->state_lock);
+	for (i = 0; i < priv->stats_nch; i++) {
+		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
+
+		/* while iterating through all channels [0, stats_nch], there
+		 * are two cases to handle:
+		 *
+		 *  1. the channel is available, so sum only the unavailable TCs
+		 *     [mlx5e_get_dcb_num_tc, max_opened_tc).
+		 *
+		 *  2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
+		 */
+		if (i < priv->channels.params.num_channels) {
+			j = mlx5e_get_dcb_num_tc(&priv->channels.params);
+		} else {
+			j = 0;
+		}
+
+		for (; j < priv->max_opened_tc; j++) {
+			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
+
+			tx->packets += sq_stats->packets;
+			tx->bytes += sq_stats->bytes;
+		}
+	}
+	mutex_unlock(&priv->state_lock);
+
+	if (priv->tx_ptp_opened) {
+		for (j = 0; j < priv->max_opened_tc; j++) {
+			struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[j];
+
+			tx->packets    += sq_stats->packets;
+			tx->bytes      += sq_stats->bytes;
+		}
+	}
+}
+
+static const struct netdev_stat_ops mlx5e_stat_ops = {
+	.get_queue_stats_rx     = mlx5e_get_queue_stats_rx,
+	.get_queue_stats_tx     = mlx5e_get_queue_stats_tx,
+	.get_base_stats         = mlx5e_get_base_stats,
+};
+
 static void mlx5e_build_nic_netdev(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -5310,6 +5441,7 @@  static void mlx5e_build_nic_netdev(struct net_device *netdev)
 
 	netdev->watchdog_timeo    = 15 * HZ;
 
+	netdev->stat_ops          = &mlx5e_stat_ops;
 	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
 
 	netdev->vlan_features    |= NETIF_F_SG;