diff mbox series

[rdma-next,v1,05/11] RDMA/counter: Add optional counter support

Message ID 04bd7354c6a375e684712d79915f7eb816efee92.1631660727.git.leonro@nvidia.com (mailing list archive)
State Not Applicable
Delegated to: Netdev Maintainers
Headers show
Series Optional counter statistics support | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 11 maintainers not CCed: liweihang@huawei.com phaddad@nvidia.com parav@nvidia.com lee.jones@linaro.org jgg@ziepe.ca anand.a.khoje@oracle.com haakon.bugge@oracle.com jackm@dev.mellanox.co.il mbloch@nvidia.com liangwenpeng@huawei.com joe@perches.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 467 this patch: 467
netdev/kdoc fail Errors and warnings before: 195 this patch: 198
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 149 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 582 this patch: 582
netdev/header_inline success Link

Commit Message

Leon Romanovsky Sept. 14, 2021, 11:07 p.m. UTC
From: Aharon Landau <aharonl@nvidia.com>

An optional counter is a vendor-specific counter that may be dynamically
enabled/disabled.
This enhancement allows us to expose counters which are for example
mutual exclusive and cannot be enabled at the same time, counters that
might degrades performance, optional debug counters, etc.

Optional counters are marked with IB_STAT_FLAG_OPTIONAL flag and not
exported in sysfs.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Neta Ostrovsky <netao@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/counters.c | 22 ++++++++++++++++++++++
 drivers/infiniband/core/device.c   |  1 +
 drivers/infiniband/core/sysfs.c    | 26 ++++++++++++++++----------
 include/rdma/ib_verbs.h            | 14 +++++++++++++-
 include/rdma/rdma_counter.h        |  2 ++
 5 files changed, 54 insertions(+), 11 deletions(-)

Comments

Jason Gunthorpe Sept. 27, 2021, 5:03 p.m. UTC | #1
On Wed, Sep 15, 2021 at 02:07:24AM +0300, Leon Romanovsky wrote:
>  
> +int rdma_counter_modify(struct ib_device *dev, u32 port, int index, bool enable)
> +{
> +	struct rdma_hw_stats *stats;
> +	int ret;
> +
> +	if (!dev->ops.modify_hw_stat)
> +		return -EOPNOTSUPP;
> +
> +	stats = ib_get_hw_stats_port(dev, port);
> +	if (!stats)
> +		return -EINVAL;
> +
> +	mutex_lock(&stats->lock);
> +	ret = dev->ops.modify_hw_stat(dev, port, index, enable);
> +	if (!ret)
> +		enable ? clear_bit(index, stats->is_disabled) :
> +			set_bit(index, stats->is_disabled);

This is not a kernel coding style write out the if, use success
oriented flow

Also, shouldn't this logic protect the driver from being called on
non-optional counters?

>  	for (i = 0; i < data->stats->num_counters; i++) {
> -		attr = &data->attrs[i];
> +		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
> +			continue;
> +		attr = &data->attrs[pos];
>  		sysfs_attr_init(&attr->attr.attr);
>  		attr->attr.attr.name = data->stats->descs[i].name;
>  		attr->attr.attr.mode = 0444;
>  		attr->attr.show = hw_stat_device_show;
>  		attr->show = show_hw_stats;
> -		data->group.attrs[i] = &attr->attr.attr;
> +		data->group.attrs[pos] = &attr->attr.attr;
> +		pos++;
>  	}

This isn't OK, the hw_stat_device_show() computes the stat index like
this:

	return stat_attr->show(ibdev, ibdev->hw_stats_data->stats,
			       stat_attr - ibdev->hw_stats_data->attrs, 0, buf);

Which assumes the stats are packed contiguously. This only works
because mlx5 is always putting the optional stats at the end.

>  /**
>   * struct rdma_stat_desc
>   * @name - The name of the counter
> - *
> + * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
>   */

The previous patch shouldn't have had the extra blank line then?

  
> +int rdma_counter_modify(struct ib_device *dev, u32 port, int index,
> +			bool is_add);

index should be unsigned int

The bool is called 'is_add' here but the implementation is 'enable' ?

Jason
Mark Zhang Sept. 28, 2021, 9:03 a.m. UTC | #2
On 9/28/2021 1:03 AM, Jason Gunthorpe wrote:
> On Wed, Sep 15, 2021 at 02:07:24AM +0300, Leon Romanovsky wrote:
>>   
>> +int rdma_counter_modify(struct ib_device *dev, u32 port, int index, bool enable)
>> +{
>> +	struct rdma_hw_stats *stats;
>> +	int ret;
>> +
>> +	if (!dev->ops.modify_hw_stat)
>> +		return -EOPNOTSUPP;
>> +
>> +	stats = ib_get_hw_stats_port(dev, port);
>> +	if (!stats)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&stats->lock);
>> +	ret = dev->ops.modify_hw_stat(dev, port, index, enable);
>> +	if (!ret)
>> +		enable ? clear_bit(index, stats->is_disabled) :
>> +			set_bit(index, stats->is_disabled);
> 
> This is not a kernel coding style write out the if, use success
> oriented flow
> 
> Also, shouldn't this logic protect the driver from being called on
> non-optional counters?

We leave it to driver, driver would return failure if modify is not 
supported. Is it good?

>>   	for (i = 0; i < data->stats->num_counters; i++) {
>> -		attr = &data->attrs[i];
>> +		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
>> +			continue;
>> +		attr = &data->attrs[pos];
>>   		sysfs_attr_init(&attr->attr.attr);
>>   		attr->attr.attr.name = data->stats->descs[i].name;
>>   		attr->attr.attr.mode = 0444;
>>   		attr->attr.show = hw_stat_device_show;
>>   		attr->show = show_hw_stats;
>> -		data->group.attrs[i] = &attr->attr.attr;
>> +		data->group.attrs[pos] = &attr->attr.attr;
>> +		pos++;
>>   	}
> 
> This isn't OK, the hw_stat_device_show() computes the stat index like
> this:
> 
> 	return stat_attr->show(ibdev, ibdev->hw_stats_data->stats,
> 			       stat_attr - ibdev->hw_stats_data->attrs, 0, buf);
> 
> Which assumes the stats are packed contiguously. This only works
> because mlx5 is always putting the optional stats at the end.

Yes you are right, thanks. Maybe we can add an "index" field in struct 
hw_stats_device/port_attribute, then set it in setup and use it in show.
Jason Gunthorpe Sept. 28, 2021, 11:51 a.m. UTC | #3
On Tue, Sep 28, 2021 at 05:03:24PM +0800, Mark Zhang wrote:
> On 9/28/2021 1:03 AM, Jason Gunthorpe wrote:
> > On Wed, Sep 15, 2021 at 02:07:24AM +0300, Leon Romanovsky wrote:
> > > +int rdma_counter_modify(struct ib_device *dev, u32 port, int index, bool enable)
> > > +{
> > > +	struct rdma_hw_stats *stats;
> > > +	int ret;
> > > +
> > > +	if (!dev->ops.modify_hw_stat)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	stats = ib_get_hw_stats_port(dev, port);
> > > +	if (!stats)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&stats->lock);
> > > +	ret = dev->ops.modify_hw_stat(dev, port, index, enable);
> > > +	if (!ret)
> > > +		enable ? clear_bit(index, stats->is_disabled) :
> > > +			set_bit(index, stats->is_disabled);
> > 
> > This is not a kernel coding style write out the if, use success
> > oriented flow
> > 
> > Also, shouldn't this logic protect the driver from being called on
> > non-optional counters?
> 
> We leave it to driver, driver would return failure if modify is not
> supported. Is it good?

I think the core code should do it

> > >   	for (i = 0; i < data->stats->num_counters; i++) {
> > > -		attr = &data->attrs[i];
> > > +		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
> > > +			continue;
> > > +		attr = &data->attrs[pos];
> > >   		sysfs_attr_init(&attr->attr.attr);
> > >   		attr->attr.attr.name = data->stats->descs[i].name;
> > >   		attr->attr.attr.mode = 0444;
> > >   		attr->attr.show = hw_stat_device_show;
> > >   		attr->show = show_hw_stats;
> > > -		data->group.attrs[i] = &attr->attr.attr;
> > > +		data->group.attrs[pos] = &attr->attr.attr;
> > > +		pos++;
> > >   	}
> > 
> > This isn't OK, the hw_stat_device_show() computes the stat index like
> > this:
> > 
> > 	return stat_attr->show(ibdev, ibdev->hw_stats_data->stats,
> > 			       stat_attr - ibdev->hw_stats_data->attrs, 0, buf);
> > 
> > Which assumes the stats are packed contiguously. This only works
> > because mlx5 is always putting the optional stats at the end.
> 
> Yes you are right, thanks. Maybe we can add an "index" field in struct
> hw_stats_device/port_attribute, then set it in setup and use it in show.

You could just add a WARN_ON check that optional stats are at the end
I suppose

Jason
Leon Romanovsky Sept. 29, 2021, 12:14 p.m. UTC | #4
On Tue, Sep 28, 2021 at 08:51:35AM -0300, Jason Gunthorpe wrote:
> On Tue, Sep 28, 2021 at 05:03:24PM +0800, Mark Zhang wrote:
> > On 9/28/2021 1:03 AM, Jason Gunthorpe wrote:
> > > On Wed, Sep 15, 2021 at 02:07:24AM +0300, Leon Romanovsky wrote:
> > > > +int rdma_counter_modify(struct ib_device *dev, u32 port, int index, bool enable)
> > > > +{
> > > > +	struct rdma_hw_stats *stats;
> > > > +	int ret;
> > > > +
> > > > +	if (!dev->ops.modify_hw_stat)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	stats = ib_get_hw_stats_port(dev, port);
> > > > +	if (!stats)
> > > > +		return -EINVAL;
> > > > +
> > > > +	mutex_lock(&stats->lock);
> > > > +	ret = dev->ops.modify_hw_stat(dev, port, index, enable);
> > > > +	if (!ret)
> > > > +		enable ? clear_bit(index, stats->is_disabled) :
> > > > +			set_bit(index, stats->is_disabled);
> > > 
> > > This is not a kernel coding style write out the if, use success
> > > oriented flow
> > > 
> > > Also, shouldn't this logic protect the driver from being called on
> > > non-optional counters?
> > 
> > We leave it to driver, driver would return failure if modify is not
> > supported. Is it good?
> 
> I think the core code should do it
> 
> > > >   	for (i = 0; i < data->stats->num_counters; i++) {
> > > > -		attr = &data->attrs[i];
> > > > +		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
> > > > +			continue;
> > > > +		attr = &data->attrs[pos];
> > > >   		sysfs_attr_init(&attr->attr.attr);
> > > >   		attr->attr.attr.name = data->stats->descs[i].name;
> > > >   		attr->attr.attr.mode = 0444;
> > > >   		attr->attr.show = hw_stat_device_show;
> > > >   		attr->show = show_hw_stats;
> > > > -		data->group.attrs[i] = &attr->attr.attr;
> > > > +		data->group.attrs[pos] = &attr->attr.attr;
> > > > +		pos++;
> > > >   	}
> > > 
> > > This isn't OK, the hw_stat_device_show() computes the stat index like
> > > this:
> > > 
> > > 	return stat_attr->show(ibdev, ibdev->hw_stats_data->stats,
> > > 			       stat_attr - ibdev->hw_stats_data->attrs, 0, buf);
> > > 
> > > Which assumes the stats are packed contiguously. This only works
> > > because mlx5 is always putting the optional stats at the end.
> > 
> > Yes you are right, thanks. Maybe we can add an "index" field in struct
> > hw_stats_device/port_attribute, then set it in setup and use it in show.
> 
> You could just add a WARN_ON check that optional stats are at the end
> I suppose

Everyone adds their counters to the end, last example is bnxt_re
9a381f7e5aa2 ("RDMA/bnxt_re: Add extended statistics counters")

Thanks

> 
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index a9559e33a113..974abc73a033 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -106,6 +106,28 @@  static int __rdma_counter_bind_qp(struct rdma_counter *counter,
 	return ret;
 }
 
+int rdma_counter_modify(struct ib_device *dev, u32 port, int index, bool enable)
+{
+	struct rdma_hw_stats *stats;
+	int ret;
+
+	if (!dev->ops.modify_hw_stat)
+		return -EOPNOTSUPP;
+
+	stats = ib_get_hw_stats_port(dev, port);
+	if (!stats)
+		return -EINVAL;
+
+	mutex_lock(&stats->lock);
+	ret = dev->ops.modify_hw_stat(dev, port, index, enable);
+	if (!ret)
+		enable ? clear_bit(index, stats->is_disabled) :
+			set_bit(index, stats->is_disabled);
+	mutex_unlock(&stats->lock);
+
+	return ret;
+}
+
 static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
 					   struct ib_qp *qp,
 					   enum rdma_nl_counter_mode mode)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f4814bb7f082..22a4adda7981 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2676,6 +2676,7 @@  void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, modify_cq);
 	SET_DEVICE_OP(dev_ops, modify_device);
 	SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
+	SET_DEVICE_OP(dev_ops, modify_hw_stat);
 	SET_DEVICE_OP(dev_ops, modify_port);
 	SET_DEVICE_OP(dev_ops, modify_qp);
 	SET_DEVICE_OP(dev_ops, modify_srq);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index a26bf960f7ef..c5fc86d8ed3e 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -938,7 +938,7 @@  int ib_setup_device_attrs(struct ib_device *ibdev)
 {
 	struct hw_stats_device_attribute *attr;
 	struct hw_stats_device_data *data;
-	int i, ret;
+	int i, ret, pos = 0;
 
 	data = alloc_hw_stats_device(ibdev);
 	if (IS_ERR(data)) {
@@ -959,16 +959,19 @@  int ib_setup_device_attrs(struct ib_device *ibdev)
 	data->stats->timestamp = jiffies;
 
 	for (i = 0; i < data->stats->num_counters; i++) {
-		attr = &data->attrs[i];
+		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
+			continue;
+		attr = &data->attrs[pos];
 		sysfs_attr_init(&attr->attr.attr);
 		attr->attr.attr.name = data->stats->descs[i].name;
 		attr->attr.attr.mode = 0444;
 		attr->attr.show = hw_stat_device_show;
 		attr->show = show_hw_stats;
-		data->group.attrs[i] = &attr->attr.attr;
+		data->group.attrs[pos] = &attr->attr.attr;
+		pos++;
 	}
 
-	attr = &data->attrs[i];
+	attr = &data->attrs[pos];
 	sysfs_attr_init(&attr->attr.attr);
 	attr->attr.attr.name = "lifespan";
 	attr->attr.attr.mode = 0644;
@@ -976,7 +979,7 @@  int ib_setup_device_attrs(struct ib_device *ibdev)
 	attr->show = show_stats_lifespan;
 	attr->attr.store = hw_stat_device_store;
 	attr->store = set_stats_lifespan;
-	data->group.attrs[i] = &attr->attr.attr;
+	data->group.attrs[pos] = &attr->attr.attr;
 	for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++)
 		if (!ibdev->groups[i]) {
 			ibdev->groups[i] = &data->group;
@@ -1032,7 +1035,7 @@  static int setup_hw_port_stats(struct ib_port *port,
 {
 	struct hw_stats_port_attribute *attr;
 	struct hw_stats_port_data *data;
-	int i, ret;
+	int i, ret, pos = 0;
 
 	data = alloc_hw_stats_port(port, group);
 	if (IS_ERR(data))
@@ -1050,16 +1053,19 @@  static int setup_hw_port_stats(struct ib_port *port,
 	data->stats->timestamp = jiffies;
 
 	for (i = 0; i < data->stats->num_counters; i++) {
-		attr = &data->attrs[i];
+		if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)
+			continue;
+		attr = &data->attrs[pos];
 		sysfs_attr_init(&attr->attr.attr);
 		attr->attr.attr.name = data->stats->descs[i].name;
 		attr->attr.attr.mode = 0444;
 		attr->attr.show = hw_stat_port_show;
 		attr->show = show_hw_stats;
-		group->attrs[i] = &attr->attr.attr;
+		group->attrs[pos] = &attr->attr.attr;
+		pos++;
 	}
 
-	attr = &data->attrs[i];
+	attr = &data->attrs[pos];
 	sysfs_attr_init(&attr->attr.attr);
 	attr->attr.attr.name = "lifespan";
 	attr->attr.attr.mode = 0644;
@@ -1067,7 +1073,7 @@  static int setup_hw_port_stats(struct ib_port *port,
 	attr->show = show_stats_lifespan;
 	attr->attr.store = hw_stat_port_store;
 	attr->store = set_stats_lifespan;
-	group->attrs[i] = &attr->attr.attr;
+	group->attrs[pos] = &attr->attr.attr;
 
 	port->hw_stats_data = data;
 	return 0;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f016bc0cd9de..e825e8e7accf 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -545,13 +545,18 @@  enum ib_port_speed {
 	IB_SPEED_NDR	= 128,
 };
 
+enum ib_stat_flag {
+	IB_STAT_FLAG_OPTIONAL = 1 << 0,
+};
+
 /**
  * struct rdma_stat_desc
  * @name - The name of the counter
- *
+ * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
  */
 struct rdma_stat_desc {
 	const char *name;
+	unsigned int flags;
 };
 
 /**
@@ -2591,6 +2596,13 @@  struct ib_device_ops {
 	int (*get_hw_stats)(struct ib_device *device,
 			    struct rdma_hw_stats *stats, u32 port, int index);
 
+	/**
+	 * modify_hw_stat - Modify the counter configuration
+	 * @enable: true/false when enable/disable a counter
+	 * Return codes - 0 on success or error code otherwise.
+	 */
+	int (*modify_hw_stat)(struct ib_device *device, u32 port,
+			      int counter_index, bool enable);
 	/**
 	 * Allows rdma drivers to add their own restrack attributes.
 	 */
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 0295b22cd1cd..b21ea39efc6c 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -63,4 +63,6 @@  int rdma_counter_get_mode(struct ib_device *dev, u32 port,
 			  enum rdma_nl_counter_mode *mode,
 			  enum rdma_nl_counter_mask *mask);
 
+int rdma_counter_modify(struct ib_device *dev, u32 port, int index,
+			bool is_add);
 #endif /* _RDMA_COUNTER_H_ */