diff mbox series

[1/1] net/mlx5: update debug log level for remote access error syndromes

Message ID 1666689721-30424-1-git-send-email-aru.kolappan@oracle.com (mailing list archive)
State Changes Requested
Headers show
Series [1/1] net/mlx5: update debug log level for remote access error syndromes | expand

Commit Message

Arumugam Kolappan Oct. 25, 2022, 9:22 a.m. UTC
The mlx5 driver dumps the entire CQE buffer by default for few syndromes.
Some syndromes are expected due to the application behavior [ex:
MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and
MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch
converts the log level from KERN_WARNING to KERN_DEBUG. This enables the
application to get the CQE buffer dump by changing to KERN_DEBUG level
as and when needed.

Suggested-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com>
---
 drivers/infiniband/hw/mlx5/cq.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

Comments

Leon Romanovsky Oct. 26, 2022, 5:48 a.m. UTC | #1
On Tue, Oct 25, 2022 at 02:22:01AM -0700, Arumugam Kolappan wrote:
> The mlx5 driver dumps the entire CQE buffer by default for few syndromes.
> Some syndromes are expected due to the application behavior [ex:
> MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and
> MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch
> converts the log level from KERN_WARNING to KERN_DEBUG. This enables the
> application to get the CQE buffer dump by changing to KERN_DEBUG level
> as and when needed.
> 
> Suggested-by: Leon Romanovsky <leon@kernel.org>
> Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com>
> ---
>  drivers/infiniband/hw/mlx5/cq.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
> index be189e0..d665129 100644
> --- a/drivers/infiniband/hw/mlx5/cq.c
> +++ b/drivers/infiniband/hw/mlx5/cq.c
> @@ -267,10 +267,25 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
>  	wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
>  }
>  
> -static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
> +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe,
> +		     struct ib_wc *wc, int dump)
>  {
> -	mlx5_ib_warn(dev, "dump error cqe\n");
> -	mlx5_dump_err_cqe(dev->mdev, cqe);
> +	const char *level;
> +
> +	if (!dump)
> +		return;
> +
> +	mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status,
> +		     ib_wc_status_msg(wc->status));

Aren't you interested "to hide" this print too? Right now, it will
be printed without relation to your "dump" variable value.

> +
> +	if (dump == 1)
> +		level = KERN_WARNING;
> +
> +	if (dump == 2)
> +		level = KERN_DEBUG;

Please change dump_cqe() arguments to receive level directly, so you
will set "dump = KERN_DEBUG" and not not "dump = 2" in
mlx5_handle_error_cqe().

Thanks
Leon Romanovsky Oct. 26, 2022, 5:58 a.m. UTC | #2
On Wed, Oct 26, 2022 at 08:48:13AM +0300, Leon Romanovsky wrote:
> On Tue, Oct 25, 2022 at 02:22:01AM -0700, Arumugam Kolappan wrote:
> > The mlx5 driver dumps the entire CQE buffer by default for few syndromes.
> > Some syndromes are expected due to the application behavior [ex:
> > MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and
> > MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch
> > converts the log level from KERN_WARNING to KERN_DEBUG. This enables the
> > application to get the CQE buffer dump by changing to KERN_DEBUG level
> > as and when needed.
> > 
> > Suggested-by: Leon Romanovsky <leon@kernel.org>
> > Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com>
> > ---
> >  drivers/infiniband/hw/mlx5/cq.c | 30 ++++++++++++++++++++++--------
> >  1 file changed, 22 insertions(+), 8 deletions(-)

And more general comments:
1. Patch title is "RDMA/mlx5: Update ...." and not "net/mlx5: updated ..."
2. No need 1/1 notation for single patch
3. [PATCH ...] is better to include target [PATCH rdma-next ...]

Thanks
Arumugam Kolappan Nov. 1, 2022, 7:23 a.m. UTC | #3
Hi Leon,

On 10/25/22 10:48 PM, Leon Romanovsky wrote:
> On Tue, Oct 25, 2022 at 02:22:01AM -0700, Arumugam Kolappan wrote:
>> The mlx5 driver dumps the entire CQE buffer by default for few syndromes.
>> Some syndromes are expected due to the application behavior [ex:
>> MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and
>> MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch
>> converts the log level from KERN_WARNING to KERN_DEBUG. This enables the
>> application to get the CQE buffer dump by changing to KERN_DEBUG level
>> as and when needed.
>>
>> Suggested-by: Leon Romanovsky <leon@kernel.org>
>> Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com>
>> ---
>>   drivers/infiniband/hw/mlx5/cq.c | 30 ++++++++++++++++++++++--------
>>   1 file changed, 22 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
>> index be189e0..d665129 100644
>> --- a/drivers/infiniband/hw/mlx5/cq.c
>> +++ b/drivers/infiniband/hw/mlx5/cq.c
>> @@ -267,10 +267,25 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
>>   	wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
>>   }
>>   
>> -static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
>> +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe,
>> +		     struct ib_wc *wc, int dump)
>>   {
>> -	mlx5_ib_warn(dev, "dump error cqe\n");
>> -	mlx5_dump_err_cqe(dev->mdev, cqe);
>> +	const char *level;
>> +
>> +	if (!dump)
>> +		return;
>> +
>> +	mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status,
>> +		     ib_wc_status_msg(wc->status));
> Aren't you interested "to hide" this print too? Right now, it will
> be printed without relation to your "dump" variable value.

Thanks for pointing out this. Yes. This line also needs to be covered by 
debug log level.

Current existing functions ("mlx5_ib_warn(), mlx5_ib_err() ...) do not 
accept log-level as argument.

So I've added a new fn: mlx5_ib_log(..) which takes log-level as the 
first argument and print it accordingly.


The updated patch will be posted in the next email for your review.

>> +
>> +	if (dump == 1)
>> +		level = KERN_WARNING;
>> +
>> +	if (dump == 2)
>> +		level = KERN_DEBUG;
> Please change dump_cqe() arguments to receive level directly, so you
> will set "dump = KERN_DEBUG" and not not "dump = 2" in
> mlx5_handle_error_cqe().

Yes. This is also taken care in the updated patch.
Also I've updated the subject line as you mentioned in the other email.


Thanks

Aru

>
> Thanks
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index be189e0..d665129 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -267,10 +267,25 @@  static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
 	wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
 }
 
-static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
+static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe,
+		     struct ib_wc *wc, int dump)
 {
-	mlx5_ib_warn(dev, "dump error cqe\n");
-	mlx5_dump_err_cqe(dev->mdev, cqe);
+	const char *level;
+
+	if (!dump)
+		return;
+
+	mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status,
+		     ib_wc_status_msg(wc->status));
+
+	if (dump == 1)
+		level = KERN_WARNING;
+
+	if (dump == 2)
+		level = KERN_DEBUG;
+
+	print_hex_dump(level, "cqe_dump: ", DUMP_PREFIX_OFFSET, 16, 1,
+		       cqe, sizeof(*cqe), false);
 }
 
 static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
@@ -287,6 +302,7 @@  static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
 		wc->status = IB_WC_LOC_QP_OP_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
+		dump = 2;
 		wc->status = IB_WC_LOC_PROT_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
@@ -306,9 +322,11 @@  static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
 		wc->status = IB_WC_REM_INV_REQ_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		dump = 2;
 		wc->status = IB_WC_REM_ACCESS_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
+		dump = 2;
 		wc->status = IB_WC_REM_OP_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
@@ -328,11 +346,7 @@  static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
 	}
 
 	wc->vendor_err = cqe->vendor_err_synd;
-	if (dump) {
-		mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status,
-			     ib_wc_status_msg(wc->status));
-		dump_cqe(dev, cqe);
-	}
+	dump_cqe(dev, cqe, wc, dump);
 }
 
 static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,