diff mbox series

[PATCHv4] media: cec: core: count low-drive, error and arb-lost, conditions

Message ID 1d437648-3774-467f-8c72-e6f0bbf69211@xs4all.nl (mailing list archive)
State New, archived
Headers show
Series [PATCHv4] media: cec: core: count low-drive, error and arb-lost, conditions | expand

Commit Message

Hans Verkuil Oct. 16, 2023, 7:09 a.m. UTC
Count how many Low Drive, Error and Arbitration Lost transmit
status errors occurred, and expose that in debugfs.

Also log the first 8 transmits that result in Low Drive or Error
conditions. That really should not happen with well-behaved CEC devices
and good HDMI cables.

This is useful to detect and debug HDMI cable issues.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
Changes since v3:
- Document new fields in kerneldoc
Changes since v2:
- Fix spaces instead of TAB issue in two lines.
Changes since v1:
- Log the first 8 transmits that resulted in a Low Drive or Error status.
---
 drivers/media/cec/core/cec-adap.c | 54 ++++++++++++++++++++++++++++---
 include/media/cec.h               | 14 ++++++--
 2 files changed, 61 insertions(+), 7 deletions(-)

Comments

Mauro Carvalho Chehab Nov. 6, 2023, 10:01 a.m. UTC | #1
Em Mon, 16 Oct 2023 09:09:03 +0200
Hans Verkuil <hverkuil-cisco@xs4all.nl> escreveu:

> Count how many Low Drive, Error and Arbitration Lost transmit
> status errors occurred, and expose that in debugfs.
> 
> Also log the first 8 transmits that result in Low Drive or Error
> conditions. That really should not happen with well-behaved CEC devices
> and good HDMI cables.
> 
> This is useful to detect and debug HDMI cable issues.
> 
> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
> ---
> Changes since v3:
> - Document new fields in kerneldoc
> Changes since v2:
> - Fix spaces instead of TAB issue in two lines.
> Changes since v1:
> - Log the first 8 transmits that resulted in a Low Drive or Error status.
> ---
>  drivers/media/cec/core/cec-adap.c | 54 ++++++++++++++++++++++++++++---
>  include/media/cec.h               | 14 ++++++--
>  2 files changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
> index 09ca83c23329..b9a2753e0846 100644
> --- a/drivers/media/cec/core/cec-adap.c
> +++ b/drivers/media/cec/core/cec-adap.c
> @@ -511,7 +511,7 @@ int cec_thread_func(void *_adap)
>  				pr_warn("cec-%s: transmit timed out\n", adap->name);
>  			}
>  			adap->transmit_in_progress = false;
> -			adap->tx_timeouts++;
> +			adap->tx_timeout_cnt++;
>  			goto unlock;
>  		}
> 
> @@ -625,6 +625,33 @@ void cec_transmit_done_ts(struct cec_adapter *adap, u8 status,
>  	msg->tx_low_drive_cnt += low_drive_cnt;
>  	msg->tx_error_cnt += error_cnt;
> 
> +	adap->tx_arb_lost_cnt += arb_lost_cnt;
> +	adap->tx_low_drive_cnt += low_drive_cnt;
> +	adap->tx_error_cnt += error_cnt;

What's the difference between those three? It seems that they're
unconditionally incremented only here.

> +
> +	/*
> +	 * Low Drive transmission errors should really not happen for
> +	 * well-behaved CEC devices and proper HDMI cables.
> +	 *
> +	 * Ditto for the 'Error' status.
> +	 *
> +	 * For the first few times that this happens, log this.
> +	 * Stop logging after that, since that will not add any more
> +	 * useful information and instead it will just flood the kernel log.
> +	 */
> +	if (done && adap->tx_low_drive_log_cnt < 8 && msg->tx_low_drive_cnt) {
> +		adap->tx_low_drive_log_cnt++;
> +		dprintk(0, "low drive counter: %u (seq %u: %*ph)\n",
> +			msg->tx_low_drive_cnt, msg->sequence,
> +			msg->len, msg->msg);
> +	}
> +	if (done && adap->tx_error_log_cnt < 8 && msg->tx_error_cnt) {
> +		adap->tx_error_log_cnt++;
> +		dprintk(0, "error counter: %u (seq %u: %*ph)\n",
> +			msg->tx_error_cnt, msg->sequence,
> +			msg->len, msg->msg);
> +	}
> +
>  	/* Mark that we're done with this transmit */
>  	adap->transmitting = NULL;
> 
> @@ -1607,6 +1634,8 @@ int cec_adap_enable(struct cec_adapter *adap)
>  	if (enable) {
>  		adap->last_initiator = 0xff;
>  		adap->transmit_in_progress = false;
> +		adap->tx_low_drive_log_cnt = 0;
> +		adap->tx_error_log_cnt = 0;
>  		ret = adap->ops->adap_enable(adap, true);
>  		if (!ret) {
>  			/*
> @@ -2260,10 +2289,25 @@ int cec_adap_status(struct seq_file *file, void *priv)
>  	if (adap->monitor_pin_cnt)
>  		seq_printf(file, "file handles in Monitor Pin mode: %u\n",
>  			   adap->monitor_pin_cnt);
> -	if (adap->tx_timeouts) {
> -		seq_printf(file, "transmit timeouts: %u\n",
> -			   adap->tx_timeouts);
> -		adap->tx_timeouts = 0;
> +	if (adap->tx_timeout_cnt) {
> +		seq_printf(file, "transmit timeout count: %u\n",
> +			   adap->tx_timeout_cnt);
> +		adap->tx_timeout_cnt = 0;
> +	}
> +	if (adap->tx_low_drive_cnt) {
> +		seq_printf(file, "transmit low drive count: %u\n",
> +			   adap->tx_low_drive_cnt);
> +		adap->tx_low_drive_cnt = 0;
> +	}
> +	if (adap->tx_arb_lost_cnt) {
> +		seq_printf(file, "transmit arbitration lost count: %u\n",
> +			   adap->tx_arb_lost_cnt);
> +		adap->tx_arb_lost_cnt = 0;
> +	}
> +	if (adap->tx_error_cnt) {
> +		seq_printf(file, "transmit error count: %u\n",
> +			   adap->tx_error_cnt);
> +		adap->tx_error_cnt = 0;
>  	}
>  	data = adap->transmitting;
>  	if (data)
> diff --git a/include/media/cec.h b/include/media/cec.h
> index 53e4b2eb2b26..57b630f1931e 100644
> --- a/include/media/cec.h
> +++ b/include/media/cec.h
> @@ -207,7 +207,12 @@ struct cec_adap_ops {
>   *	passthrough mode.
>   * @log_addrs:		current logical addresses
>   * @conn_info:		current connector info
> - * @tx_timeouts:	number of transmit timeouts
> + * @tx_timeout_cnt:	number of Time Out transmits
> + * @tx_low_drive_cnt:	number of Low Drive transmits
> + * @tx_error_cnt:	number of Error transmits
> + * @tx_arb_lost_cnt:	number of Arbitration Lost transmits
> + * @tx_low_drive_log_cnt: number of logged Low Drive transmits
> + * @tx_error_log_cnt:	number of logged Error transmits

As those counts are reset after sysfs reads, I would mention that.
Something like:

	* @tx_error_log_cnt:	number of logged Error transmits since last read

>   * @notifier:		CEC notifier
>   * @pin:		CEC pin status struct
>   * @cec_dir:		debugfs cec directory
> @@ -262,7 +267,12 @@ struct cec_adapter {
>  	struct cec_log_addrs log_addrs;
>  	struct cec_connector_info conn_info;
> 
> -	u32 tx_timeouts;
> +	u32 tx_timeout_cnt;
> +	u32 tx_low_drive_cnt;
> +	u32 tx_error_cnt;
> +	u32 tx_arb_lost_cnt;
> +	u32 tx_low_drive_log_cnt;
> +	u32 tx_error_log_cnt;

Namespace seems confusing to me. Some values are unconditionally 
incremented at TX completion. I assume that those are the total number
of TX packets, right?

If so, I would rename it to "tx_total", "tx_low_drive_total", ...
to make it clearer.

>  #ifdef CONFIG_CEC_NOTIFIER
>  	struct cec_notifier *notifier;

Thanks,
Mauro
Hans Verkuil Nov. 6, 2023, 10:23 a.m. UTC | #2
On 06/11/2023 11:01, Mauro Carvalho Chehab wrote:
> Em Mon, 16 Oct 2023 09:09:03 +0200
> Hans Verkuil <hverkuil-cisco@xs4all.nl> escreveu:
> 
>> Count how many Low Drive, Error and Arbitration Lost transmit
>> status errors occurred, and expose that in debugfs.
>>
>> Also log the first 8 transmits that result in Low Drive or Error
>> conditions. That really should not happen with well-behaved CEC devices
>> and good HDMI cables.
>>
>> This is useful to detect and debug HDMI cable issues.
>>
>> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
>> ---
>> Changes since v3:
>> - Document new fields in kerneldoc
>> Changes since v2:
>> - Fix spaces instead of TAB issue in two lines.
>> Changes since v1:
>> - Log the first 8 transmits that resulted in a Low Drive or Error status.
>> ---
>>  drivers/media/cec/core/cec-adap.c | 54 ++++++++++++++++++++++++++++---
>>  include/media/cec.h               | 14 ++++++--
>>  2 files changed, 61 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
>> index 09ca83c23329..b9a2753e0846 100644
>> --- a/drivers/media/cec/core/cec-adap.c
>> +++ b/drivers/media/cec/core/cec-adap.c
>> @@ -511,7 +511,7 @@ int cec_thread_func(void *_adap)
>>  				pr_warn("cec-%s: transmit timed out\n", adap->name);
>>  			}
>>  			adap->transmit_in_progress = false;
>> -			adap->tx_timeouts++;
>> +			adap->tx_timeout_cnt++;
>>  			goto unlock;
>>  		}
>>
>> @@ -625,6 +625,33 @@ void cec_transmit_done_ts(struct cec_adapter *adap, u8 status,
>>  	msg->tx_low_drive_cnt += low_drive_cnt;
>>  	msg->tx_error_cnt += error_cnt;
>>
>> +	adap->tx_arb_lost_cnt += arb_lost_cnt;
>> +	adap->tx_low_drive_cnt += low_drive_cnt;
>> +	adap->tx_error_cnt += error_cnt;
> 
> What's the difference between those three? It seems that they're
> unconditionally incremented only here.

Different error conditions. A CEC transmit will be repeated multiple times
until it succeeds or until the max number of attempts is reached.

For debugging it helps to keep track of unusual error reasons since that
can indicate a hardware problem. So this keeps track of how often each
error reason happens.

The driver will pass in these counters. If there were no errors, then these
values are 0.

> 
>> +
>> +	/*
>> +	 * Low Drive transmission errors should really not happen for
>> +	 * well-behaved CEC devices and proper HDMI cables.
>> +	 *
>> +	 * Ditto for the 'Error' status.
>> +	 *
>> +	 * For the first few times that this happens, log this.
>> +	 * Stop logging after that, since that will not add any more
>> +	 * useful information and instead it will just flood the kernel log.
>> +	 */
>> +	if (done && adap->tx_low_drive_log_cnt < 8 && msg->tx_low_drive_cnt) {
>> +		adap->tx_low_drive_log_cnt++;
>> +		dprintk(0, "low drive counter: %u (seq %u: %*ph)\n",
>> +			msg->tx_low_drive_cnt, msg->sequence,
>> +			msg->len, msg->msg);
>> +	}
>> +	if (done && adap->tx_error_log_cnt < 8 && msg->tx_error_cnt) {
>> +		adap->tx_error_log_cnt++;
>> +		dprintk(0, "error counter: %u (seq %u: %*ph)\n",
>> +			msg->tx_error_cnt, msg->sequence,
>> +			msg->len, msg->msg);
>> +	}
>> +
>>  	/* Mark that we're done with this transmit */
>>  	adap->transmitting = NULL;
>>
>> @@ -1607,6 +1634,8 @@ int cec_adap_enable(struct cec_adapter *adap)
>>  	if (enable) {
>>  		adap->last_initiator = 0xff;
>>  		adap->transmit_in_progress = false;
>> +		adap->tx_low_drive_log_cnt = 0;
>> +		adap->tx_error_log_cnt = 0;
>>  		ret = adap->ops->adap_enable(adap, true);
>>  		if (!ret) {
>>  			/*
>> @@ -2260,10 +2289,25 @@ int cec_adap_status(struct seq_file *file, void *priv)
>>  	if (adap->monitor_pin_cnt)
>>  		seq_printf(file, "file handles in Monitor Pin mode: %u\n",
>>  			   adap->monitor_pin_cnt);
>> -	if (adap->tx_timeouts) {
>> -		seq_printf(file, "transmit timeouts: %u\n",
>> -			   adap->tx_timeouts);
>> -		adap->tx_timeouts = 0;
>> +	if (adap->tx_timeout_cnt) {
>> +		seq_printf(file, "transmit timeout count: %u\n",
>> +			   adap->tx_timeout_cnt);
>> +		adap->tx_timeout_cnt = 0;
>> +	}
>> +	if (adap->tx_low_drive_cnt) {
>> +		seq_printf(file, "transmit low drive count: %u\n",
>> +			   adap->tx_low_drive_cnt);
>> +		adap->tx_low_drive_cnt = 0;
>> +	}
>> +	if (adap->tx_arb_lost_cnt) {
>> +		seq_printf(file, "transmit arbitration lost count: %u\n",
>> +			   adap->tx_arb_lost_cnt);
>> +		adap->tx_arb_lost_cnt = 0;
>> +	}
>> +	if (adap->tx_error_cnt) {
>> +		seq_printf(file, "transmit error count: %u\n",
>> +			   adap->tx_error_cnt);
>> +		adap->tx_error_cnt = 0;
>>  	}
>>  	data = adap->transmitting;
>>  	if (data)
>> diff --git a/include/media/cec.h b/include/media/cec.h
>> index 53e4b2eb2b26..57b630f1931e 100644
>> --- a/include/media/cec.h
>> +++ b/include/media/cec.h
>> @@ -207,7 +207,12 @@ struct cec_adap_ops {
>>   *	passthrough mode.
>>   * @log_addrs:		current logical addresses
>>   * @conn_info:		current connector info
>> - * @tx_timeouts:	number of transmit timeouts
>> + * @tx_timeout_cnt:	number of Time Out transmits
>> + * @tx_low_drive_cnt:	number of Low Drive transmits
>> + * @tx_error_cnt:	number of Error transmits
>> + * @tx_arb_lost_cnt:	number of Arbitration Lost transmits
>> + * @tx_low_drive_log_cnt: number of logged Low Drive transmits
>> + * @tx_error_log_cnt:	number of logged Error transmits
> 
> As those counts are reset after sysfs reads, I would mention that.
> Something like:
> 
> 	* @tx_error_log_cnt:	number of logged Error transmits since last read

Yeah, that can be documented a bit better.

> 
>>   * @notifier:		CEC notifier
>>   * @pin:		CEC pin status struct
>>   * @cec_dir:		debugfs cec directory
>> @@ -262,7 +267,12 @@ struct cec_adapter {
>>  	struct cec_log_addrs log_addrs;
>>  	struct cec_connector_info conn_info;
>>
>> -	u32 tx_timeouts;
>> +	u32 tx_timeout_cnt;
>> +	u32 tx_low_drive_cnt;
>> +	u32 tx_error_cnt;
>> +	u32 tx_arb_lost_cnt;
>> +	u32 tx_low_drive_log_cnt;
>> +	u32 tx_error_log_cnt;
> 
> Namespace seems confusing to me. Some values are unconditionally 
> incremented at TX completion. I assume that those are the total number
> of TX packets, right?
> 
> If so, I would rename it to "tx_total", "tx_low_drive_total", ...
> to make it clearer.

Hmm. They count the number of times something happened. Hence _cnt.
I'm not sure _total is better. That would be "the total number of
times something happened.", which is the same thing.

I think I'll just leave this as-is.

Regards,

	Hans

> 
>>  #ifdef CONFIG_CEC_NOTIFIER
>>  	struct cec_notifier *notifier;
> 
> Thanks,
> Mauro
diff mbox series

Patch

diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
index 09ca83c23329..b9a2753e0846 100644
--- a/drivers/media/cec/core/cec-adap.c
+++ b/drivers/media/cec/core/cec-adap.c
@@ -511,7 +511,7 @@  int cec_thread_func(void *_adap)
 				pr_warn("cec-%s: transmit timed out\n", adap->name);
 			}
 			adap->transmit_in_progress = false;
-			adap->tx_timeouts++;
+			adap->tx_timeout_cnt++;
 			goto unlock;
 		}

@@ -625,6 +625,33 @@  void cec_transmit_done_ts(struct cec_adapter *adap, u8 status,
 	msg->tx_low_drive_cnt += low_drive_cnt;
 	msg->tx_error_cnt += error_cnt;

+	adap->tx_arb_lost_cnt += arb_lost_cnt;
+	adap->tx_low_drive_cnt += low_drive_cnt;
+	adap->tx_error_cnt += error_cnt;
+
+	/*
+	 * Low Drive transmission errors should really not happen for
+	 * well-behaved CEC devices and proper HDMI cables.
+	 *
+	 * Ditto for the 'Error' status.
+	 *
+	 * For the first few times that this happens, log this.
+	 * Stop logging after that, since that will not add any more
+	 * useful information and instead it will just flood the kernel log.
+	 */
+	if (done && adap->tx_low_drive_log_cnt < 8 && msg->tx_low_drive_cnt) {
+		adap->tx_low_drive_log_cnt++;
+		dprintk(0, "low drive counter: %u (seq %u: %*ph)\n",
+			msg->tx_low_drive_cnt, msg->sequence,
+			msg->len, msg->msg);
+	}
+	if (done && adap->tx_error_log_cnt < 8 && msg->tx_error_cnt) {
+		adap->tx_error_log_cnt++;
+		dprintk(0, "error counter: %u (seq %u: %*ph)\n",
+			msg->tx_error_cnt, msg->sequence,
+			msg->len, msg->msg);
+	}
+
 	/* Mark that we're done with this transmit */
 	adap->transmitting = NULL;

@@ -1607,6 +1634,8 @@  int cec_adap_enable(struct cec_adapter *adap)
 	if (enable) {
 		adap->last_initiator = 0xff;
 		adap->transmit_in_progress = false;
+		adap->tx_low_drive_log_cnt = 0;
+		adap->tx_error_log_cnt = 0;
 		ret = adap->ops->adap_enable(adap, true);
 		if (!ret) {
 			/*
@@ -2260,10 +2289,25 @@  int cec_adap_status(struct seq_file *file, void *priv)
 	if (adap->monitor_pin_cnt)
 		seq_printf(file, "file handles in Monitor Pin mode: %u\n",
 			   adap->monitor_pin_cnt);
-	if (adap->tx_timeouts) {
-		seq_printf(file, "transmit timeouts: %u\n",
-			   adap->tx_timeouts);
-		adap->tx_timeouts = 0;
+	if (adap->tx_timeout_cnt) {
+		seq_printf(file, "transmit timeout count: %u\n",
+			   adap->tx_timeout_cnt);
+		adap->tx_timeout_cnt = 0;
+	}
+	if (adap->tx_low_drive_cnt) {
+		seq_printf(file, "transmit low drive count: %u\n",
+			   adap->tx_low_drive_cnt);
+		adap->tx_low_drive_cnt = 0;
+	}
+	if (adap->tx_arb_lost_cnt) {
+		seq_printf(file, "transmit arbitration lost count: %u\n",
+			   adap->tx_arb_lost_cnt);
+		adap->tx_arb_lost_cnt = 0;
+	}
+	if (adap->tx_error_cnt) {
+		seq_printf(file, "transmit error count: %u\n",
+			   adap->tx_error_cnt);
+		adap->tx_error_cnt = 0;
 	}
 	data = adap->transmitting;
 	if (data)
diff --git a/include/media/cec.h b/include/media/cec.h
index 53e4b2eb2b26..57b630f1931e 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -207,7 +207,12 @@  struct cec_adap_ops {
  *	passthrough mode.
  * @log_addrs:		current logical addresses
  * @conn_info:		current connector info
- * @tx_timeouts:	number of transmit timeouts
+ * @tx_timeout_cnt:	number of Time Out transmits
+ * @tx_low_drive_cnt:	number of Low Drive transmits
+ * @tx_error_cnt:	number of Error transmits
+ * @tx_arb_lost_cnt:	number of Arbitration Lost transmits
+ * @tx_low_drive_log_cnt: number of logged Low Drive transmits
+ * @tx_error_log_cnt:	number of logged Error transmits
  * @notifier:		CEC notifier
  * @pin:		CEC pin status struct
  * @cec_dir:		debugfs cec directory
@@ -262,7 +267,12 @@  struct cec_adapter {
 	struct cec_log_addrs log_addrs;
 	struct cec_connector_info conn_info;

-	u32 tx_timeouts;
+	u32 tx_timeout_cnt;
+	u32 tx_low_drive_cnt;
+	u32 tx_error_cnt;
+	u32 tx_arb_lost_cnt;
+	u32 tx_low_drive_log_cnt;
+	u32 tx_error_log_cnt;

 #ifdef CONFIG_CEC_NOTIFIER
 	struct cec_notifier *notifier;