[v2] drm/print: Introduce drm_line_printer

Message ID	20240528130622.1152-1-michal.wajdeczko@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <dri-devel-bounces@lists.freedesktop.org> From: Michal Wajdeczko <michal.wajdeczko@intel.com> To: dri-devel@lists.freedesktop.org Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>, Jani Nikula <jani.nikula@intel.com>, John Harrison <John.C.Harrison@Intel.com> Subject: [PATCH v2] drm/print: Introduce drm_line_printer Date: Tue, 28 May 2024 15:06:22 +0200 Message-Id: <20240528130622.1152-1-michal.wajdeczko@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Errors-To: dri-devel-bounces@lists.freedesktop.org Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>
Series	[v2] drm/print: Introduce drm_line_printer \| expand [v2] drm/print: Introduce drm_line_printer

Michal Wajdeczko May 28, 2024, 1:06 p.m. UTC

This drm printer wrapper can be used to increase the robustness of
the captured output generated by any other drm_printer to make sure
we didn't lost any intermediate lines of the output by adding line
numbers to each output line. Helpful for capturing some crash data.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
v2: don't abuse prefix, use union instead (Jani)
    don't use 'dp' as name, prefer 'p' (Jani)
    add support for unique series identifier (John)
---
 drivers/gpu/drm/drm_print.c | 14 ++++++++
 include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 1 deletion(-)

Jani Nikula May 29, 2024, 9:01 a.m. UTC | #1

On Tue, 28 May 2024, Michal Wajdeczko <michal.wajdeczko@intel.com> wrote:
> This drm printer wrapper can be used to increase the robustness of
> the captured output generated by any other drm_printer to make sure
> we didn't lost any intermediate lines of the output by adding line
> numbers to each output line. Helpful for capturing some crash data.
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Jani Nikula <jani.nikula@intel.com>
> Cc: John Harrison <John.C.Harrison@Intel.com>

Reviewed-by: Jani Nikula <jani.nikula@intel.com>


> ---
> v2: don't abuse prefix, use union instead (Jani)
>     don't use 'dp' as name, prefer 'p' (Jani)
>     add support for unique series identifier (John)
> ---
>  drivers/gpu/drm/drm_print.c | 14 ++++++++
>  include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>  2 files changed, 81 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
> index cf2efb44722c..be9cbebff5b3 100644
> --- a/drivers/gpu/drm/drm_print.c
> +++ b/drivers/gpu/drm/drm_print.c
> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>  }
>  EXPORT_SYMBOL(__drm_printfn_err);
>  
> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
> +{
> +	unsigned int counter = ++p->line.counter;
> +	const char *prefix = p->prefix ?: "";
> +	const char *pad = p->prefix ? " " : "";
> +
> +	if (p->line.series)
> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
> +			   prefix, pad, p->line.series, counter, vaf);
> +	else
> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
> +}
> +EXPORT_SYMBOL(__drm_printfn_line);
> +
>  /**
>   * drm_puts - print a const string to a &drm_printer stream
>   * @p: the &drm printer
> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
> index 089950ad8681..f4d9b98d7909 100644
> --- a/include/drm/drm_print.h
> +++ b/include/drm/drm_print.h
> @@ -176,7 +176,13 @@ struct drm_printer {
>  	void (*puts)(struct drm_printer *p, const char *str);
>  	void *arg;
>  	const char *prefix;
> -	enum drm_debug_category category;
> +	union {
> +		enum drm_debug_category category;
> +		struct {
> +			unsigned short series;
> +			unsigned short counter;
> +		} line;
> +	};
>  };
>  
>  void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>  void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>  void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>  void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>  
>  __printf(2, 3)
>  void drm_printf(struct drm_printer *p, const char *f, ...);
> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>  	return p;
>  }
>  
> +/**
> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
> + * @p: the &struct drm_printer which actually generates the output
> + * @prefix: optional output prefix, or NULL for no prefix
> + * @series: optional unique series identifier, or 0 to omit identifier in the output
> + *
> + * This printer can be used to increase the robustness of the captured output
> + * to make sure we didn't lost any intermediate lines of the output. Helpful
> + * while capturing some crash data.
> + *
> + * Example 1::
> + *
> + *	void crash_dump(struct drm_device *drm)
> + *	{
> + *		static unsigned short id;
> + *		struct drm_printer p = drm_err_printer(drm, "crash");
> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
> + *
> + *		drm_printf(&lp, "foo");
> + *		drm_printf(&lp, "bar");
> + *	}
> + *
> + * Above code will print into the dmesg something like::
> + *
> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
> + *
> + * Example 2::
> + *
> + *	void line_dump(struct device *dev)
> + *	{
> + *		struct drm_printer p = drm_info_printer(dev);
> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
> + *
> + *		drm_printf(&lp, "foo");
> + *		drm_printf(&lp, "bar");
> + *	}
> + *
> + * Above code will print::
> + *
> + *	[ ] 0000:00:00.0: [drm] 1: foo
> + *	[ ] 0000:00:00.0: [drm] 2: bar
> + *
> + * RETURNS:
> + * The &drm_printer object
> + */
> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
> +						  const char *prefix,
> +						  unsigned short series)
> +{
> +	struct drm_printer lp = {
> +		.printfn = __drm_printfn_line,
> +		.arg = p,
> +		.prefix = prefix,
> +		.line = { .series = series, },
> +	};
> +	return lp;
> +}
> +
>  /*
>   * struct device based logging
>   *

John Harrison May 29, 2024, 11:48 p.m. UTC | #2

On 5/28/2024 06:06, Michal Wajdeczko wrote:
> This drm printer wrapper can be used to increase the robustness of
> the captured output generated by any other drm_printer to make sure
> we didn't lost any intermediate lines of the output by adding line
> numbers to each output line. Helpful for capturing some crash data.
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Jani Nikula <jani.nikula@intel.com>
> Cc: John Harrison <John.C.Harrison@Intel.com>
> ---
> v2: don't abuse prefix, use union instead (Jani)
>      don't use 'dp' as name, prefer 'p' (Jani)
>      add support for unique series identifier (John)
> ---
>   drivers/gpu/drm/drm_print.c | 14 ++++++++
>   include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>   2 files changed, 81 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
> index cf2efb44722c..be9cbebff5b3 100644
> --- a/drivers/gpu/drm/drm_print.c
> +++ b/drivers/gpu/drm/drm_print.c
> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>   }
>   EXPORT_SYMBOL(__drm_printfn_err);
>   
> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
> +{
> +	unsigned int counter = ++p->line.counter;
Wrong units, but see below anyway...

> +	const char *prefix = p->prefix ?: "";
> +	const char *pad = p->prefix ? " " : "";
> +
> +	if (p->line.series)
> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
> +			   prefix, pad, p->line.series, counter, vaf);
> +	else
> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
> +}
> +EXPORT_SYMBOL(__drm_printfn_line);
> +
>   /**
>    * drm_puts - print a const string to a &drm_printer stream
>    * @p: the &drm printer
> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
> index 089950ad8681..f4d9b98d7909 100644
> --- a/include/drm/drm_print.h
> +++ b/include/drm/drm_print.h
> @@ -176,7 +176,13 @@ struct drm_printer {
>   	void (*puts)(struct drm_printer *p, const char *str);
>   	void *arg;
>   	const char *prefix;
> -	enum drm_debug_category category;
> +	union {
> +		enum drm_debug_category category;
> +		struct {
> +			unsigned short series;
> +			unsigned short counter;
Any particular reason for using 'short' rather than 'int'? Short is only 
16bits right? That might seem huge but a GuC log buffer with 16MB debug 
log (and minimum sizes for the other sections) when dumped via the 
original debugfs hexdump mechanism is 1,245,444 lines. That line count 
goes down a lot when you start using longer lines for the dump, but it 
is still in the tens of thousands of lines.  So limiting to 16 bits is 
an overflow just waiting to happen.

> +		} line;
> +	};
>   };
>   
>   void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>   void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>   void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>   void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>   
>   __printf(2, 3)
>   void drm_printf(struct drm_printer *p, const char *f, ...);
> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>   	return p;
>   }
>   
> +/**
> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
> + * @p: the &struct drm_printer which actually generates the output
> + * @prefix: optional output prefix, or NULL for no prefix
> + * @series: optional unique series identifier, or 0 to omit identifier in the output
> + *
> + * This printer can be used to increase the robustness of the captured output
> + * to make sure we didn't lost any intermediate lines of the output. Helpful
> + * while capturing some crash data.
> + *
> + * Example 1::
> + *
> + *	void crash_dump(struct drm_device *drm)
> + *	{
> + *		static unsigned short id;
> + *		struct drm_printer p = drm_err_printer(drm, "crash");
> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
Is there any benefit or other reason to split the prefix across two 
separate printers? It seems like a source of confusion. As in, the code 
will allow a double prefix, there is not much you can do about that 
because losing the prefix from drm_line_printer would mean no prefix at 
all when not using drm_err_printer underneath. But why explicitly split 
the message across both printers in the usage example? This is saying 
that this is the recommended way to use the interface, but with no 
explanation of why the split is required or how the split should be done.

Also, there is really no specific connection to crashes. The purpose of 
this is for allowing the dumping of multi-line blocks of data. One use 
is for debugging crashes. But there are many debug operations that 
require the same dump and do not involve a crash. And this support is 
certainly not intended to be used on general end-user GPU hangs. For 
those, everything should be part of the devcoredump core file that is 
produced and accessible via sysfs. We absolutely should not be dumping 
huge data blocks in customer release drivers except in very extreme 
circumstances.

> + *
> + *		drm_printf(&lp, "foo");
> + *		drm_printf(&lp, "bar");
> + *	}
> + *
> + * Above code will print into the dmesg something like::
> + *
> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
> + *
> + * Example 2::
> + *
> + *	void line_dump(struct device *dev)
> + *	{
> + *		struct drm_printer p = drm_info_printer(dev);
> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
> + *
> + *		drm_printf(&lp, "foo");
> + *		drm_printf(&lp, "bar");
> + *	}
> + *
> + * Above code will print::
> + *
> + *	[ ] 0000:00:00.0: [drm] 1: foo
> + *	[ ] 0000:00:00.0: [drm] 2: bar
Not really seeing the point of having two examples listed. The first 
includes all the optional extras, the second is just repeating with no 
new information.

John.

> + *
> + * RETURNS:
> + * The &drm_printer object
> + */
> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
> +						  const char *prefix,
> +						  unsigned short series)
> +{
> +	struct drm_printer lp = {
> +		.printfn = __drm_printfn_line,
> +		.arg = p,
> +		.prefix = prefix,
> +		.line = { .series = series, },
> +	};
> +	return lp;
> +}
> +
>   /*
>    * struct device based logging
>    *

Jani Nikula May 30, 2024, 7:49 a.m. UTC | #3

On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>> This drm printer wrapper can be used to increase the robustness of
>> the captured output generated by any other drm_printer to make sure
>> we didn't lost any intermediate lines of the output by adding line
>> numbers to each output line. Helpful for capturing some crash data.
>>
>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Jani Nikula <jani.nikula@intel.com>
>> Cc: John Harrison <John.C.Harrison@Intel.com>
>> ---
>> v2: don't abuse prefix, use union instead (Jani)
>>      don't use 'dp' as name, prefer 'p' (Jani)
>>      add support for unique series identifier (John)
>> ---
>>   drivers/gpu/drm/drm_print.c | 14 ++++++++
>>   include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>>   2 files changed, 81 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>> index cf2efb44722c..be9cbebff5b3 100644
>> --- a/drivers/gpu/drm/drm_print.c
>> +++ b/drivers/gpu/drm/drm_print.c
>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>>   }
>>   EXPORT_SYMBOL(__drm_printfn_err);
>>   
>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
>> +{
>> +	unsigned int counter = ++p->line.counter;
> Wrong units, but see below anyway...
>
>> +	const char *prefix = p->prefix ?: "";
>> +	const char *pad = p->prefix ? " " : "";
>> +
>> +	if (p->line.series)
>> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
>> +			   prefix, pad, p->line.series, counter, vaf);
>> +	else
>> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>> +}
>> +EXPORT_SYMBOL(__drm_printfn_line);
>> +
>>   /**
>>    * drm_puts - print a const string to a &drm_printer stream
>>    * @p: the &drm printer
>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>> index 089950ad8681..f4d9b98d7909 100644
>> --- a/include/drm/drm_print.h
>> +++ b/include/drm/drm_print.h
>> @@ -176,7 +176,13 @@ struct drm_printer {
>>   	void (*puts)(struct drm_printer *p, const char *str);
>>   	void *arg;
>>   	const char *prefix;
>> -	enum drm_debug_category category;
>> +	union {
>> +		enum drm_debug_category category;
>> +		struct {
>> +			unsigned short series;
>> +			unsigned short counter;
> Any particular reason for using 'short' rather than 'int'? Short is only 
> 16bits right? That might seem huge but a GuC log buffer with 16MB debug 
> log (and minimum sizes for the other sections) when dumped via the 
> original debugfs hexdump mechanism is 1,245,444 lines. That line count 
> goes down a lot when you start using longer lines for the dump, but it 
> is still in the tens of thousands of lines.  So limiting to 16 bits is 
> an overflow just waiting to happen.
>
>> +		} line;
>> +	};
>>   };
>>   
>>   void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>>   void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>>   void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>>   void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>   
>>   __printf(2, 3)
>>   void drm_printf(struct drm_printer *p, const char *f, ...);
>> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>>   	return p;
>>   }
>>   
>> +/**
>> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
>> + * @p: the &struct drm_printer which actually generates the output
>> + * @prefix: optional output prefix, or NULL for no prefix
>> + * @series: optional unique series identifier, or 0 to omit identifier in the output
>> + *
>> + * This printer can be used to increase the robustness of the captured output
>> + * to make sure we didn't lost any intermediate lines of the output. Helpful
>> + * while capturing some crash data.
>> + *
>> + * Example 1::
>> + *
>> + *	void crash_dump(struct drm_device *drm)
>> + *	{
>> + *		static unsigned short id;
>> + *		struct drm_printer p = drm_err_printer(drm, "crash");
>> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
> Is there any benefit or other reason to split the prefix across two 
> separate printers? It seems like a source of confusion. As in, the code 
> will allow a double prefix, there is not much you can do about that 
> because losing the prefix from drm_line_printer would mean no prefix at 
> all when not using drm_err_printer underneath. But why explicitly split 
> the message across both printers in the usage example? This is saying 
> that this is the recommended way to use the interface, but with no 
> explanation of why the split is required or how the split should be done.

You could have a printer, and then add two separate line counted blocks.

	struct drm_printer p = drm_err_printer(drm, "parent");
	struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);

	...

	struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);

	...

p could be defined elsewhere and passed into separate functions which
each have the line printing. The two prefixes can be useful.

> Also, there is really no specific connection to crashes. The purpose of 
> this is for allowing the dumping of multi-line blocks of data. One use 
> is for debugging crashes. But there are many debug operations that 
> require the same dump and do not involve a crash. And this support is 
> certainly not intended to be used on general end-user GPU hangs. For 
> those, everything should be part of the devcoredump core file that is 
> produced and accessible via sysfs. We absolutely should not be dumping 
> huge data blocks in customer release drivers except in very extreme 
> circumstances.

A devcoredump implementation could use a drm_printer too?

Is this only about bikeshedding the example? I'm not sure I get the
point here.

>
>> + *
>> + *		drm_printf(&lp, "foo");
>> + *		drm_printf(&lp, "bar");
>> + *	}
>> + *
>> + * Above code will print into the dmesg something like::
>> + *
>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>> + *
>> + * Example 2::
>> + *
>> + *	void line_dump(struct device *dev)
>> + *	{
>> + *		struct drm_printer p = drm_info_printer(dev);
>> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>> + *
>> + *		drm_printf(&lp, "foo");
>> + *		drm_printf(&lp, "bar");
>> + *	}
>> + *
>> + * Above code will print::
>> + *
>> + *	[ ] 0000:00:00.0: [drm] 1: foo
>> + *	[ ] 0000:00:00.0: [drm] 2: bar
> Not really seeing the point of having two examples listed. The first 
> includes all the optional extras, the second is just repeating with no 
> new information.

You see how the "series" param behaves?

BR,
Jani.

>
> John.
>
>> + *
>> + * RETURNS:
>> + * The &drm_printer object
>> + */
>> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
>> +						  const char *prefix,
>> +						  unsigned short series)
>> +{
>> +	struct drm_printer lp = {
>> +		.printfn = __drm_printfn_line,
>> +		.arg = p,
>> +		.prefix = prefix,
>> +		.line = { .series = series, },
>> +	};
>> +	return lp;
>> +}
>> +
>>   /*
>>    * struct device based logging
>>    *
>

Michal Wajdeczko May 30, 2024, 9:33 a.m. UTC | #4

On 30.05.2024 09:49, Jani Nikula wrote:
> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>> This drm printer wrapper can be used to increase the robustness of
>>> the captured output generated by any other drm_printer to make sure
>>> we didn't lost any intermediate lines of the output by adding line
>>> numbers to each output line. Helpful for capturing some crash data.
>>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>> ---
>>> v2: don't abuse prefix, use union instead (Jani)
>>>      don't use 'dp' as name, prefer 'p' (Jani)
>>>      add support for unique series identifier (John)
>>> ---
>>>   drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>   include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>>>   2 files changed, 81 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>> index cf2efb44722c..be9cbebff5b3 100644
>>> --- a/drivers/gpu/drm/drm_print.c
>>> +++ b/drivers/gpu/drm/drm_print.c
>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>>>   }
>>>   EXPORT_SYMBOL(__drm_printfn_err);
>>>   
>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
>>> +{
>>> +	unsigned int counter = ++p->line.counter;
>> Wrong units, but see below anyway...

it really doesn't matter as it is temporary var used in printf()
actual 'short' counter will wrap on its own unit boundary

>>
>>> +	const char *prefix = p->prefix ?: "";
>>> +	const char *pad = p->prefix ? " " : "";
>>> +
>>> +	if (p->line.series)
>>> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
>>> +			   prefix, pad, p->line.series, counter, vaf);
>>> +	else
>>> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>> +}
>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>> +
>>>   /**
>>>    * drm_puts - print a const string to a &drm_printer stream
>>>    * @p: the &drm printer
>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>> index 089950ad8681..f4d9b98d7909 100644
>>> --- a/include/drm/drm_print.h
>>> +++ b/include/drm/drm_print.h
>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>   	void (*puts)(struct drm_printer *p, const char *str);
>>>   	void *arg;
>>>   	const char *prefix;
>>> -	enum drm_debug_category category;
>>> +	union {
>>> +		enum drm_debug_category category;
>>> +		struct {
>>> +			unsigned short series;
>>> +			unsigned short counter;
>> Any particular reason for using 'short' rather than 'int'? Short is only 

didn't want to increase the size of the struct drm_printer and with
little luck sizeof two shorts will be less/equal sizeof enum

>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug 
>> log (and minimum sizes for the other sections) when dumped via the 
>> original debugfs hexdump mechanism is 1,245,444 lines. That line count 

if your capture relies on collecting all 1,245,444 lines then likely you
have other problem that needs solving than line counter overflow

>> goes down a lot when you start using longer lines for the dump, but it 
>> is still in the tens of thousands of lines.  So limiting to 16 bits is 
>> an overflow just waiting to happen.

but even in case of an overflow it should be relatively easy to teach
any parser to deal with line .0 as indicator to restart any tracker

and it is highly unlikely that any captured logs will miss exactly
65,536 contiguous lines, but even then it should be noticeable gap

>>
>>> +		} line;
>>> +	};
>>>   };
>>>   
>>>   void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>>>   void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>>>   void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>>>   void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>   
>>>   __printf(2, 3)
>>>   void drm_printf(struct drm_printer *p, const char *f, ...);
>>> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>>>   	return p;
>>>   }
>>>   
>>> +/**
>>> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
>>> + * @p: the &struct drm_printer which actually generates the output
>>> + * @prefix: optional output prefix, or NULL for no prefix
>>> + * @series: optional unique series identifier, or 0 to omit identifier in the output
>>> + *
>>> + * This printer can be used to increase the robustness of the captured output
>>> + * to make sure we didn't lost any intermediate lines of the output. Helpful
>>> + * while capturing some crash data.
>>> + *
>>> + * Example 1::
>>> + *
>>> + *	void crash_dump(struct drm_device *drm)
>>> + *	{
>>> + *		static unsigned short id;
>>> + *		struct drm_printer p = drm_err_printer(drm, "crash");
>>> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>> Is there any benefit or other reason to split the prefix across two 
>> separate printers? It seems like a source of confusion. As in, the code

it's not any kind of the required 'split', as both printers used here
can treat prefix as optional if NULL, but rather a way to show how to
pass two potentially separated prefixes, as one of them could be already
prepared (like engine name or any other alias) or if the primary printer
does not accept any prefix at all (and this a limitation of our existing
custom GT oriented printers [1] [2])

[1]
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L66
[2]
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L81

>> will allow a double prefix, there is not much you can do about that 
>> because losing the prefix from drm_line_printer would mean no prefix at 

but why would we loose the prefix from the primary printer ?

>> all when not using drm_err_printer underneath. But why explicitly split 
>> the message across both printers in the usage example? This is saying 
>> that this is the recommended way to use the interface, but with no 
>> explanation of why the split is required or how the split should be done.

the drm_line_printer is flexible and can be used in many configurations,
above is just one of the potential uses that shows full output

> 
> You could have a printer, and then add two separate line counted blocks.
> 
> 	struct drm_printer p = drm_err_printer(drm, "parent");
> 	struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
> 
> 	...
> 
> 	struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
> 
> 	...
> 
> p could be defined elsewhere and passed into separate functions which
> each have the line printing. The two prefixes can be useful.

didn't considered that, but as stated above, drm_line_printer is
flexible and can be used in many different ways, like this new one

> 
>> Also, there is really no specific connection to crashes. The purpose of 
>> this is for allowing the dumping of multi-line blocks of data. One use 
>> is for debugging crashes. But there are many debug operations that 
>> require the same dump and do not involve a crash. And this support is 
>> certainly not intended to be used on general end-user GPU hangs. For 
>> those, everything should be part of the devcoredump core file that is 
>> produced and accessible via sysfs. We absolutely should not be dumping 
>> huge data blocks in customer release drivers except in very extreme 
>> circumstances.

if you are trying to convince me that we don't need any custom
drm_printer that would take care of tracking and printing line numbers
in kind of more robust way and instead we should be doing such line
prints in a error prone way on it's own as you did in [3], then sorry,
I'm not convinced, unless you just feel that it shouldn't be part of the
drm yet, but then decision is drm maintainer hands (and also in the Xe
maintainers who don't want to fall into i915-ish private solutions trap)

[3] https://patchwork.freedesktop.org/patch/594021/?series=133349&rev=2

> 
> A devcoredump implementation could use a drm_printer too?
> 
> Is this only about bikeshedding the example? I'm not sure I get the
> point here.
> 
>>
>>> + *
>>> + *		drm_printf(&lp, "foo");
>>> + *		drm_printf(&lp, "bar");
>>> + *	}
>>> + *
>>> + * Above code will print into the dmesg something like::
>>> + *
>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>> + *
>>> + * Example 2::
>>> + *
>>> + *	void line_dump(struct device *dev)
>>> + *	{
>>> + *		struct drm_printer p = drm_info_printer(dev);
>>> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>> + *
>>> + *		drm_printf(&lp, "foo");
>>> + *		drm_printf(&lp, "bar");
>>> + *	}
>>> + *
>>> + * Above code will print::
>>> + *
>>> + *	[ ] 0000:00:00.0: [drm] 1: foo
>>> + *	[ ] 0000:00:00.0: [drm] 2: bar
>> Not really seeing the point of having two examples listed. The first 
>> includes all the optional extras, the second is just repeating with no 
>> new information.
> 
> You see how the "series" param behaves?

exactly

> 
> BR,
> Jani.
> 
>>
>> John.
>>
>>> + *
>>> + * RETURNS:
>>> + * The &drm_printer object
>>> + */
>>> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
>>> +						  const char *prefix,
>>> +						  unsigned short series)
>>> +{
>>> +	struct drm_printer lp = {
>>> +		.printfn = __drm_printfn_line,
>>> +		.arg = p,
>>> +		.prefix = prefix,
>>> +		.line = { .series = series, },
>>> +	};
>>> +	return lp;
>>> +}
>>> +
>>>   /*
>>>    * struct device based logging
>>>    *
>>
>

John Harrison May 30, 2024, 6:47 p.m. UTC | #5

On 5/30/2024 00:49, Jani Nikula wrote:
> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>> This drm printer wrapper can be used to increase the robustness of
>>> the captured output generated by any other drm_printer to make sure
>>> we didn't lost any intermediate lines of the output by adding line
>>> numbers to each output line. Helpful for capturing some crash data.
>>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>> ---
>>> v2: don't abuse prefix, use union instead (Jani)
>>>       don't use 'dp' as name, prefer 'p' (Jani)
>>>       add support for unique series identifier (John)
>>> ---
>>>    drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>    include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>>>    2 files changed, 81 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>> index cf2efb44722c..be9cbebff5b3 100644
>>> --- a/drivers/gpu/drm/drm_print.c
>>> +++ b/drivers/gpu/drm/drm_print.c
>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>>>    }
>>>    EXPORT_SYMBOL(__drm_printfn_err);
>>>    
>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
>>> +{
>>> +	unsigned int counter = ++p->line.counter;
>> Wrong units, but see below anyway...
>>
>>> +	const char *prefix = p->prefix ?: "";
>>> +	const char *pad = p->prefix ? " " : "";
>>> +
>>> +	if (p->line.series)
>>> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
>>> +			   prefix, pad, p->line.series, counter, vaf);
>>> +	else
>>> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>> +}
>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>> +
>>>    /**
>>>     * drm_puts - print a const string to a &drm_printer stream
>>>     * @p: the &drm printer
>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>> index 089950ad8681..f4d9b98d7909 100644
>>> --- a/include/drm/drm_print.h
>>> +++ b/include/drm/drm_print.h
>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>    	void (*puts)(struct drm_printer *p, const char *str);
>>>    	void *arg;
>>>    	const char *prefix;
>>> -	enum drm_debug_category category;
>>> +	union {
>>> +		enum drm_debug_category category;
>>> +		struct {
>>> +			unsigned short series;
>>> +			unsigned short counter;
>> Any particular reason for using 'short' rather than 'int'? Short is only
>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>> log (and minimum sizes for the other sections) when dumped via the
>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>> goes down a lot when you start using longer lines for the dump, but it
>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>> an overflow just waiting to happen.
>>
>>> +		} line;
>>> +	};
>>>    };
>>>    
>>>    void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>>>    void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>>>    void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>>>    void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>    
>>>    __printf(2, 3)
>>>    void drm_printf(struct drm_printer *p, const char *f, ...);
>>> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>>>    	return p;
>>>    }
>>>    
>>> +/**
>>> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
>>> + * @p: the &struct drm_printer which actually generates the output
>>> + * @prefix: optional output prefix, or NULL for no prefix
>>> + * @series: optional unique series identifier, or 0 to omit identifier in the output
>>> + *
>>> + * This printer can be used to increase the robustness of the captured output
>>> + * to make sure we didn't lost any intermediate lines of the output. Helpful
>>> + * while capturing some crash data.
>>> + *
>>> + * Example 1::
>>> + *
>>> + *	void crash_dump(struct drm_device *drm)
>>> + *	{
>>> + *		static unsigned short id;
>>> + *		struct drm_printer p = drm_err_printer(drm, "crash");
>>> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>> Is there any benefit or other reason to split the prefix across two
>> separate printers? It seems like a source of confusion. As in, the code
>> will allow a double prefix, there is not much you can do about that
>> because losing the prefix from drm_line_printer would mean no prefix at
>> all when not using drm_err_printer underneath. But why explicitly split
>> the message across both printers in the usage example? This is saying
>> that this is the recommended way to use the interface, but with no
>> explanation of why the split is required or how the split should be done.
> You could have a printer, and then add two separate line counted blocks.
>
> 	struct drm_printer p = drm_err_printer(drm, "parent");
> 	struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>
> 	...
>
> 	struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>
> 	...
>
> p could be defined elsewhere and passed into separate functions which
> each have the line printing. The two prefixes can be useful.
Except you can't have a multi-level prefix if using the info level 
printer as that does not take a prefix. And I'm really not seeing a 
reason why you would want the line count to restart in the middle of a 
single atomic dump operation.

>
>> Also, there is really no specific connection to crashes. The purpose of
>> this is for allowing the dumping of multi-line blocks of data. One use
>> is for debugging crashes. But there are many debug operations that
>> require the same dump and do not involve a crash. And this support is
>> certainly not intended to be used on general end-user GPU hangs. For
>> those, everything should be part of the devcoredump core file that is
>> produced and accessible via sysfs. We absolutely should not be dumping
>> huge data blocks in customer release drivers except in very extreme
>> circumstances.
> A devcoredump implementation could use a drm_printer too?
You mean for reading the devcoredump file from sysfs? Except that the 
whole reason this was started was because Michal absolutely refuses to 
allow line counted output in a sysfs/debugfs file because "it is 
unnecessary and breaks the decoding tools".

>
> Is this only about bikeshedding the example? I'm not sure I get the
> point here.
I would call it getting accurate and understandable documentation.

The existing example is splitting what should be an atomic name "crash 
dump" across two separate line printer objects. That is something that 
is so unrealistic that it implies there is a required reason to break 
the prefix up. Documentation that is ambiguous and confusing is 
potentially worse than no documentation at all.


>
>>> + *
>>> + *		drm_printf(&lp, "foo");
>>> + *		drm_printf(&lp, "bar");
>>> + *	}
>>> + *
>>> + * Above code will print into the dmesg something like::
>>> + *
>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>> + *
>>> + * Example 2::
>>> + *
>>> + *	void line_dump(struct device *dev)
>>> + *	{
>>> + *		struct drm_printer p = drm_info_printer(dev);
>>> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>> + *
>>> + *		drm_printf(&lp, "foo");
>>> + *		drm_printf(&lp, "bar");
>>> + *	}
>>> + *
>>> + * Above code will print::
>>> + *
>>> + *	[ ] 0000:00:00.0: [drm] 1: foo
>>> + *	[ ] 0000:00:00.0: [drm] 2: bar
>> Not really seeing the point of having two examples listed. The first
>> includes all the optional extras, the second is just repeating with no
>> new information.
> You see how the "series" param behaves?
The second example doesn't have a series parameter. If the only purpose 
is to say "the print of the series value is suppressed if zero" then why 
not just have that one line? Documentation should also be concise.

John.

>
> BR,
> Jani.
>
>> John.
>>
>>> + *
>>> + * RETURNS:
>>> + * The &drm_printer object
>>> + */
>>> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
>>> +						  const char *prefix,
>>> +						  unsigned short series)
>>> +{
>>> +	struct drm_printer lp = {
>>> +		.printfn = __drm_printfn_line,
>>> +		.arg = p,
>>> +		.prefix = prefix,
>>> +		.line = { .series = series, },
>>> +	};
>>> +	return lp;
>>> +}
>>> +
>>>    /*
>>>     * struct device based logging
>>>     *

John Harrison May 30, 2024, 6:47 p.m. UTC | #6

On 5/30/2024 02:33, Michal Wajdeczko wrote:
> On 30.05.2024 09:49, Jani Nikula wrote:
>> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>> This drm printer wrapper can be used to increase the robustness of
>>>> the captured output generated by any other drm_printer to make sure
>>>> we didn't lost any intermediate lines of the output by adding line
>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>
>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>> ---
>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>       don't use 'dp' as name, prefer 'p' (Jani)
>>>>       add support for unique series identifier (John)
>>>> ---
>>>>    drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>    include/drm/drm_print.h     | 68 ++++++++++++++++++++++++++++++++++++-
>>>>    2 files changed, 81 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>> --- a/drivers/gpu/drm/drm_print.c
>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf)
>>>>    }
>>>>    EXPORT_SYMBOL(__drm_printfn_err);
>>>>    
>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf)
>>>> +{
>>>> +	unsigned int counter = ++p->line.counter;
>>> Wrong units, but see below anyway...
> it really doesn't matter as it is temporary var used in printf()
> actual 'short' counter will wrap on its own unit boundary
It should still match. Otherwise the code is ambiguous. Was it meant to 
be an int? Was it meant to be a short? Just because code compiles 
doesn't mean it is good.

>
>>>> +	const char *prefix = p->prefix ?: "";
>>>> +	const char *pad = p->prefix ? " " : "";
>>>> +
>>>> +	if (p->line.series)
>>>> +		drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>> +			   prefix, pad, p->line.series, counter, vaf);
>>>> +	else
>>>> +		drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>> +}
>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>> +
>>>>    /**
>>>>     * drm_puts - print a const string to a &drm_printer stream
>>>>     * @p: the &drm printer
>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>> index 089950ad8681..f4d9b98d7909 100644
>>>> --- a/include/drm/drm_print.h
>>>> +++ b/include/drm/drm_print.h
>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>    	void (*puts)(struct drm_printer *p, const char *str);
>>>>    	void *arg;
>>>>    	const char *prefix;
>>>> -	enum drm_debug_category category;
>>>> +	union {
>>>> +		enum drm_debug_category category;
>>>> +		struct {
>>>> +			unsigned short series;
>>>> +			unsigned short counter;
>>> Any particular reason for using 'short' rather than 'int'? Short is only
> didn't want to increase the size of the struct drm_printer and with
> little luck sizeof two shorts will be less/equal sizeof enum
Depending on the compiler, the architecture and what values have been 
defined within it, an enum is possibly (likely?) to be a char.


>
>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>> log (and minimum sizes for the other sections) when dumped via the
>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
> if your capture relies on collecting all 1,245,444 lines then likely you
> have other problem that needs solving than line counter overflow
Have you ever used a full 16MB GuC log? And read it out via debugfs? 
Then that was 1.2 million lines of text that you read out. Did you have 
other problems that meant reading that file was a waste of your time? Or 
did it allow you to debug the issue you were working on?

The purpose of this patch is to 'improve' the fully working version that 
was already posted. Causing unwanted wraps in the line count is not an 
improvement. It is very definitely a bug. And now your argument is that 
we shouldn't be doing this in the first place? That's a given! Dumping 
huge streams of data to dmesg is a total hack. We absolutely should not 
be doing it. But we have no choice because there is no other way 
(without adding even bigger and more complicated mechanisms involving 
external debug modules or something).

>
>>> goes down a lot when you start using longer lines for the dump, but it
>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>> an overflow just waiting to happen.
> but even in case of an overflow it should be relatively easy to teach
> any parser to deal with line .0 as indicator to restart any tracker
Wasn't your earlier argument that trivially parsing out the line count 
prefix from a debugfs file was far too much effort and cannot possibly 
be done by a developer. Now you are saying that coping with a broken 
count is "easy to teach a parser". The one single purpose of this entire 
change is to guarantee a valid dump can be extracted from a log. 
Anything that potentially prevents that from working is a fundamental 
failure.

>
> and it is highly unlikely that any captured logs will miss exactly
> 65,536 contiguous lines, but even then it should be noticeable gap
Or we could just use an integer count that is not going to wrap and be 
ambiguous.


>
>>>> +		} line;
>>>> +	};
>>>>    };
>>>>    
>>>>    void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf);
>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p, const char *str);
>>>>    void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf);
>>>>    void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf);
>>>>    void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf);
>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>>    
>>>>    __printf(2, 3)
>>>>    void drm_printf(struct drm_printer *p, const char *f, ...);
>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer drm_err_printer(struct drm_device *drm,
>>>>    	return p;
>>>>    }
>>>>    
>>>> +/**
>>>> + * drm_line_printer - construct a &drm_printer that prefixes outputs with line numbers
>>>> + * @p: the &struct drm_printer which actually generates the output
>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>> + * @series: optional unique series identifier, or 0 to omit identifier in the output
>>>> + *
>>>> + * This printer can be used to increase the robustness of the captured output
>>>> + * to make sure we didn't lost any intermediate lines of the output. Helpful
>>>> + * while capturing some crash data.
>>>> + *
>>>> + * Example 1::
>>>> + *
>>>> + *	void crash_dump(struct drm_device *drm)
>>>> + *	{
>>>> + *		static unsigned short id;
>>>> + *		struct drm_printer p = drm_err_printer(drm, "crash");
>>>> + *		struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>> Is there any benefit or other reason to split the prefix across two
>>> separate printers? It seems like a source of confusion. As in, the code
> it's not any kind of the required 'split', as both printers used here
> can treat prefix as optional if NULL, but rather a way to show how to
> pass two potentially separated prefixes, as one of them could be already
> prepared (like engine name or any other alias) or if the primary printer
> does not accept any prefix at all (and this a limitation of our existing
> custom GT oriented printers [1] [2])
>
> [1]
> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L66
> [2]
> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L81
As per earlier comments, my point is not that you should change the 
patch to remove one of the prefixes from the code. My point is that the 
documentation is confusing. You are explicitly splitting a single phrase 
"crash dump" across two separate printer objects with no explanation as 
to why. And as you just pointed out, there are many use cases where 
there would not be the option to split it. So it would be much, much 
clearer to pass NULL to your drm_err_printer example and have a single 
line comment saying that multiple prefixes could be used if allowed by 
the printer objects and if useful in the situation. Rather than having a 
bizarrely split string with no explanation as to why it has been split.

>
>>> will allow a double prefix, there is not much you can do about that
>>> because losing the prefix from drm_line_printer would mean no prefix at
> but why would we loose the prefix from the primary printer ?
I don't know what you mean by the primary printer? As above, I was 
simply trying to say that I am not requesting a code change but just a 
clarification of the documentation.

>
>>> all when not using drm_err_printer underneath. But why explicitly split
>>> the message across both printers in the usage example? This is saying
>>> that this is the recommended way to use the interface, but with no
>>> explanation of why the split is required or how the split should be done.
> the drm_line_printer is flexible and can be used in many configurations,
> above is just one of the potential uses that shows full output
>
>> You could have a printer, and then add two separate line counted blocks.
>>
>> 	struct drm_printer p = drm_err_printer(drm, "parent");
>> 	struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>
>> 	...
>>
>> 	struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>
>> 	...
>>
>> p could be defined elsewhere and passed into separate functions which
>> each have the line printing. The two prefixes can be useful.
> didn't considered that, but as stated above, drm_line_printer is
> flexible and can be used in many different ways, like this new one
And you really do not need to list them all out as massively verbose 
examples with confusing differences between them that are not explained. 
A single example plus a couple of lines of description would be much 
clearer.

>
>>> Also, there is really no specific connection to crashes. The purpose of
>>> this is for allowing the dumping of multi-line blocks of data. One use
>>> is for debugging crashes. But there are many debug operations that
>>> require the same dump and do not involve a crash. And this support is
>>> certainly not intended to be used on general end-user GPU hangs. For
>>> those, everything should be part of the devcoredump core file that is
>>> produced and accessible via sysfs. We absolutely should not be dumping
>>> huge data blocks in customer release drivers except in very extreme
>>> circumstances.
> if you are trying to convince me that we don't need any custom
> drm_printer that would take care of tracking and printing line numbers
> in kind of more robust way and instead we should be doing such line
> prints in a error prone way on it's own as you did in [3], then sorry,
> I'm not convinced, unless you just feel that it shouldn't be part of the
> drm yet, but then decision is drm maintainer hands (and also in the Xe
> maintainers who don't want to fall into i915-ish private solutions trap)
>
> [3] https://patchwork.freedesktop.org/patch/594021/?series=133349&rev=2
No. I am saying that your example use case seems to be implying a much 
greater usage for this mechanism than should be expected. I'm saying 
that it should never occur in an end user system because dumping 
megabytes of data to dmesg is a bad user experience. It absolutely 
should never be a standard part of handling a GPU hang type 'crash'. The 
primary purpose is for internal debug by developers only. If a use case 
gets shipped upstream then it should be an extremely hard to hit corner 
case for which we are desperate to get any useful debug logs by any 
means necessary.

As for error prone, I am not seeing how the original trivial (and 
working) code is error prone but this complex abstraction of it is less 
so. Especially given the integer truncation problem. I mean seriously, 
how 'error prone' can it be to add a "%d, line++" to a print?! And how 
much of a 'private solutions trap' is it to add such a trivial prefix to 
a couple of prints in a single function that is really a big ugly hack 
for getting logs out in a really dodgy manner anyway?

As you say, it is up to the DRM maintainers as to whether they want this 
support in the generic DRM layers. If it lands and it is functional 
(i.e. does not break its sole reason for being by truncating counts 
partway through a dump) then sure, I'll use it. I just don't see that it 
is even remotely worth the effort given that it is single use only and 
given how trivial the local version is.

John.

>
>> A devcoredump implementation could use a drm_printer too?
>>
>> Is this only about bikeshedding the example? I'm not sure I get the
>> point here.
>>
>>>> + *
>>>> + *		drm_printf(&lp, "foo");
>>>> + *		drm_printf(&lp, "bar");
>>>> + *	}
>>>> + *
>>>> + * Above code will print into the dmesg something like::
>>>> + *
>>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>> + *	[ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>> + *
>>>> + * Example 2::
>>>> + *
>>>> + *	void line_dump(struct device *dev)
>>>> + *	{
>>>> + *		struct drm_printer p = drm_info_printer(dev);
>>>> + *		struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>> + *
>>>> + *		drm_printf(&lp, "foo");
>>>> + *		drm_printf(&lp, "bar");
>>>> + *	}
>>>> + *
>>>> + * Above code will print::
>>>> + *
>>>> + *	[ ] 0000:00:00.0: [drm] 1: foo
>>>> + *	[ ] 0000:00:00.0: [drm] 2: bar
>>> Not really seeing the point of having two examples listed. The first
>>> includes all the optional extras, the second is just repeating with no
>>> new information.
>> You see how the "series" param behaves?
> exactly
>
>> BR,
>> Jani.
>>
>>> John.
>>>
>>>> + *
>>>> + * RETURNS:
>>>> + * The &drm_printer object
>>>> + */
>>>> +static inline struct drm_printer drm_line_printer(struct drm_printer *p,
>>>> +						  const char *prefix,
>>>> +						  unsigned short series)
>>>> +{
>>>> +	struct drm_printer lp = {
>>>> +		.printfn = __drm_printfn_line,
>>>> +		.arg = p,
>>>> +		.prefix = prefix,
>>>> +		.line = { .series = series, },
>>>> +	};
>>>> +	return lp;
>>>> +}
>>>> +
>>>>    /*
>>>>     * struct device based logging
>>>>     *

Jani Nikula May 30, 2024, 7:37 p.m. UTC | #7

On Thu, 30 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
> Except that the whole reason this was started was because Michal
> absolutely refuses to allow line counted output in a sysfs/debugfs
> file because "it is unnecessary and breaks the decoding tools".

I'm only looking at this patch at face value, it seems useful to me, but
I'm not aware of any deeper context. And judging by the tone of the
discussion, I also don't want to get into it.

That said, might be prudent to post a patch using the line printer along
with its addition.

Over and out.

BR,
Jani.

Michal Wajdeczko May 30, 2024, 9:09 p.m. UTC | #8

On 30.05.2024 20:47, John Harrison wrote:
> On 5/30/2024 02:33, Michal Wajdeczko wrote:
>> On 30.05.2024 09:49, Jani Nikula wrote:
>>> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>>> This drm printer wrapper can be used to increase the robustness of
>>>>> the captured output generated by any other drm_printer to make sure
>>>>> we didn't lost any intermediate lines of the output by adding line
>>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>>
>>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>>> ---
>>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>>       don't use 'dp' as name, prefer 'p' (Jani)
>>>>>       add support for unique series identifier (John)
>>>>> ---
>>>>>    drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>>    include/drm/drm_print.h     | 68
>>>>> ++++++++++++++++++++++++++++++++++++-
>>>>>    2 files changed, 81 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>>> --- a/drivers/gpu/drm/drm_print.c
>>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p,
>>>>> struct va_format *vaf)
>>>>>    }
>>>>>    EXPORT_SYMBOL(__drm_printfn_err);
>>>>>    +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>>> *vaf)
>>>>> +{
>>>>> +    unsigned int counter = ++p->line.counter;
>>>> Wrong units, but see below anyway...
>> it really doesn't matter as it is temporary var used in printf()
>> actual 'short' counter will wrap on its own unit boundary
> It should still match. Otherwise the code is ambiguous. Was it meant to
> be an int? Was it meant to be a short? Just because code compiles
> doesn't mean it is good.

it is meant to be "unsigned int" as it is more than "short" counter that
is initialized from and it will printed in printf() as %u

I really don't get what is wrong here

> 
>>
>>>>> +    const char *prefix = p->prefix ?: "";
>>>>> +    const char *pad = p->prefix ? " " : "";
>>>>> +
>>>>> +    if (p->line.series)
>>>>> +        drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>>> +               prefix, pad, p->line.series, counter, vaf);
>>>>> +    else
>>>>> +        drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>>> +}
>>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>>> +
>>>>>    /**
>>>>>     * drm_puts - print a const string to a &drm_printer stream
>>>>>     * @p: the &drm printer
>>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>>> index 089950ad8681..f4d9b98d7909 100644
>>>>> --- a/include/drm/drm_print.h
>>>>> +++ b/include/drm/drm_print.h
>>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>>        void (*puts)(struct drm_printer *p, const char *str);
>>>>>        void *arg;
>>>>>        const char *prefix;
>>>>> -    enum drm_debug_category category;
>>>>> +    union {
>>>>> +        enum drm_debug_category category;
>>>>> +        struct {
>>>>> +            unsigned short series;
>>>>> +            unsigned short counter;
>>>> Any particular reason for using 'short' rather than 'int'? Short is
>>>> only
>> didn't want to increase the size of the struct drm_printer and with
>> little luck sizeof two shorts will be less/equal sizeof enum
> Depending on the compiler, the architecture and what values have been
> defined within it, an enum is possibly (likely?) to be a char.

except that is usually a int [1]

but series/counter could be defined as long long int if you really want
and don't care about struct size

[1] https://en.cppreference.com/w/c/language/enum

> 
> 
>>
>>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>>> log (and minimum sizes for the other sections) when dumped via the
>>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>> if your capture relies on collecting all 1,245,444 lines then likely you
>> have other problem that needs solving than line counter overflow
> Have you ever used a full 16MB GuC log? And read it out via debugfs?

quite frequently over last 6+ years

> Then that was 1.2 million lines of text that you read out. Did you have
> other problems that meant reading that file was a waste of your time? Or
> did it allow you to debug the issue you were working on?

I read your reply about 1,245,444 lines in context of limitations of
drm_line_printer planned to be used for dmesg output not about the
debugfs output

> 
> The purpose of this patch is to 'improve' the fully working version that
> was already posted. Causing unwanted wraps in the line count is not an
> improvement. It is very definitely a bug. And now your argument is that
> we shouldn't be doing this in the first place? That's a given! Dumping
> huge streams of data to dmesg is a total hack. We absolutely should not
> be doing it. But we have no choice because there is no other way
> (without adding even bigger and more complicated mechanisms involving
> external debug modules or something).

my point was that blindly printing 1,245,444 lines of hex data to dmesg
is rather sub-optimal way to get 'crash data' (as if it wouldn't be a
crash then likely collecting log over debugfs/devcoredump should work)

one of the idea that could minimize size of collected log data could be
to actually try to decode it partially and copy only last N entries
(yes, I know it requires extra development, but maybe in return we will
be less spamming the dmesg)

> 
>>
>>>> goes down a lot when you start using longer lines for the dump, but it
>>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>>> an overflow just waiting to happen.
>> but even in case of an overflow it should be relatively easy to teach
>> any parser to deal with line .0 as indicator to restart any tracker
> Wasn't your earlier argument that trivially parsing out the line count
> prefix from a debugfs file was far too much effort and cannot possibly
> be done by a developer. Now you are saying that coping with a broken
> count is "easy to teach a parser". The one single purpose of this entire
> change is to guarantee a valid dump can be extracted from a log.
> Anything that potentially prevents that from working is a fundamental
> failure.
> 
>>
>> and it is highly unlikely that any captured logs will miss exactly
>> 65,536 contiguous lines, but even then it should be noticeable gap
> Or we could just use an integer count that is not going to wrap and be
> ambiguous.

maybe all we need is to define series/counter as:

	unsigned int series : 8;
	unsigned int counter : 24;

which will give you 16,777,215 lines and 255 series without noticeable
increasing sizeof struct drm_printer

> 
> 
>>
>>>>> +        } line;
>>>>> +    };
>>>>>    };
>>>>>       void __drm_printfn_coredump(struct drm_printer *p, struct
>>>>> va_format *vaf);
>>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p,
>>>>> const char *str);
>>>>>    void __drm_printfn_info(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>>    void __drm_printfn_dbg(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>>    void __drm_printfn_err(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>>       __printf(2, 3)
>>>>>    void drm_printf(struct drm_printer *p, const char *f, ...);
>>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer
>>>>> drm_err_printer(struct drm_device *drm,
>>>>>        return p;
>>>>>    }
>>>>>    +/**
>>>>> + * drm_line_printer - construct a &drm_printer that prefixes
>>>>> outputs with line numbers
>>>>> + * @p: the &struct drm_printer which actually generates the output
>>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>>> + * @series: optional unique series identifier, or 0 to omit
>>>>> identifier in the output
>>>>> + *
>>>>> + * This printer can be used to increase the robustness of the
>>>>> captured output
>>>>> + * to make sure we didn't lost any intermediate lines of the
>>>>> output. Helpful
>>>>> + * while capturing some crash data.
>>>>> + *
>>>>> + * Example 1::
>>>>> + *
>>>>> + *    void crash_dump(struct drm_device *drm)
>>>>> + *    {
>>>>> + *        static unsigned short id;
>>>>> + *        struct drm_printer p = drm_err_printer(drm, "crash");
>>>>> + *        struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>>> Is there any benefit or other reason to split the prefix across two
>>>> separate printers? It seems like a source of confusion. As in, the code
>> it's not any kind of the required 'split', as both printers used here
>> can treat prefix as optional if NULL, but rather a way to show how to
>> pass two potentially separated prefixes, as one of them could be already
>> prepared (like engine name or any other alias) or if the primary printer
>> does not accept any prefix at all (and this a limitation of our existing
>> custom GT oriented printers [1] [2])
>>
>> [1]
>> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L66
>> [2]
>> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L81
> As per earlier comments, my point is not that you should change the
> patch to remove one of the prefixes from the code. My point is that the
> documentation is confusing. You are explicitly splitting a single phrase
> "crash dump" across two separate printer objects with no explanation as

but who said it is single phrase "crash dump" ?

> to why. And as you just pointed out, there are many use cases where
> there would not be the option to split it. So it would be much, much
> clearer to pass NULL to your drm_err_printer example and have a single
> line comment saying that multiple prefixes could be used if allowed by
> the printer objects and if useful in the situation. Rather than having a
> bizarrely split string with no explanation as to why it has been split.

again, it wasn't a split but example how different prefixes will be
presented in final output and I assumed that average engineer could
figure out which part comes from which printer, but from your voice it
looks that using "crash" and "dump" strings as light reference to
example function name was too tricky, and you need raw example like:

	struct drm_printer p = drm_err_printer(drm, "AAA");
	struct drm_printer lp = drm_line_printer(&p, "BBB", ++id);

	[ ] 0000:00:00.0: [drm] *ERROR* AAA BBB 1.1: foo
	[ ] 0000:00:00.0: [drm] *ERROR* AAA BBB 1.2: bar

> 
>>
>>>> will allow a double prefix, there is not much you can do about that
>>>> because losing the prefix from drm_line_printer would mean no prefix at
>> but why would we loose the prefix from the primary printer ?
> I don't know what you mean by the primary printer? As above, I was

by 'primary' printer I mean the one that is passed to the
drm_line_printer and the drm_line_printer uses for actual output

> simply trying to say that I am not requesting a code change but just a
> clarification of the documentation.
> 
>>
>>>> all when not using drm_err_printer underneath. But why explicitly split
>>>> the message across both printers in the usage example? This is saying
>>>> that this is the recommended way to use the interface, but with no
>>>> explanation of why the split is required or how the split should be
>>>> done.
>> the drm_line_printer is flexible and can be used in many configurations,
>> above is just one of the potential uses that shows full output
>>
>>> You could have a printer, and then add two separate line counted blocks.
>>>
>>>     struct drm_printer p = drm_err_printer(drm, "parent");
>>>     struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>>
>>>     ...
>>>
>>>     struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>>
>>>     ...
>>>
>>> p could be defined elsewhere and passed into separate functions which
>>> each have the line printing. The two prefixes can be useful.
>> didn't considered that, but as stated above, drm_line_printer is
>> flexible and can be used in many different ways, like this new one
> And you really do not need to list them all out as massively verbose
> examples with confusing differences between them that are not explained.
> A single example plus a couple of lines of description would be much
> clearer.

but sometimes, especially in case of the formatting functions, it might
be more beneficial to actually show some true outputs, than just
describe what you might expect

> 
>>
>>>> Also, there is really no specific connection to crashes. The purpose of
>>>> this is for allowing the dumping of multi-line blocks of data. One use
>>>> is for debugging crashes. But there are many debug operations that
>>>> require the same dump and do not involve a crash. And this support is
>>>> certainly not intended to be used on general end-user GPU hangs. For
>>>> those, everything should be part of the devcoredump core file that is
>>>> produced and accessible via sysfs. We absolutely should not be dumping
>>>> huge data blocks in customer release drivers except in very extreme
>>>> circumstances.
>> if you are trying to convince me that we don't need any custom
>> drm_printer that would take care of tracking and printing line numbers
>> in kind of more robust way and instead we should be doing such line
>> prints in a error prone way on it's own as you did in [3], then sorry,
>> I'm not convinced, unless you just feel that it shouldn't be part of the
>> drm yet, but then decision is drm maintainer hands (and also in the Xe
>> maintainers who don't want to fall into i915-ish private solutions trap)
>>
>> [3] https://patchwork.freedesktop.org/patch/594021/?series=133349&rev=2
> No. I am saying that your example use case seems to be implying a much
> greater usage for this mechanism than should be expected. I'm saying
> that it should never occur in an end user system because dumping
> megabytes of data to dmesg is a bad user experience. It absolutely

IMO, it's not always a dump of megabytes is where drm_line_printer could
be beneficial, but again, it's idea was to show that you don't have to
manually modify each printf to have a line prefix and, what's more
important, don't pollute output if other printer (debugfs) will be used

> should never be a standard part of handling a GPU hang type 'crash'. The
> primary purpose is for internal debug by developers only. If a use case
> gets shipped upstream then it should be an extremely hard to hit corner
> case for which we are desperate to get any useful debug logs by any
> means necessary.

but there are many tools that we shouldn't over use in production
systems but still we do have them defined as common code

> 
> As for error prone, I am not seeing how the original trivial (and
> working) code is error prone but this complex abstraction of it is less
> so. Especially given the integer truncation problem. I mean seriously,
> how 'error prone' can it be to add a "%d, line++" to a print?! And how

what if in the future someone else add new printf() but misses or
misspells that %d, line++ in format line ?

> much of a 'private solutions trap' is it to add such a trivial prefix to
> a couple of prints in a single function that is really a big ugly hack
> for getting logs out in a really dodgy manner anyway?
> 
> As you say, it is up to the DRM maintainers as to whether they want this
> support in the generic DRM layers. If it lands and it is functional
> (i.e. does not break its sole reason for being by truncating counts
> partway through a dump) then sure, I'll use it. I just don't see that it
> is even remotely worth the effort given that it is single use only and
> given how trivial the local version is.

if collecting GuC log over dmesg is so helpful/important in some
situations then likely similar solution could be beneficial on i915, no?

> 
> John.
> 
>>
>>> A devcoredump implementation could use a drm_printer too?
>>>
>>> Is this only about bikeshedding the example? I'm not sure I get the
>>> point here.
>>>
>>>>> + *
>>>>> + *        drm_printf(&lp, "foo");
>>>>> + *        drm_printf(&lp, "bar");
>>>>> + *    }
>>>>> + *
>>>>> + * Above code will print into the dmesg something like::
>>>>> + *
>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>>> + *
>>>>> + * Example 2::
>>>>> + *
>>>>> + *    void line_dump(struct device *dev)
>>>>> + *    {
>>>>> + *        struct drm_printer p = drm_info_printer(dev);
>>>>> + *        struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>>> + *
>>>>> + *        drm_printf(&lp, "foo");
>>>>> + *        drm_printf(&lp, "bar");
>>>>> + *    }
>>>>> + *
>>>>> + * Above code will print::
>>>>> + *
>>>>> + *    [ ] 0000:00:00.0: [drm] 1: foo
>>>>> + *    [ ] 0000:00:00.0: [drm] 2: bar
>>>> Not really seeing the point of having two examples listed. The first
>>>> includes all the optional extras, the second is just repeating with no
>>>> new information.
>>> You see how the "series" param behaves?
>> exactly
>>
>>> BR,
>>> Jani.
>>>
>>>> John.
>>>>
>>>>> + *
>>>>> + * RETURNS:
>>>>> + * The &drm_printer object
>>>>> + */
>>>>> +static inline struct drm_printer drm_line_printer(struct
>>>>> drm_printer *p,
>>>>> +                          const char *prefix,
>>>>> +                          unsigned short series)
>>>>> +{
>>>>> +    struct drm_printer lp = {
>>>>> +        .printfn = __drm_printfn_line,
>>>>> +        .arg = p,
>>>>> +        .prefix = prefix,
>>>>> +        .line = { .series = series, },
>>>>> +    };
>>>>> +    return lp;
>>>>> +}
>>>>> +
>>>>>    /*
>>>>>     * struct device based logging
>>>>>     *
>

Michal Wajdeczko May 30, 2024, 9:27 p.m. UTC | #9

On 30.05.2024 20:47, John Harrison wrote:
> On 5/30/2024 00:49, Jani Nikula wrote:
>> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>> This drm printer wrapper can be used to increase the robustness of
>>>> the captured output generated by any other drm_printer to make sure
>>>> we didn't lost any intermediate lines of the output by adding line
>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>
>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>> ---
>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>       don't use 'dp' as name, prefer 'p' (Jani)
>>>>       add support for unique series identifier (John)
>>>> ---
>>>>    drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>    include/drm/drm_print.h     | 68
>>>> ++++++++++++++++++++++++++++++++++++-
>>>>    2 files changed, 81 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>> --- a/drivers/gpu/drm/drm_print.c
>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p,
>>>> struct va_format *vaf)
>>>>    }
>>>>    EXPORT_SYMBOL(__drm_printfn_err);
>>>>    +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>> *vaf)
>>>> +{
>>>> +    unsigned int counter = ++p->line.counter;
>>> Wrong units, but see below anyway...
>>>
>>>> +    const char *prefix = p->prefix ?: "";
>>>> +    const char *pad = p->prefix ? " " : "";
>>>> +
>>>> +    if (p->line.series)
>>>> +        drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>> +               prefix, pad, p->line.series, counter, vaf);
>>>> +    else
>>>> +        drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>> +}
>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>> +
>>>>    /**
>>>>     * drm_puts - print a const string to a &drm_printer stream
>>>>     * @p: the &drm printer
>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>> index 089950ad8681..f4d9b98d7909 100644
>>>> --- a/include/drm/drm_print.h
>>>> +++ b/include/drm/drm_print.h
>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>        void (*puts)(struct drm_printer *p, const char *str);
>>>>        void *arg;
>>>>        const char *prefix;
>>>> -    enum drm_debug_category category;
>>>> +    union {
>>>> +        enum drm_debug_category category;
>>>> +        struct {
>>>> +            unsigned short series;
>>>> +            unsigned short counter;
>>> Any particular reason for using 'short' rather than 'int'? Short is only
>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>> log (and minimum sizes for the other sections) when dumped via the
>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>>> goes down a lot when you start using longer lines for the dump, but it
>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>> an overflow just waiting to happen.
>>>
>>>> +        } line;
>>>> +    };
>>>>    };
>>>>       void __drm_printfn_coredump(struct drm_printer *p, struct
>>>> va_format *vaf);
>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p,
>>>> const char *str);
>>>>    void __drm_printfn_info(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>>    void __drm_printfn_dbg(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>>    void __drm_printfn_err(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>>       __printf(2, 3)
>>>>    void drm_printf(struct drm_printer *p, const char *f, ...);
>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer
>>>> drm_err_printer(struct drm_device *drm,
>>>>        return p;
>>>>    }
>>>>    +/**
>>>> + * drm_line_printer - construct a &drm_printer that prefixes
>>>> outputs with line numbers
>>>> + * @p: the &struct drm_printer which actually generates the output
>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>> + * @series: optional unique series identifier, or 0 to omit
>>>> identifier in the output
>>>> + *
>>>> + * This printer can be used to increase the robustness of the
>>>> captured output
>>>> + * to make sure we didn't lost any intermediate lines of the
>>>> output. Helpful
>>>> + * while capturing some crash data.
>>>> + *
>>>> + * Example 1::
>>>> + *
>>>> + *    void crash_dump(struct drm_device *drm)
>>>> + *    {
>>>> + *        static unsigned short id;
>>>> + *        struct drm_printer p = drm_err_printer(drm, "crash");
>>>> + *        struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>> Is there any benefit or other reason to split the prefix across two
>>> separate printers? It seems like a source of confusion. As in, the code
>>> will allow a double prefix, there is not much you can do about that
>>> because losing the prefix from drm_line_printer would mean no prefix at
>>> all when not using drm_err_printer underneath. But why explicitly split
>>> the message across both printers in the usage example? This is saying
>>> that this is the recommended way to use the interface, but with no
>>> explanation of why the split is required or how the split should be
>>> done.
>> You could have a printer, and then add two separate line counted blocks.
>>
>>     struct drm_printer p = drm_err_printer(drm, "parent");
>>     struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>
>>     ...
>>
>>     struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>
>>     ...
>>
>> p could be defined elsewhere and passed into separate functions which
>> each have the line printing. The two prefixes can be useful.
> Except you can't have a multi-level prefix if using the info level
> printer as that does not take a prefix. And I'm really not seeing a

but it's up to you which printer you choose as 'primary' printer that
will render final output

> reason why you would want the line count to restart in the middle of a
> single atomic dump operation.
> 
>>
>>> Also, there is really no specific connection to crashes. The purpose of
>>> this is for allowing the dumping of multi-line blocks of data. One use
>>> is for debugging crashes. But there are many debug operations that
>>> require the same dump and do not involve a crash. And this support is
>>> certainly not intended to be used on general end-user GPU hangs. For
>>> those, everything should be part of the devcoredump core file that is
>>> produced and accessible via sysfs. We absolutely should not be dumping
>>> huge data blocks in customer release drivers except in very extreme
>>> circumstances.
>> A devcoredump implementation could use a drm_printer too?
> You mean for reading the devcoredump file from sysfs? Except that the
> whole reason this was started was because Michal absolutely refuses to
> allow line counted output in a sysfs/debugfs file because "it is
> unnecessary and breaks the decoding tools".
> 
>>
>> Is this only about bikeshedding the example? I'm not sure I get the
>> point here.
> I would call it getting accurate and understandable documentation.
> 
> The existing example is splitting what should be an atomic name "crash
> dump" across two separate line printer objects. That is something that
> is so unrealistic that it implies there is a required reason to break
> the prefix up. Documentation that is ambiguous and confusing is
> potentially worse than no documentation at all.
> 

all below setups will render the same output.
which one, in your opinion, shows all potential of the drm_line_printer?

	struct drm_printer p = drm_err_printer(drm, "AAA BBB");
	struct drm_printer lp = drm_line_printer(&p, NULL, id);

	struct drm_printer p = drm_err_printer(drm, NULL);
	struct drm_printer lp = drm_line_printer(&p, "AAA BBB", id);

	struct drm_printer p = drm_err_printer(drm, "AAA");
	struct drm_printer lp = drm_line_printer(&p, "BBB", ++id);

> 
>>
>>>> + *
>>>> + *        drm_printf(&lp, "foo");
>>>> + *        drm_printf(&lp, "bar");
>>>> + *    }
>>>> + *
>>>> + * Above code will print into the dmesg something like::
>>>> + *
>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>> + *
>>>> + * Example 2::
>>>> + *
>>>> + *    void line_dump(struct device *dev)
>>>> + *    {
>>>> + *        struct drm_printer p = drm_info_printer(dev);
>>>> + *        struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>> + *
>>>> + *        drm_printf(&lp, "foo");
>>>> + *        drm_printf(&lp, "bar");
>>>> + *    }
>>>> + *
>>>> + * Above code will print::
>>>> + *
>>>> + *    [ ] 0000:00:00.0: [drm] 1: foo
>>>> + *    [ ] 0000:00:00.0: [drm] 2: bar
>>> Not really seeing the point of having two examples listed. The first
>>> includes all the optional extras, the second is just repeating with no
>>> new information.
>> You see how the "series" param behaves?
> The second example doesn't have a series parameter. If the only purpose
> is to say "the print of the series value is suppressed if zero" then why
> not just have that one line? Documentation should also be concise.

it was already documented in that way:

@series: optional unique series identifier, or 0 to omit identifier in
the output

> 
> John.
> 
>>
>> BR,
>> Jani.
>>
>>> John.
>>>
>>>> + *
>>>> + * RETURNS:
>>>> + * The &drm_printer object
>>>> + */
>>>> +static inline struct drm_printer drm_line_printer(struct
>>>> drm_printer *p,
>>>> +                          const char *prefix,
>>>> +                          unsigned short series)
>>>> +{
>>>> +    struct drm_printer lp = {
>>>> +        .printfn = __drm_printfn_line,
>>>> +        .arg = p,
>>>> +        .prefix = prefix,
>>>> +        .line = { .series = series, },
>>>> +    };
>>>> +    return lp;
>>>> +}
>>>> +
>>>>    /*
>>>>     * struct device based logging
>>>>     *
>

John Harrison June 14, 2024, 6:37 p.m. UTC | #10

On 5/30/2024 12:37, Jani Nikula wrote:
> On Thu, 30 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>> Except that the whole reason this was started was because Michal
>> absolutely refuses to allow line counted output in a sysfs/debugfs
>> file because "it is unnecessary and breaks the decoding tools".
> I'm only looking at this patch at face value, it seems useful to me, but
> I'm not aware of any deeper context. And judging by the tone of the
> discussion, I also don't want to get into it.
>
> That said, might be prudent to post a patch using the line printer along
> with its addition.
See:
     https://patchwork.freedesktop.org/series/134695/

Patches #5 and #6 make use of the line printer. Although as mentioned in 
that patch set, the usage there is still just a stop-gap measure and 
both those usages are intended to be replaced once the issues with the 
devcoredump version are resolved. The ultimate aim would be to use the 
line printer from just one function which dumps a devcoredump capture to 
dmesg. That function can then be used from the dead CT code and any 
other random debug effort a random developer is working on locally.

John.

>
> Over and out.
>
>
> BR,
> Jani.
>

John Harrison June 14, 2024, 6:45 p.m. UTC | #11

On 5/30/2024 14:27, Michal Wajdeczko wrote:
> On 30.05.2024 20:47, John Harrison wrote:
>> On 5/30/2024 00:49, Jani Nikula wrote:
>>> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>>> This drm printer wrapper can be used to increase the robustness of
>>>>> the captured output generated by any other drm_printer to make sure
>>>>> we didn't lost any intermediate lines of the output by adding line
>>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>>
>>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>>> ---
>>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>>        don't use 'dp' as name, prefer 'p' (Jani)
>>>>>        add support for unique series identifier (John)
>>>>> ---
>>>>>     drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>>     include/drm/drm_print.h     | 68
>>>>> ++++++++++++++++++++++++++++++++++++-
>>>>>     2 files changed, 81 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>>> --- a/drivers/gpu/drm/drm_print.c
>>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p,
>>>>> struct va_format *vaf)
>>>>>     }
>>>>>     EXPORT_SYMBOL(__drm_printfn_err);
>>>>>     +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>>> *vaf)
>>>>> +{
>>>>> +    unsigned int counter = ++p->line.counter;
>>>> Wrong units, but see below anyway...
>>>>
>>>>> +    const char *prefix = p->prefix ?: "";
>>>>> +    const char *pad = p->prefix ? " " : "";
>>>>> +
>>>>> +    if (p->line.series)
>>>>> +        drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>>> +               prefix, pad, p->line.series, counter, vaf);
>>>>> +    else
>>>>> +        drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>>> +}
>>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>>> +
>>>>>     /**
>>>>>      * drm_puts - print a const string to a &drm_printer stream
>>>>>      * @p: the &drm printer
>>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>>> index 089950ad8681..f4d9b98d7909 100644
>>>>> --- a/include/drm/drm_print.h
>>>>> +++ b/include/drm/drm_print.h
>>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>>         void (*puts)(struct drm_printer *p, const char *str);
>>>>>         void *arg;
>>>>>         const char *prefix;
>>>>> -    enum drm_debug_category category;
>>>>> +    union {
>>>>> +        enum drm_debug_category category;
>>>>> +        struct {
>>>>> +            unsigned short series;
>>>>> +            unsigned short counter;
>>>> Any particular reason for using 'short' rather than 'int'? Short is only
>>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>>> log (and minimum sizes for the other sections) when dumped via the
>>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>>>> goes down a lot when you start using longer lines for the dump, but it
>>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>>> an overflow just waiting to happen.
>>>>
>>>>> +        } line;
>>>>> +    };
>>>>>     };
>>>>>        void __drm_printfn_coredump(struct drm_printer *p, struct
>>>>> va_format *vaf);
>>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p,
>>>>> const char *str);
>>>>>     void __drm_printfn_info(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>>     void __drm_printfn_dbg(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>>     void __drm_printfn_err(struct drm_printer *p, struct va_format
>>>>> *vaf);
>>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>>>        __printf(2, 3)
>>>>>     void drm_printf(struct drm_printer *p, const char *f, ...);
>>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer
>>>>> drm_err_printer(struct drm_device *drm,
>>>>>         return p;
>>>>>     }
>>>>>     +/**
>>>>> + * drm_line_printer - construct a &drm_printer that prefixes
>>>>> outputs with line numbers
>>>>> + * @p: the &struct drm_printer which actually generates the output
>>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>>> + * @series: optional unique series identifier, or 0 to omit
>>>>> identifier in the output
>>>>> + *
>>>>> + * This printer can be used to increase the robustness of the
>>>>> captured output
>>>>> + * to make sure we didn't lost any intermediate lines of the
>>>>> output. Helpful
>>>>> + * while capturing some crash data.
>>>>> + *
>>>>> + * Example 1::
>>>>> + *
>>>>> + *    void crash_dump(struct drm_device *drm)
>>>>> + *    {
>>>>> + *        static unsigned short id;
>>>>> + *        struct drm_printer p = drm_err_printer(drm, "crash");
>>>>> + *        struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>>> Is there any benefit or other reason to split the prefix across two
>>>> separate printers? It seems like a source of confusion. As in, the code
>>>> will allow a double prefix, there is not much you can do about that
>>>> because losing the prefix from drm_line_printer would mean no prefix at
>>>> all when not using drm_err_printer underneath. But why explicitly split
>>>> the message across both printers in the usage example? This is saying
>>>> that this is the recommended way to use the interface, but with no
>>>> explanation of why the split is required or how the split should be
>>>> done.
>>> You could have a printer, and then add two separate line counted blocks.
>>>
>>>      struct drm_printer p = drm_err_printer(drm, "parent");
>>>      struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>>
>>>      ...
>>>
>>>      struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>>
>>>      ...
>>>
>>> p could be defined elsewhere and passed into separate functions which
>>> each have the line printing. The two prefixes can be useful.
>> Except you can't have a multi-level prefix if using the info level
>> printer as that does not take a prefix. And I'm really not seeing a
> but it's up to you which printer you choose as 'primary' printer that
> will render final output
Not really. The choice of printer is dictated by what the code is 
needing to do not by how the developer is feeling on that particular 
day. And this isn't about some random developer's situation, it is about 
making a clear and concise example to show said random developer how 
this feature should be used. And that means not using every possible 
feature simply because it is there even if that usage makes no actual sense.

>
>> reason why you would want the line count to restart in the middle of a
>> single atomic dump operation.
>>
>>>> Also, there is really no specific connection to crashes. The purpose of
>>>> this is for allowing the dumping of multi-line blocks of data. One use
>>>> is for debugging crashes. But there are many debug operations that
>>>> require the same dump and do not involve a crash. And this support is
>>>> certainly not intended to be used on general end-user GPU hangs. For
>>>> those, everything should be part of the devcoredump core file that is
>>>> produced and accessible via sysfs. We absolutely should not be dumping
>>>> huge data blocks in customer release drivers except in very extreme
>>>> circumstances.
>>> A devcoredump implementation could use a drm_printer too?
>> You mean for reading the devcoredump file from sysfs? Except that the
>> whole reason this was started was because Michal absolutely refuses to
>> allow line counted output in a sysfs/debugfs file because "it is
>> unnecessary and breaks the decoding tools".
>>
>>> Is this only about bikeshedding the example? I'm not sure I get the
>>> point here.
>> I would call it getting accurate and understandable documentation.
>>
>> The existing example is splitting what should be an atomic name "crash
>> dump" across two separate line printer objects. That is something that
>> is so unrealistic that it implies there is a required reason to break
>> the prefix up. Documentation that is ambiguous and confusing is
>> potentially worse than no documentation at all.
>>
> all below setups will render the same output.
> which one, in your opinion, shows all potential of the drm_line_printer?
>
> 	struct drm_printer p = drm_err_printer(drm, "AAA BBB");
> 	struct drm_printer lp = drm_line_printer(&p, NULL, id);
>
> 	struct drm_printer p = drm_err_printer(drm, NULL);
> 	struct drm_printer lp = drm_line_printer(&p, "AAA BBB", id);
>
> 	struct drm_printer p = drm_err_printer(drm, "AAA");
> 	struct drm_printer lp = drm_line_printer(&p, "BBB", ++id);
As I keep saying, this isn't about showing every possible permutation of 
the feature. It is about giving a clear and concise example that 
demonstrates typical usage. Further non-typical (but still trivial to 
explain) usage can just be described in a line or two of documentation 
without needing dozens of lines of example code. And given that the 
example is meant to be 'how to use line printer' not 'how to use random 
other printer', the middle of those three seems the clearest to me. 
Although you seem to be missing the ++?

>
>>>>> + *
>>>>> + *        drm_printf(&lp, "foo");
>>>>> + *        drm_printf(&lp, "bar");
>>>>> + *    }
>>>>> + *
>>>>> + * Above code will print into the dmesg something like::
>>>>> + *
>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>>> + *
>>>>> + * Example 2::
>>>>> + *
>>>>> + *    void line_dump(struct device *dev)
>>>>> + *    {
>>>>> + *        struct drm_printer p = drm_info_printer(dev);
>>>>> + *        struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>>> + *
>>>>> + *        drm_printf(&lp, "foo");
>>>>> + *        drm_printf(&lp, "bar");
>>>>> + *    }
>>>>> + *
>>>>> + * Above code will print::
>>>>> + *
>>>>> + *    [ ] 0000:00:00.0: [drm] 1: foo
>>>>> + *    [ ] 0000:00:00.0: [drm] 2: bar
>>>> Not really seeing the point of having two examples listed. The first
>>>> includes all the optional extras, the second is just repeating with no
>>>> new information.
>>> You see how the "series" param behaves?
>> The second example doesn't have a series parameter. If the only purpose
>> is to say "the print of the series value is suppressed if zero" then why
>> not just have that one line? Documentation should also be concise.
> it was already documented in that way:
>
> @series: optional unique series identifier, or 0 to omit identifier in
> the output
Precisely my point. Why do you need sixteen lines of almost totally 
duplicated example when you have one line of perfectly clear and 
unambiguous documentation already?

John.

>
>> John.
>>
>>> BR,
>>> Jani.
>>>
>>>> John.
>>>>
>>>>> + *
>>>>> + * RETURNS:
>>>>> + * The &drm_printer object
>>>>> + */
>>>>> +static inline struct drm_printer drm_line_printer(struct
>>>>> drm_printer *p,
>>>>> +                          const char *prefix,
>>>>> +                          unsigned short series)
>>>>> +{
>>>>> +    struct drm_printer lp = {
>>>>> +        .printfn = __drm_printfn_line,
>>>>> +        .arg = p,
>>>>> +        .prefix = prefix,
>>>>> +        .line = { .series = series, },
>>>>> +    };
>>>>> +    return lp;
>>>>> +}
>>>>> +
>>>>>     /*
>>>>>      * struct device based logging
>>>>>      *

John Harrison June 14, 2024, 7:55 p.m. UTC | #12

On 5/30/2024 14:09, Michal Wajdeczko wrote:
> On 30.05.2024 20:47, John Harrison wrote:
>> On 5/30/2024 02:33, Michal Wajdeczko wrote:
>>> On 30.05.2024 09:49, Jani Nikula wrote:
>>>> On Wed, 29 May 2024, John Harrison <john.c.harrison@intel.com> wrote:
>>>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>>>> This drm printer wrapper can be used to increase the robustness of
>>>>>> the captured output generated by any other drm_printer to make sure
>>>>>> we didn't lost any intermediate lines of the output by adding line
>>>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>>>
>>>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>>>> Cc: Jani Nikula <jani.nikula@intel.com>
>>>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>>>> ---
>>>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>>>        don't use 'dp' as name, prefer 'p' (Jani)
>>>>>>        add support for unique series identifier (John)
>>>>>> ---
>>>>>>     drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>>>     include/drm/drm_print.h     | 68
>>>>>> ++++++++++++++++++++++++++++++++++++-
>>>>>>     2 files changed, 81 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>>>> --- a/drivers/gpu/drm/drm_print.c
>>>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p,
>>>>>> struct va_format *vaf)
>>>>>>     }
>>>>>>     EXPORT_SYMBOL(__drm_printfn_err);
>>>>>>     +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>>>> *vaf)
>>>>>> +{
>>>>>> +    unsigned int counter = ++p->line.counter;
>>>>> Wrong units, but see below anyway...
>>> it really doesn't matter as it is temporary var used in printf()
>>> actual 'short' counter will wrap on its own unit boundary
>> It should still match. Otherwise the code is ambiguous. Was it meant to
>> be an int? Was it meant to be a short? Just because code compiles
>> doesn't mean it is good.
> it is meant to be "unsigned int" as it is more than "short" counter that
> is initialized from and it will printed in printf() as %u
>
> I really don't get what is wrong here
The fact that you have '<type A> = <type B>;'. That generally implies a 
programming error because types are supposed to match unless there is a 
good reason and an explicit cast to show that the programmer meant the 
change.

>
>>>>>> +    const char *prefix = p->prefix ?: "";
>>>>>> +    const char *pad = p->prefix ? " " : "";
>>>>>> +
>>>>>> +    if (p->line.series)
>>>>>> +        drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>>>> +               prefix, pad, p->line.series, counter, vaf);
>>>>>> +    else
>>>>>> +        drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>>>> +
>>>>>>     /**
>>>>>>      * drm_puts - print a const string to a &drm_printer stream
>>>>>>      * @p: the &drm printer
>>>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>>>> index 089950ad8681..f4d9b98d7909 100644
>>>>>> --- a/include/drm/drm_print.h
>>>>>> +++ b/include/drm/drm_print.h
>>>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>>>         void (*puts)(struct drm_printer *p, const char *str);
>>>>>>         void *arg;
>>>>>>         const char *prefix;
>>>>>> -    enum drm_debug_category category;
>>>>>> +    union {
>>>>>> +        enum drm_debug_category category;
>>>>>> +        struct {
>>>>>> +            unsigned short series;
>>>>>> +            unsigned short counter;
>>>>> Any particular reason for using 'short' rather than 'int'? Short is
>>>>> only
>>> didn't want to increase the size of the struct drm_printer and with
>>> little luck sizeof two shorts will be less/equal sizeof enum
>> Depending on the compiler, the architecture and what values have been
>> defined within it, an enum is possibly (likely?) to be a char.
> except that is usually a int [1]
>
> but series/counter could be defined as long long int if you really want
> and don't care about struct size
Personally, I don't care about the structure size. I care about the 
output not being ambiguous. I don't know if there is any particular 
reason why this structure's size is important. You would need to ask 
someone who knows more about DRM in general to answer that. You seem to 
think it is critically important though, given that you are adding 
unions and shorts and sacrificing data safety and ultimate usability for 
the sake of a few bytes. Where is your reasoning for that coming from?

>
> [1] https://en.cppreference.com/w/c/language/enum
"If it is not explicitly specified, the underlying type is the 
enumeration’s compatible type, which is either a signed or unsigned 
integer type, or char."

>
>>
>>>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>>>> log (and minimum sizes for the other sections) when dumped via the
>>>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>>> if your capture relies on collecting all 1,245,444 lines then likely you
>>> have other problem that needs solving than line counter overflow
>> Have you ever used a full 16MB GuC log? And read it out via debugfs?
> quite frequently over last 6+ years
>
>> Then that was 1.2 million lines of text that you read out. Did you have
>> other problems that meant reading that file was a waste of your time? Or
>> did it allow you to debug the issue you were working on?
> I read your reply about 1,245,444 lines in context of limitations of
> drm_line_printer planned to be used for dmesg output not about the
> debugfs output
It is the same output. It doesn't matter whether you are reading from 
debugfs or dumping to dmesg. It is the same GuC log and is the same 1.2m 
lines of output. As per the other patch series, I am trying to reduce 
that by using wider lines and such but that only gets you so far. This 
is still fundamentally about spamming huge amounts of output to dmesg 
and needing to get that out with 100% reliability or it is 100% useless.

>
>> The purpose of this patch is to 'improve' the fully working version that
>> was already posted. Causing unwanted wraps in the line count is not an
>> improvement. It is very definitely a bug. And now your argument is that
>> we shouldn't be doing this in the first place? That's a given! Dumping
>> huge streams of data to dmesg is a total hack. We absolutely should not
>> be doing it. But we have no choice because there is no other way
>> (without adding even bigger and more complicated mechanisms involving
>> external debug modules or something).
> my point was that blindly printing 1,245,444 lines of hex data to dmesg
> is rather sub-optimal way to get 'crash data' (as if it wouldn't be a
> crash then likely collecting log over debugfs/devcoredump should work)
>
> one of the idea that could minimize size of collected log data could be
> to actually try to decode it partially and copy only last N entries
> (yes, I know it requires extra development, but maybe in return we will
> be less spamming the dmesg)
Or just use a smaller GuC log buffer in the first place as a trivial way 
to achieve the same result? Sure, that's great until you realise that 
you need N+100 entries to see what went wrong.

The one and only purpose of this is to allow the debugging of very hard 
to reproduce problems. And they are not always a 'crash'. It is by far 
the easiest way to get logs out from a failing self test. Sometimes this 
is the only way to get meaningful logs out of our CI system :(. There 
are numerous reasons why it is useful and there is no reason at all to 
limit it just because it is unwieldy. I know it is unwieldy. It is truly 
horrid that we have to do this. It is an evil hack. But sometimes it is 
the only thing that works. And anything that makes it less useful is 
defeating the whole point of it.

>
>>>>> goes down a lot when you start using longer lines for the dump, but it
>>>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>>>> an overflow just waiting to happen.
>>> but even in case of an overflow it should be relatively easy to teach
>>> any parser to deal with line .0 as indicator to restart any tracker
>> Wasn't your earlier argument that trivially parsing out the line count
>> prefix from a debugfs file was far too much effort and cannot possibly
>> be done by a developer. Now you are saying that coping with a broken
>> count is "easy to teach a parser". The one single purpose of this entire
>> change is to guarantee a valid dump can be extracted from a log.
>> Anything that potentially prevents that from working is a fundamental
>> failure.
>>
>>> and it is highly unlikely that any captured logs will miss exactly
>>> 65,536 contiguous lines, but even then it should be noticeable gap
>> Or we could just use an integer count that is not going to wrap and be
>> ambiguous.
> maybe all we need is to define series/counter as:
>
> 	unsigned int series : 8;
> 	unsigned int counter : 24;
>
> which will give you 16,777,215 lines and 255 series without noticeable
> increasing sizeof struct drm_printer
Sure. Although if you are so desperate not to increase the size of the 
DRM structure and and make bad things happen to the rest of the DRM 
drivers, then why bother putting this into the DRM layer at all? Given 
that it is only ever going to be used by one function in the Xe driver 
and can be trivially coded locally in the Xe driver with no adverse 
affects to any other DRM based driver...


>
>>
>>>>>> +        } line;
>>>>>> +    };
>>>>>>     };
>>>>>>        void __drm_printfn_coredump(struct drm_printer *p, struct
>>>>>> va_format *vaf);
>>>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p,
>>>>>> const char *str);
>>>>>>     void __drm_printfn_info(struct drm_printer *p, struct va_format
>>>>>> *vaf);
>>>>>>     void __drm_printfn_dbg(struct drm_printer *p, struct va_format
>>>>>> *vaf);
>>>>>>     void __drm_printfn_err(struct drm_printer *p, struct va_format
>>>>>> *vaf);
>>>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>>>> *vaf);
>>>>>>        __printf(2, 3)
>>>>>>     void drm_printf(struct drm_printer *p, const char *f, ...);
>>>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer
>>>>>> drm_err_printer(struct drm_device *drm,
>>>>>>         return p;
>>>>>>     }
>>>>>>     +/**
>>>>>> + * drm_line_printer - construct a &drm_printer that prefixes
>>>>>> outputs with line numbers
>>>>>> + * @p: the &struct drm_printer which actually generates the output
>>>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>>>> + * @series: optional unique series identifier, or 0 to omit
>>>>>> identifier in the output
>>>>>> + *
>>>>>> + * This printer can be used to increase the robustness of the
>>>>>> captured output
>>>>>> + * to make sure we didn't lost any intermediate lines of the
>>>>>> output. Helpful
>>>>>> + * while capturing some crash data.
>>>>>> + *
>>>>>> + * Example 1::
>>>>>> + *
>>>>>> + *    void crash_dump(struct drm_device *drm)
>>>>>> + *    {
>>>>>> + *        static unsigned short id;
>>>>>> + *        struct drm_printer p = drm_err_printer(drm, "crash");
>>>>>> + *        struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>>>> Is there any benefit or other reason to split the prefix across two
>>>>> separate printers? It seems like a source of confusion. As in, the code
>>> it's not any kind of the required 'split', as both printers used here
>>> can treat prefix as optional if NULL, but rather a way to show how to
>>> pass two potentially separated prefixes, as one of them could be already
>>> prepared (like engine name or any other alias) or if the primary printer
>>> does not accept any prefix at all (and this a limitation of our existing
>>> custom GT oriented printers [1] [2])
>>>
>>> [1]
>>> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L66
>>> [2]
>>> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/xe/xe_gt_printk.h#L81
>> As per earlier comments, my point is not that you should change the
>> patch to remove one of the prefixes from the code. My point is that the
>> documentation is confusing. You are explicitly splitting a single phrase
>> "crash dump" across two separate printer objects with no explanation as
> but who said it is single phrase "crash dump" ?
Because from the single context of the two lines above, why would it not 
be the phrase "crash dump"?

>
>> to why. And as you just pointed out, there are many use cases where
>> there would not be the option to split it. So it would be much, much
>> clearer to pass NULL to your drm_err_printer example and have a single
>> line comment saying that multiple prefixes could be used if allowed by
>> the printer objects and if useful in the situation. Rather than having a
>> bizarrely split string with no explanation as to why it has been split.
> again, it wasn't a split but example how different prefixes will be
> presented in final output and I assumed that average engineer could
> figure out which part comes from which printer, but from your voice it
> looks that using "crash" and "dump" strings as light reference to
> example function name was too tricky, and you need raw example like:
>
> 	struct drm_printer p = drm_err_printer(drm, "AAA");
> 	struct drm_printer lp = drm_line_printer(&p, "BBB", ++id);
>
> 	[ ] 0000:00:00.0: [drm] *ERROR* AAA BBB 1.1: foo
> 	[ ] 0000:00:00.0: [drm] *ERROR* AAA BBB 1.2: bar
It is exceedingly obvious what string is coming from what printer. My 
point is that your example was using strings which a reasonable person 
might construe as a single phrase but gave no explanation why that 
phrase was being split across to separate print helpers. Even in this 
example, it is still confusing as to why a developer would want to split 
"AAA BBB" across the two helpers. If you add a second line printer "CCC" 
then it would be clearer that you are meaning to sub-divide the output 
into A.B and A.C rather than just split the single AB into A.B for no 
apparent reason. But again, that is just an excessively verbose/complex 
way of saying "the top level printer can also add its own prefix if 
desired". Especially given that not all top level printers can even take 
a prefix.


>
>>>>> will allow a double prefix, there is not much you can do about that
>>>>> because losing the prefix from drm_line_printer would mean no prefix at
>>> but why would we loose the prefix from the primary printer ?
>> I don't know what you mean by the primary printer? As above, I was
> by 'primary' printer I mean the one that is passed to the
> drm_line_printer and the drm_line_printer uses for actual output
>
>> simply trying to say that I am not requesting a code change but just a
>> clarification of the documentation.
>>
>>>>> all when not using drm_err_printer underneath. But why explicitly split
>>>>> the message across both printers in the usage example? This is saying
>>>>> that this is the recommended way to use the interface, but with no
>>>>> explanation of why the split is required or how the split should be
>>>>> done.
>>> the drm_line_printer is flexible and can be used in many configurations,
>>> above is just one of the potential uses that shows full output
>>>
>>>> You could have a printer, and then add two separate line counted blocks.
>>>>
>>>>      struct drm_printer p = drm_err_printer(drm, "parent");
>>>>      struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>>>
>>>>      ...
>>>>
>>>>      struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>>>
>>>>      ...
>>>>
>>>> p could be defined elsewhere and passed into separate functions which
>>>> each have the line printing. The two prefixes can be useful.
>>> didn't considered that, but as stated above, drm_line_printer is
>>> flexible and can be used in many different ways, like this new one
>> And you really do not need to list them all out as massively verbose
>> examples with confusing differences between them that are not explained.
>> A single example plus a couple of lines of description would be much
>> clearer.
> but sometimes, especially in case of the formatting functions, it might
> be more beneficial to actually show some true outputs, than just
> describe what you might expect
Sure, but one example usage is sufficient for that. You don't need reams 
of example code to demonstrate completely trivial features.

>
>>>>> Also, there is really no specific connection to crashes. The purpose of
>>>>> this is for allowing the dumping of multi-line blocks of data. One use
>>>>> is for debugging crashes. But there are many debug operations that
>>>>> require the same dump and do not involve a crash. And this support is
>>>>> certainly not intended to be used on general end-user GPU hangs. For
>>>>> those, everything should be part of the devcoredump core file that is
>>>>> produced and accessible via sysfs. We absolutely should not be dumping
>>>>> huge data blocks in customer release drivers except in very extreme
>>>>> circumstances.
>>> if you are trying to convince me that we don't need any custom
>>> drm_printer that would take care of tracking and printing line numbers
>>> in kind of more robust way and instead we should be doing such line
>>> prints in a error prone way on it's own as you did in [3], then sorry,
>>> I'm not convinced, unless you just feel that it shouldn't be part of the
>>> drm yet, but then decision is drm maintainer hands (and also in the Xe
>>> maintainers who don't want to fall into i915-ish private solutions trap)
>>>
>>> [3] https://patchwork.freedesktop.org/patch/594021/?series=133349&rev=2
>> No. I am saying that your example use case seems to be implying a much
>> greater usage for this mechanism than should be expected. I'm saying
>> that it should never occur in an end user system because dumping
>> megabytes of data to dmesg is a bad user experience. It absolutely
> IMO, it's not always a dump of megabytes is where drm_line_printer could
> be beneficial, but again, it's idea was to show that you don't have to
> manually modify each printf to have a line prefix and, what's more
> important, don't pollute output if other printer (debugfs) will be used
So add the line count to the top level kernel printk implementation and 
have it present on all kernel output.

And as already discussed, there is still potential advantage to having 
the line count even in the debugfs file.

>
>> should never be a standard part of handling a GPU hang type 'crash'. The
>> primary purpose is for internal debug by developers only. If a use case
>> gets shipped upstream then it should be an extremely hard to hit corner
>> case for which we are desperate to get any useful debug logs by any
>> means necessary.
> but there are many tools that we shouldn't over use in production
> systems but still we do have them defined as common code
But something like BUG() has thousands of instances across the kernel 
and has no impact at all in a non-debug build. Whereas this will have 
exactly one usage instance and does add complexity and memory usage to 
all builds.

>
>> As for error prone, I am not seeing how the original trivial (and
>> working) code is error prone but this complex abstraction of it is less
>> so. Especially given the integer truncation problem. I mean seriously,
>> how 'error prone' can it be to add a "%d, line++" to a print?! And how
> what if in the future someone else add new printf() but misses or
> misspells that %d, line++ in format line ?
Sure, it is technically possible to put a bug in any piece of code. But 
it is hardly 'error prone'.

>
>> much of a 'private solutions trap' is it to add such a trivial prefix to
>> a couple of prints in a single function that is really a big ugly hack
>> for getting logs out in a really dodgy manner anyway?
>>
>> As you say, it is up to the DRM maintainers as to whether they want this
>> support in the generic DRM layers. If it lands and it is functional
>> (i.e. does not break its sole reason for being by truncating counts
>> partway through a dump) then sure, I'll use it. I just don't see that it
>> is even remotely worth the effort given that it is single use only and
>> given how trivial the local version is.
> if collecting GuC log over dmesg is so helpful/important in some
> situations then likely similar solution could be beneficial on i915, no?
It's been in the i915 driver for quite some time already. And the code 
has been modified by precisely zero other developers with precisely zero 
bugs introduced in the formatting/line-counting since that time.

John.

>
>> John.
>>
>>>> A devcoredump implementation could use a drm_printer too?
>>>>
>>>> Is this only about bikeshedding the example? I'm not sure I get the
>>>> point here.
>>>>
>>>>>> + *
>>>>>> + *        drm_printf(&lp, "foo");
>>>>>> + *        drm_printf(&lp, "bar");
>>>>>> + *    }
>>>>>> + *
>>>>>> + * Above code will print into the dmesg something like::
>>>>>> + *
>>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>>>> + *
>>>>>> + * Example 2::
>>>>>> + *
>>>>>> + *    void line_dump(struct device *dev)
>>>>>> + *    {
>>>>>> + *        struct drm_printer p = drm_info_printer(dev);
>>>>>> + *        struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>>>> + *
>>>>>> + *        drm_printf(&lp, "foo");
>>>>>> + *        drm_printf(&lp, "bar");
>>>>>> + *    }
>>>>>> + *
>>>>>> + * Above code will print::
>>>>>> + *
>>>>>> + *    [ ] 0000:00:00.0: [drm] 1: foo
>>>>>> + *    [ ] 0000:00:00.0: [drm] 2: bar
>>>>> Not really seeing the point of having two examples listed. The first
>>>>> includes all the optional extras, the second is just repeating with no
>>>>> new information.
>>>> You see how the "series" param behaves?
>>> exactly
>>>
>>>> BR,
>>>> Jani.
>>>>
>>>>> John.
>>>>>
>>>>>> + *
>>>>>> + * RETURNS:
>>>>>> + * The &drm_printer object
>>>>>> + */
>>>>>> +static inline struct drm_printer drm_line_printer(struct
>>>>>> drm_printer *p,
>>>>>> +                          const char *prefix,
>>>>>> +                          unsigned short series)
>>>>>> +{
>>>>>> +    struct drm_printer lp = {
>>>>>> +        .printfn = __drm_printfn_line,
>>>>>> +        .arg = p,
>>>>>> +        .prefix = prefix,
>>>>>> +        .line = { .series = series, },
>>>>>> +    };
>>>>>> +    return lp;
>>>>>> +}
>>>>>> +
>>>>>>     /*
>>>>>>      * struct device based logging
>>>>>>      *

[v2] drm/print: Introduce drm_line_printer

Commit Message

Comments

Patch