diff mbox series

[1/5] dri: cleanup debugfs error handling

Message ID 20211008091704.27094-1-nirmoy.das@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/5] dri: cleanup debugfs error handling | expand

Commit Message

Das, Nirmoy Oct. 8, 2021, 9:17 a.m. UTC
Debugfs API returns encoded error instead of NULL.
This patch cleanups drm debugfs error handling to
properly set dri and its minor's root dentry to NULL.

Also do not error out if dri/minor debugfs directory
creation fails as a debugfs error is not a fatal error.

CC: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
CC: Maxime Ripard <mripard@kernel.org>
CC: Thomas Zimmermann <tzimmermann@suse.de>
CC: David Airlie <airlied@linux.ie>
CC: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
---
 drivers/gpu/drm/drm_debugfs.c  | 25 +++++++++++++++++++++++--
 drivers/gpu/drm/drm_drv.c      | 16 ++++++++++------
 drivers/gpu/drm/drm_internal.h |  7 +++----
 3 files changed, 36 insertions(+), 12 deletions(-)

--
2.32.0

Comments

Thomas Zimmermann Oct. 8, 2021, 9:35 a.m. UTC | #1
Hi

Am 08.10.21 um 11:17 schrieb Nirmoy Das:
> Debugfs API returns encoded error instead of NULL.
> This patch cleanups drm debugfs error handling to
> properly set dri and its minor's root dentry to NULL.
> 
> Also do not error out if dri/minor debugfs directory
> creation fails as a debugfs error is not a fatal error.
> 
> CC: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> CC: Maxime Ripard <mripard@kernel.org>
> CC: Thomas Zimmermann <tzimmermann@suse.de>
> CC: David Airlie <airlied@linux.ie>
> CC: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>   drivers/gpu/drm/drm_debugfs.c  | 25 +++++++++++++++++++++++--
>   drivers/gpu/drm/drm_drv.c      | 16 ++++++++++------
>   drivers/gpu/drm/drm_internal.h |  7 +++----
>   3 files changed, 36 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c
> index b0a826489488..af275a0c09b4 100644
> --- a/drivers/gpu/drm/drm_debugfs.c
> +++ b/drivers/gpu/drm/drm_debugfs.c
> @@ -180,6 +180,9 @@ void drm_debugfs_create_files(const struct drm_info_list *files, int count,
>   	struct drm_info_node *tmp;
>   	int i;
> 
> +	if (!minor->debugfs_root)
> +		return;
> +
>   	for (i = 0; i < count; i++) {
>   		u32 features = files[i].driver_features;
> 
> @@ -203,7 +206,7 @@ void drm_debugfs_create_files(const struct drm_info_list *files, int count,
>   }
>   EXPORT_SYMBOL(drm_debugfs_create_files);
> 
> -int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +void drm_debugfs_init(struct drm_minor *minor, int minor_id,
>   		     struct dentry *root)
>   {
>   	struct drm_device *dev = minor->dev;
> @@ -212,8 +215,16 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
>   	INIT_LIST_HEAD(&minor->debugfs_list);
>   	mutex_init(&minor->debugfs_lock);
>   	sprintf(name, "%d", minor_id);
> +
> +	if (!root)
> +		goto error;
> +
>   	minor->debugfs_root = debugfs_create_dir(name, root);
> 
> +	if (IS_ERR(minor->debugfs_root))
> +		goto error;
> +
> +
>   	drm_debugfs_create_files(drm_debugfs_list, DRM_DEBUGFS_ENTRIES,
>   				 minor->debugfs_root, minor);
> 
> @@ -230,7 +241,11 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
>   	if (dev->driver->debugfs_init)
>   		dev->driver->debugfs_init(minor);
> 
> -	return 0;
> +	return;
> +
> +error:
> +	minor->debugfs_root = NULL;
> +	return;
>   }
> 
> 
> @@ -241,6 +256,9 @@ int drm_debugfs_remove_files(const struct drm_info_list *files, int count,
>   	struct drm_info_node *tmp;
>   	int i;
> 
> +	if (!minor->debugfs_root)
> +		return 0;
> +
>   	mutex_lock(&minor->debugfs_lock);
>   	for (i = 0; i < count; i++) {
>   		list_for_each_safe(pos, q, &minor->debugfs_list) {
> @@ -261,6 +279,9 @@ static void drm_debugfs_remove_all_files(struct drm_minor *minor)
>   {
>   	struct drm_info_node *node, *tmp;
> 
> +	if (!minor->debugfs_root)
> +		return;
> +
>   	mutex_lock(&minor->debugfs_lock);
>   	list_for_each_entry_safe(node, tmp, &minor->debugfs_list, list) {
>   		debugfs_remove(node->dent);
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 7a5097467ba5..fa57ec2d49bf 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -160,11 +160,7 @@ static int drm_minor_register(struct drm_device *dev, unsigned int type)
>   	if (!minor)
>   		return 0;
> 
> -	ret = drm_debugfs_init(minor, minor->index, drm_debugfs_root);
> -	if (ret) {
> -		DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n");

Rather than deleting the error message, return an error code from 
drm_debugfs_init() and print it here. I'd change DRM_ERROR() to 
drm_dbg_core(NULL, ...).


> -		goto err_debugfs;
> -	}
> +	drm_debugfs_init(minor, minor->index, drm_debugfs_root);
> 
>   	ret = device_add(minor->kdev);
>   	if (ret)
> @@ -1050,7 +1046,15 @@ static int __init drm_core_init(void)
>   		goto error;
>   	}
> 
> -	drm_debugfs_root = debugfs_create_dir("dri", NULL);
> +	if (!debugfs_initialized()) {
> +		drm_debugfs_root = NULL;
> +	} else {
> +		drm_debugfs_root = debugfs_create_dir("dri", NULL);
> +		if (IS_ERR(drm_debugfs_root)) {
> +			DRM_WARN("DRM: Failed to initialize /sys/kernel/debug/dri.\n");

This should also print the error code. I'd also change the call to 
drm_dbg_core(). The message should say 'failed to create', so it's 
differnt from the other one.

Best regards
Thomas

> +			drm_debugfs_root = NULL;
> +		}
> +	}
> 
>   	ret = register_chrdev(DRM_MAJOR, "drm", &drm_stub_fops);
>   	if (ret < 0)
> diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
> index 17f3548c8ed2..e27a40166178 100644
> --- a/drivers/gpu/drm/drm_internal.h
> +++ b/drivers/gpu/drm/drm_internal.h
> @@ -182,8 +182,8 @@ int drm_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev,
> 
>   /* drm_debugfs.c drm_debugfs_crc.c */
>   #if defined(CONFIG_DEBUG_FS)
> -int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> -		     struct dentry *root);
> +void drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +		      struct dentry *root);
>   void drm_debugfs_cleanup(struct drm_minor *minor);
>   void drm_debugfs_connector_add(struct drm_connector *connector);
>   void drm_debugfs_connector_remove(struct drm_connector *connector);
> @@ -191,10 +191,9 @@ void drm_debugfs_crtc_add(struct drm_crtc *crtc);
>   void drm_debugfs_crtc_remove(struct drm_crtc *crtc);
>   void drm_debugfs_crtc_crc_add(struct drm_crtc *crtc);
>   #else
> -static inline int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +static inline void drm_debugfs_init(struct drm_minor *minor, int minor_id,
>   				   struct dentry *root)
>   {
> -	return 0;
>   }
> 
>   static inline void drm_debugfs_cleanup(struct drm_minor *minor)
> --
> 2.32.0
>
Jani Nikula Oct. 8, 2021, 9:40 a.m. UTC | #2
On Fri, 08 Oct 2021, Nirmoy Das <nirmoy.das@amd.com> wrote:
> Debugfs API returns encoded error instead of NULL.
> This patch cleanups drm debugfs error handling to
> properly set dri and its minor's root dentry to NULL.
>
> Also do not error out if dri/minor debugfs directory
> creation fails as a debugfs error is not a fatal error.

Cc: Greg

I thought this is the opposite of what Greg's been telling everyone to
do with debugfs.

BR,
Jani.

>
> CC: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> CC: Maxime Ripard <mripard@kernel.org>
> CC: Thomas Zimmermann <tzimmermann@suse.de>
> CC: David Airlie <airlied@linux.ie>
> CC: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>  drivers/gpu/drm/drm_debugfs.c  | 25 +++++++++++++++++++++++--
>  drivers/gpu/drm/drm_drv.c      | 16 ++++++++++------
>  drivers/gpu/drm/drm_internal.h |  7 +++----
>  3 files changed, 36 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c
> index b0a826489488..af275a0c09b4 100644
> --- a/drivers/gpu/drm/drm_debugfs.c
> +++ b/drivers/gpu/drm/drm_debugfs.c
> @@ -180,6 +180,9 @@ void drm_debugfs_create_files(const struct drm_info_list *files, int count,
>  	struct drm_info_node *tmp;
>  	int i;
>
> +	if (!minor->debugfs_root)
> +		return;
> +
>  	for (i = 0; i < count; i++) {
>  		u32 features = files[i].driver_features;
>
> @@ -203,7 +206,7 @@ void drm_debugfs_create_files(const struct drm_info_list *files, int count,
>  }
>  EXPORT_SYMBOL(drm_debugfs_create_files);
>
> -int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +void drm_debugfs_init(struct drm_minor *minor, int minor_id,
>  		     struct dentry *root)
>  {
>  	struct drm_device *dev = minor->dev;
> @@ -212,8 +215,16 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
>  	INIT_LIST_HEAD(&minor->debugfs_list);
>  	mutex_init(&minor->debugfs_lock);
>  	sprintf(name, "%d", minor_id);
> +
> +	if (!root)
> +		goto error;
> +
>  	minor->debugfs_root = debugfs_create_dir(name, root);
>
> +	if (IS_ERR(minor->debugfs_root))
> +		goto error;
> +
> +
>  	drm_debugfs_create_files(drm_debugfs_list, DRM_DEBUGFS_ENTRIES,
>  				 minor->debugfs_root, minor);
>
> @@ -230,7 +241,11 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
>  	if (dev->driver->debugfs_init)
>  		dev->driver->debugfs_init(minor);
>
> -	return 0;
> +	return;
> +
> +error:
> +	minor->debugfs_root = NULL;
> +	return;
>  }
>
>
> @@ -241,6 +256,9 @@ int drm_debugfs_remove_files(const struct drm_info_list *files, int count,
>  	struct drm_info_node *tmp;
>  	int i;
>
> +	if (!minor->debugfs_root)
> +		return 0;
> +
>  	mutex_lock(&minor->debugfs_lock);
>  	for (i = 0; i < count; i++) {
>  		list_for_each_safe(pos, q, &minor->debugfs_list) {
> @@ -261,6 +279,9 @@ static void drm_debugfs_remove_all_files(struct drm_minor *minor)
>  {
>  	struct drm_info_node *node, *tmp;
>
> +	if (!minor->debugfs_root)
> +		return;
> +
>  	mutex_lock(&minor->debugfs_lock);
>  	list_for_each_entry_safe(node, tmp, &minor->debugfs_list, list) {
>  		debugfs_remove(node->dent);
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 7a5097467ba5..fa57ec2d49bf 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -160,11 +160,7 @@ static int drm_minor_register(struct drm_device *dev, unsigned int type)
>  	if (!minor)
>  		return 0;
>
> -	ret = drm_debugfs_init(minor, minor->index, drm_debugfs_root);
> -	if (ret) {
> -		DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n");
> -		goto err_debugfs;
> -	}
> +	drm_debugfs_init(minor, minor->index, drm_debugfs_root);
>
>  	ret = device_add(minor->kdev);
>  	if (ret)
> @@ -1050,7 +1046,15 @@ static int __init drm_core_init(void)
>  		goto error;
>  	}
>
> -	drm_debugfs_root = debugfs_create_dir("dri", NULL);
> +	if (!debugfs_initialized()) {
> +		drm_debugfs_root = NULL;
> +	} else {
> +		drm_debugfs_root = debugfs_create_dir("dri", NULL);
> +		if (IS_ERR(drm_debugfs_root)) {
> +			DRM_WARN("DRM: Failed to initialize /sys/kernel/debug/dri.\n");
> +			drm_debugfs_root = NULL;
> +		}
> +	}
>
>  	ret = register_chrdev(DRM_MAJOR, "drm", &drm_stub_fops);
>  	if (ret < 0)
> diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
> index 17f3548c8ed2..e27a40166178 100644
> --- a/drivers/gpu/drm/drm_internal.h
> +++ b/drivers/gpu/drm/drm_internal.h
> @@ -182,8 +182,8 @@ int drm_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev,
>
>  /* drm_debugfs.c drm_debugfs_crc.c */
>  #if defined(CONFIG_DEBUG_FS)
> -int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> -		     struct dentry *root);
> +void drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +		      struct dentry *root);
>  void drm_debugfs_cleanup(struct drm_minor *minor);
>  void drm_debugfs_connector_add(struct drm_connector *connector);
>  void drm_debugfs_connector_remove(struct drm_connector *connector);
> @@ -191,10 +191,9 @@ void drm_debugfs_crtc_add(struct drm_crtc *crtc);
>  void drm_debugfs_crtc_remove(struct drm_crtc *crtc);
>  void drm_debugfs_crtc_crc_add(struct drm_crtc *crtc);
>  #else
> -static inline int drm_debugfs_init(struct drm_minor *minor, int minor_id,
> +static inline void drm_debugfs_init(struct drm_minor *minor, int minor_id,
>  				   struct dentry *root)
>  {
> -	return 0;
>  }
>
>  static inline void drm_debugfs_cleanup(struct drm_minor *minor)
> --
> 2.32.0
>
Greg KH Oct. 8, 2021, 11:07 a.m. UTC | #3
On Fri, Oct 08, 2021 at 12:40:47PM +0300, Jani Nikula wrote:
> On Fri, 08 Oct 2021, Nirmoy Das <nirmoy.das@amd.com> wrote:
> > Debugfs API returns encoded error instead of NULL.
> > This patch cleanups drm debugfs error handling to
> > properly set dri and its minor's root dentry to NULL.
> >
> > Also do not error out if dri/minor debugfs directory
> > creation fails as a debugfs error is not a fatal error.
> 
> Cc: Greg
> 
> I thought this is the opposite of what Greg's been telling everyone to
> do with debugfs.

Yes, that is not good.

You should never care about the result of a debugfs_create* call.  Just
take the result, and if it is a directory, save it off to use it for
creating a file, no need to check anything.

And then throw it away, later, when you want to remove the directory,
look it up with a call to debugfs_lookup() and pass that to
debugfs_remove() (which does so recursively).

There should never be a need to save, or check, the result of any
debugfs call.  If so, odds are it is being used incorrectly.

thanks,

greg k-h
Das, Nirmoy Oct. 8, 2021, 12:08 p.m. UTC | #4
[AMD Official Use Only]

Thanks, Greg and Jani. So I have to do the exact opposite.

We do have some NULL dentry check in the drm code. I will remove those instead.

Regards,
Nirmoy
Das, Nirmoy Oct. 8, 2021, 12:56 p.m. UTC | #5
[AMD Official Use Only]

Trying with Christian's Gmail, as he still didn't receive previous emails.
Christian König Oct. 11, 2021, 2:19 p.m. UTC | #6
Am 08.10.21 um 17:11 schrieb Greg KH:
> On Fri, Oct 08, 2021 at 04:22:06PM +0200, Christian König wrote:
>> Hi guys,
>>
>> thanks Nirmoy for forwarding this, there is seriously something wrong with
>> the AMD mail servers.
>>
>>> On 10/8/2021 1:07 PM, Greg KH wrote:
>>>> On Fri, Oct 08, 2021 at 12:40:47PM +0300, Jani Nikula wrote:
>>>>> On Fri, 08 Oct 2021, Nirmoy Das <nirmoy.das@amd.com> wrote:
>>>>>> Debugfs API returns encoded error instead of NULL.
>>>>>> This patch cleanups drm debugfs error handling to
>>>>>> properly set dri and its minor's root dentry to NULL.
>>>>>>
>>>>>> Also do not error out if dri/minor debugfs directory
>>>>>> creation fails as a debugfs error is not a fatal error.
>>>>> Cc: Greg
>>>>>
>>>>> I thought this is the opposite of what Greg's been telling everyone to
>>>>> do with debugfs.
>>>> Yes, that is not good.
>>>>
>>>> You should never care about the result of a debugfs_create* call.  Just
>>>> take the result, and if it is a directory, save it off to use it for
>>>> creating a file, no need to check anything.
>> While I totally agree to the intention behind that I'm pretty sure there are
>> some consequences which are a rather bad idea.
>>
>> First of all the debugfs functions return a normal struct dentry pointer and
>> keeping that around unchecked means that we now have pointers in structure
>> members which can suddenly be an ERR_PTR() without any documentation that
>> they are not real pointers.
>>
>> What we could do instead is something like returning a typedef or a
>> structure with the dentry pointer embedded and then document that those are
>> special, can be ERR_PTR() and should never be touched except for the debugfs
>> functions itself.
> I agree, and am slowly working toward that, but am not there yet.  If
> you look, I have removed the return values for almost all
> debugfs_create_* calls, only a few remain.
>
> For now, just treat it like a "void" pointer that you have no context
> for and all will be fine.

Ok in that case we should just document on the saved dentry that this 
pointer is not necessary valid.

Nirmoy, can you take care of that?

>> The other issue is that adding the same file twice is unfortunately a rather
>> common coding error which we don't catch or complain about any more if we
>> don't look at the return value at all.
> How can you add the same debugfs file twice?  You have different
> directories.

We had multiple occasions triggering that:

1. Code accidentally initializing a component twice.

Except for the debugfs entry and a bit memory leak we had no negative 
effect otherwise and wouldn't had detected that without the error 
message from debugfs.

2. Driver not cleaning up properly on hotplug. E.g. old subdirectory not 
properly removed and re-plug.

3. Driver trying to use the same debugfs file for different devices.

> And really, who cares, it's debugging code!  :)

Well especially cause 3 once took me a day to figure out that I'm 
looking at the wrong hardware state because the driver was handling two 
devices, but only one showed up under debugfs.

>>>> And then throw it away, later, when you want to remove the directory,
>>>> look it up with a call to debugfs_lookup() and pass that to
>>>> debugfs_remove() (which does so recursively).
>>>>
>>>> There should never be a need to save, or check, the result of any
>>>> debugfs call.  If so, odds are it is being used incorrectly.
>> Yeah, exactly that's the problem I see here.
>>
>> We save the return value because the DRM subsystem is creating a debugfs
>> directory for the drivers to use.
> That's fine for now, not a big deal.  And even if there is an error,
> again, you can always feed that error back into the debugfs subsystem on
> another call and it will handle it correctly.

Problem is it isn't, we have a crash because the member isn't a pointer 
but an ERR_PTR instead.

And adding IS_ERR checks all around is even worse than adding NULL checks.

Regards,
Christian.

>
> thanks,
>
> greg k-h
Greg KH Oct. 11, 2021, 2:53 p.m. UTC | #7
On Mon, Oct 11, 2021 at 04:19:58PM +0200, Christian König wrote:
> > > > > And then throw it away, later, when you want to remove the directory,
> > > > > look it up with a call to debugfs_lookup() and pass that to
> > > > > debugfs_remove() (which does so recursively).
> > > > > 
> > > > > There should never be a need to save, or check, the result of any
> > > > > debugfs call.  If so, odds are it is being used incorrectly.
> > > Yeah, exactly that's the problem I see here.
> > > 
> > > We save the return value because the DRM subsystem is creating a debugfs
> > > directory for the drivers to use.
> > That's fine for now, not a big deal.  And even if there is an error,
> > again, you can always feed that error back into the debugfs subsystem on
> > another call and it will handle it correctly.
> 
> Problem is it isn't, we have a crash because the member isn't a pointer but
> an ERR_PTR instead.

Again, that is fine, you can feed that into debugfs and it will "just
work".  Treat it as an opaque pointer, not a *dentry and you will be
fine.

thanks,

greg k-h
Das, Nirmoy Oct. 11, 2021, 3:13 p.m. UTC | #8
On 10/11/2021 4:19 PM, Christian König wrote:
> Am 08.10.21 um 17:11 schrieb Greg KH:
>> On Fri, Oct 08, 2021 at 04:22:06PM +0200, Christian König wrote:
>>> Hi guys,
>>>
>>> thanks Nirmoy for forwarding this, there is seriously something 
>>> wrong with
>>> the AMD mail servers.
>>>
>>>> On 10/8/2021 1:07 PM, Greg KH wrote:
>>>>> On Fri, Oct 08, 2021 at 12:40:47PM +0300, Jani Nikula wrote:
>>>>>> On Fri, 08 Oct 2021, Nirmoy Das <nirmoy.das@amd.com> wrote:
>>>>>>> Debugfs API returns encoded error instead of NULL.
>>>>>>> This patch cleanups drm debugfs error handling to
>>>>>>> properly set dri and its minor's root dentry to NULL.
>>>>>>>
>>>>>>> Also do not error out if dri/minor debugfs directory
>>>>>>> creation fails as a debugfs error is not a fatal error.
>>>>>> Cc: Greg
>>>>>>
>>>>>> I thought this is the opposite of what Greg's been telling 
>>>>>> everyone to
>>>>>> do with debugfs.
>>>>> Yes, that is not good.
>>>>>
>>>>> You should never care about the result of a debugfs_create* call.  
>>>>> Just
>>>>> take the result, and if it is a directory, save it off to use it for
>>>>> creating a file, no need to check anything.
>>> While I totally agree to the intention behind that I'm pretty sure 
>>> there are
>>> some consequences which are a rather bad idea.
>>>
>>> First of all the debugfs functions return a normal struct dentry 
>>> pointer and
>>> keeping that around unchecked means that we now have pointers in 
>>> structure
>>> members which can suddenly be an ERR_PTR() without any documentation 
>>> that
>>> they are not real pointers.
>>>
>>> What we could do instead is something like returning a typedef or a
>>> structure with the dentry pointer embedded and then document that 
>>> those are
>>> special, can be ERR_PTR() and should never be touched except for the 
>>> debugfs
>>> functions itself.
>> I agree, and am slowly working toward that, but am not there yet.  If
>> you look, I have removed the return values for almost all
>> debugfs_create_* calls, only a few remain.
>>
>> For now, just treat it like a "void" pointer that you have no context
>> for and all will be fine.
>
> Ok in that case we should just document on the saved dentry that this 
> pointer is not necessary valid.
>
> Nirmoy, can you take care of that?


Sure, I will send patches to document and clean our drm core debugfs code.


Thanks,

Nirmoy


>
>>> The other issue is that adding the same file twice is unfortunately 
>>> a rather
>>> common coding error which we don't catch or complain about any more 
>>> if we
>>> don't look at the return value at all.
>> How can you add the same debugfs file twice?  You have different
>> directories.
>
> We had multiple occasions triggering that:
>
> 1. Code accidentally initializing a component twice.
>
> Except for the debugfs entry and a bit memory leak we had no negative 
> effect otherwise and wouldn't had detected that without the error 
> message from debugfs.
>
> 2. Driver not cleaning up properly on hotplug. E.g. old subdirectory 
> not properly removed and re-plug.
>
> 3. Driver trying to use the same debugfs file for different devices.
>
>> And really, who cares, it's debugging code!  :)
>
> Well especially cause 3 once took me a day to figure out that I'm 
> looking at the wrong hardware state because the driver was handling 
> two devices, but only one showed up under debugfs.
>
>>>>> And then throw it away, later, when you want to remove the directory,
>>>>> look it up with a call to debugfs_lookup() and pass that to
>>>>> debugfs_remove() (which does so recursively).
>>>>>
>>>>> There should never be a need to save, or check, the result of any
>>>>> debugfs call.  If so, odds are it is being used incorrectly.
>>> Yeah, exactly that's the problem I see here.
>>>
>>> We save the return value because the DRM subsystem is creating a 
>>> debugfs
>>> directory for the drivers to use.
>> That's fine for now, not a big deal.  And even if there is an error,
>> again, you can always feed that error back into the debugfs subsystem on
>> another call and it will handle it correctly.
>
> Problem is it isn't, we have a crash because the member isn't a 
> pointer but an ERR_PTR instead.
>
> And adding IS_ERR checks all around is even worse than adding NULL 
> checks.
>
> Regards,
> Christian.
>
>>
>> thanks,
>>
>> greg k-h
>
Jani Nikula Oct. 11, 2021, 4:38 p.m. UTC | #9
On Mon, 11 Oct 2021, Greg KH <gregkh@linuxfoundation.org> wrote:
> On Mon, Oct 11, 2021 at 04:19:58PM +0200, Christian König wrote:
>> > > > > And then throw it away, later, when you want to remove the directory,
>> > > > > look it up with a call to debugfs_lookup() and pass that to
>> > > > > debugfs_remove() (which does so recursively).
>> > > > > 
>> > > > > There should never be a need to save, or check, the result of any
>> > > > > debugfs call.  If so, odds are it is being used incorrectly.
>> > > Yeah, exactly that's the problem I see here.
>> > > 
>> > > We save the return value because the DRM subsystem is creating a debugfs
>> > > directory for the drivers to use.
>> > That's fine for now, not a big deal.  And even if there is an error,
>> > again, you can always feed that error back into the debugfs subsystem on
>> > another call and it will handle it correctly.
>> 
>> Problem is it isn't, we have a crash because the member isn't a pointer but
>> an ERR_PTR instead.
>
> Again, that is fine, you can feed that into debugfs and it will "just
> work".  Treat it as an opaque pointer, not a *dentry and you will be
> fine.

Hmm, some of the patches add things like:

+
+	if (!root)
+		goto error;
+
	minor->debugfs_root = debugfs_create_dir(name, root);

Superficially this seems okay, as it looks like debugfs_create_dir()
doesn't actually cope with NULL values. However, since ->debugfs_root
comes from debugfs_create_dir() I presume it'll never be NULL on errors
anyway but rather an error pointer!

So I think we probably need to go through the drm subsystem and look for
existing similar patterns in fix them.

BR,
Jani.



>
> thanks,
>
> greg k-h
Greg KH Oct. 11, 2021, 5:07 p.m. UTC | #10
On Mon, Oct 11, 2021 at 07:38:22PM +0300, Jani Nikula wrote:
> On Mon, 11 Oct 2021, Greg KH <gregkh@linuxfoundation.org> wrote:
> > On Mon, Oct 11, 2021 at 04:19:58PM +0200, Christian König wrote:
> >> > > > > And then throw it away, later, when you want to remove the directory,
> >> > > > > look it up with a call to debugfs_lookup() and pass that to
> >> > > > > debugfs_remove() (which does so recursively).
> >> > > > > 
> >> > > > > There should never be a need to save, or check, the result of any
> >> > > > > debugfs call.  If so, odds are it is being used incorrectly.
> >> > > Yeah, exactly that's the problem I see here.
> >> > > 
> >> > > We save the return value because the DRM subsystem is creating a debugfs
> >> > > directory for the drivers to use.
> >> > That's fine for now, not a big deal.  And even if there is an error,
> >> > again, you can always feed that error back into the debugfs subsystem on
> >> > another call and it will handle it correctly.
> >> 
> >> Problem is it isn't, we have a crash because the member isn't a pointer but
> >> an ERR_PTR instead.
> >
> > Again, that is fine, you can feed that into debugfs and it will "just
> > work".  Treat it as an opaque pointer, not a *dentry and you will be
> > fine.
> 
> Hmm, some of the patches add things like:
> 
> +
> +	if (!root)
> +		goto error;
> +
> 	minor->debugfs_root = debugfs_create_dir(name, root);
> 
> Superficially this seems okay, as it looks like debugfs_create_dir()
> doesn't actually cope with NULL values.

Yes it does, it puts things at the root of debugfs.

But why are you checking for NULL here, as the return value of a debugfs
call can never be NULL?

> However, since ->debugfs_root
> comes from debugfs_create_dir() I presume it'll never be NULL on errors
> anyway but rather an error pointer!

That is correct.

> So I think we probably need to go through the drm subsystem and look for
> existing similar patterns in fix them.

Please do.  I know I made one pass at it a while ago but I think someone
else went through and cleaned them up again.

thanks,

greg k-h
Jani Nikula Oct. 11, 2021, 6:43 p.m. UTC | #11
On Mon, 11 Oct 2021, Greg KH <gregkh@linuxfoundation.org> wrote:
> On Mon, Oct 11, 2021 at 07:38:22PM +0300, Jani Nikula wrote:
>> On Mon, 11 Oct 2021, Greg KH <gregkh@linuxfoundation.org> wrote:
>> > On Mon, Oct 11, 2021 at 04:19:58PM +0200, Christian König wrote:
>> >> > > > > And then throw it away, later, when you want to remove the directory,
>> >> > > > > look it up with a call to debugfs_lookup() and pass that to
>> >> > > > > debugfs_remove() (which does so recursively).
>> >> > > > > 
>> >> > > > > There should never be a need to save, or check, the result of any
>> >> > > > > debugfs call.  If so, odds are it is being used incorrectly.
>> >> > > Yeah, exactly that's the problem I see here.
>> >> > > 
>> >> > > We save the return value because the DRM subsystem is creating a debugfs
>> >> > > directory for the drivers to use.
>> >> > That's fine for now, not a big deal.  And even if there is an error,
>> >> > again, you can always feed that error back into the debugfs subsystem on
>> >> > another call and it will handle it correctly.
>> >> 
>> >> Problem is it isn't, we have a crash because the member isn't a pointer but
>> >> an ERR_PTR instead.
>> >
>> > Again, that is fine, you can feed that into debugfs and it will "just
>> > work".  Treat it as an opaque pointer, not a *dentry and you will be
>> > fine.
>> 
>> Hmm, some of the patches add things like:
>> 
>> +
>> +	if (!root)
>> +		goto error;
>> +
>> 	minor->debugfs_root = debugfs_create_dir(name, root);
>> 
>> Superficially this seems okay, as it looks like debugfs_create_dir()
>> doesn't actually cope with NULL values.
>
> Yes it does, it puts things at the root of debugfs.

Oh, thanks for the correction.

> But why are you checking for NULL here, as the return value of a debugfs
> call can never be NULL?

Just musing on what is going on in the patch, and why such changes
initially seem like good ideas and get through review.


Thanks,
Jani.


>
>> However, since ->debugfs_root
>> comes from debugfs_create_dir() I presume it'll never be NULL on errors
>> anyway but rather an error pointer!
>
> That is correct.
>
>> So I think we probably need to go through the drm subsystem and look for
>> existing similar patterns in fix them.
>
> Please do.  I know I made one pass at it a while ago but I think someone
> else went through and cleaned them up again.
>
> thanks,
>
> greg k-h
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c
index b0a826489488..af275a0c09b4 100644
--- a/drivers/gpu/drm/drm_debugfs.c
+++ b/drivers/gpu/drm/drm_debugfs.c
@@ -180,6 +180,9 @@  void drm_debugfs_create_files(const struct drm_info_list *files, int count,
 	struct drm_info_node *tmp;
 	int i;

+	if (!minor->debugfs_root)
+		return;
+
 	for (i = 0; i < count; i++) {
 		u32 features = files[i].driver_features;

@@ -203,7 +206,7 @@  void drm_debugfs_create_files(const struct drm_info_list *files, int count,
 }
 EXPORT_SYMBOL(drm_debugfs_create_files);

-int drm_debugfs_init(struct drm_minor *minor, int minor_id,
+void drm_debugfs_init(struct drm_minor *minor, int minor_id,
 		     struct dentry *root)
 {
 	struct drm_device *dev = minor->dev;
@@ -212,8 +215,16 @@  int drm_debugfs_init(struct drm_minor *minor, int minor_id,
 	INIT_LIST_HEAD(&minor->debugfs_list);
 	mutex_init(&minor->debugfs_lock);
 	sprintf(name, "%d", minor_id);
+
+	if (!root)
+		goto error;
+
 	minor->debugfs_root = debugfs_create_dir(name, root);

+	if (IS_ERR(minor->debugfs_root))
+		goto error;
+
+
 	drm_debugfs_create_files(drm_debugfs_list, DRM_DEBUGFS_ENTRIES,
 				 minor->debugfs_root, minor);

@@ -230,7 +241,11 @@  int drm_debugfs_init(struct drm_minor *minor, int minor_id,
 	if (dev->driver->debugfs_init)
 		dev->driver->debugfs_init(minor);

-	return 0;
+	return;
+
+error:
+	minor->debugfs_root = NULL;
+	return;
 }


@@ -241,6 +256,9 @@  int drm_debugfs_remove_files(const struct drm_info_list *files, int count,
 	struct drm_info_node *tmp;
 	int i;

+	if (!minor->debugfs_root)
+		return 0;
+
 	mutex_lock(&minor->debugfs_lock);
 	for (i = 0; i < count; i++) {
 		list_for_each_safe(pos, q, &minor->debugfs_list) {
@@ -261,6 +279,9 @@  static void drm_debugfs_remove_all_files(struct drm_minor *minor)
 {
 	struct drm_info_node *node, *tmp;

+	if (!minor->debugfs_root)
+		return;
+
 	mutex_lock(&minor->debugfs_lock);
 	list_for_each_entry_safe(node, tmp, &minor->debugfs_list, list) {
 		debugfs_remove(node->dent);
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 7a5097467ba5..fa57ec2d49bf 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -160,11 +160,7 @@  static int drm_minor_register(struct drm_device *dev, unsigned int type)
 	if (!minor)
 		return 0;

-	ret = drm_debugfs_init(minor, minor->index, drm_debugfs_root);
-	if (ret) {
-		DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n");
-		goto err_debugfs;
-	}
+	drm_debugfs_init(minor, minor->index, drm_debugfs_root);

 	ret = device_add(minor->kdev);
 	if (ret)
@@ -1050,7 +1046,15 @@  static int __init drm_core_init(void)
 		goto error;
 	}

-	drm_debugfs_root = debugfs_create_dir("dri", NULL);
+	if (!debugfs_initialized()) {
+		drm_debugfs_root = NULL;
+	} else {
+		drm_debugfs_root = debugfs_create_dir("dri", NULL);
+		if (IS_ERR(drm_debugfs_root)) {
+			DRM_WARN("DRM: Failed to initialize /sys/kernel/debug/dri.\n");
+			drm_debugfs_root = NULL;
+		}
+	}

 	ret = register_chrdev(DRM_MAJOR, "drm", &drm_stub_fops);
 	if (ret < 0)
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index 17f3548c8ed2..e27a40166178 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -182,8 +182,8 @@  int drm_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev,

 /* drm_debugfs.c drm_debugfs_crc.c */
 #if defined(CONFIG_DEBUG_FS)
-int drm_debugfs_init(struct drm_minor *minor, int minor_id,
-		     struct dentry *root);
+void drm_debugfs_init(struct drm_minor *minor, int minor_id,
+		      struct dentry *root);
 void drm_debugfs_cleanup(struct drm_minor *minor);
 void drm_debugfs_connector_add(struct drm_connector *connector);
 void drm_debugfs_connector_remove(struct drm_connector *connector);
@@ -191,10 +191,9 @@  void drm_debugfs_crtc_add(struct drm_crtc *crtc);
 void drm_debugfs_crtc_remove(struct drm_crtc *crtc);
 void drm_debugfs_crtc_crc_add(struct drm_crtc *crtc);
 #else
-static inline int drm_debugfs_init(struct drm_minor *minor, int minor_id,
+static inline void drm_debugfs_init(struct drm_minor *minor, int minor_id,
 				   struct dentry *root)
 {
-	return 0;
 }

 static inline void drm_debugfs_cleanup(struct drm_minor *minor)