diff mbox

drm/drm_bridge: adjust bridge module's refcount

Message ID 1480453976-26303-1-git-send-email-jsarha@ti.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jyri Sarha Nov. 29, 2016, 9:12 p.m. UTC
Store the module that provides the bridge and adjust its refcount
accordingly. The bridge module unload should not be allowed while the
bridge is attached.

Signed-off-by: Jyri Sarha <jsarha@ti.com>
---
 drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
 include/drm/drm_bridge.h     | 11 ++++++++++-
 2 files changed, 22 insertions(+), 4 deletions(-)

Comments

Jyri Sarha Nov. 29, 2016, 9:30 p.m. UTC | #1
On 11/29/16 23:12, Jyri Sarha wrote:
> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge)
>  	if (bridge->dev)
>  		return -EBUSY;
>  
> +	if (!try_module_get(bridge->module))
> +		return -ENOENT;
> +
>  	bridge->dev = dev;
>  
>  	if (bridge->funcs->attach)
This sill needs:

-       if (bridge->funcs->attach)
-               return bridge->funcs->attach(bridge);
+       if (bridge->funcs->attach) {
+               int ret = bridge->funcs->attach(bridge);
+
+               if (ret) {
+                       module_put(bridge->module);
+                       return ret;
+               }
+       }

> @@ -144,6 +151,8 @@ void drm_bridge_detach(struct drm_bridge *bridge)
>  	if (bridge->funcs->detach)
>  		bridge->funcs->detach(bridge);
>  
> +	module_put(bridge->module);
> +
>  	bridge->dev = NULL;
>  }
>  EXPORT_SYMBOL(drm_bridge_detach);
Laurent Pinchart Nov. 29, 2016, 10:06 p.m. UTC | #2
Hi Jyri,

Thank you for the patch.

On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> Store the module that provides the bridge and adjust its refcount
> accordingly. The bridge module unload should not be allowed while the
> bridge is attached.
> 
> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> ---
>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
>  include/drm/drm_bridge.h     | 11 ++++++++++-
>  2 files changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> index 0ee052b..36d427b 100644
> --- a/drivers/gpu/drm/drm_bridge.c
> +++ b/drivers/gpu/drm/drm_bridge.c
> @@ -61,22 +61,26 @@
>  static LIST_HEAD(bridge_list);
> 
>  /**
> - * drm_bridge_add - add the given bridge to the global bridge list
> + * __drm_bridge_add - add the given bridge to the global bridge list
>   *
>   * @bridge: bridge control structure
> + * @module: module that provides this bridge
>   *
>   * RETURNS:
>   * Unconditionally returns Zero.
>   */
> -int drm_bridge_add(struct drm_bridge *bridge)
> +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module)
>  {
> +#ifdef MODULE
> +	bridge->module = module;
> +#endif
>  	mutex_lock(&bridge_lock);
>  	list_add_tail(&bridge->list, &bridge_list);
>  	mutex_unlock(&bridge_lock);
> 
>  	return 0;
>  }
> -EXPORT_SYMBOL(drm_bridge_add);
> +EXPORT_SYMBOL(__drm_bridge_add);
> 
>  /**
>   * drm_bridge_remove - remove the given bridge from the global bridge list
> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
> drm_bridge *bridge) if (bridge->dev)
>  		return -EBUSY;
> 
> +	if (!try_module_get(bridge->module))
> +		return -ENOENT;

Isn't this still racy ? What happens if the module is removed right before 
this call ? Won't the bridge object be freed, and this code then try to call 
try_module_get() on freed memory ?

To fix this properly I think we need to make the bridge object refcounted, 
with a release callback to signal to the bridge driver that memory can be 
freed. The refcount should be increased in of_drm_find_bridge(), and decreased 
in a new drm_bridge_put() function (the "fun" part will be to update drivers 
to call that :-S).

The module refcount still needs to be increased in drm_bridge_attach() like 
you do here, but you'll need to protect it with bridge_lock to avoid a race 
between try_module_get() and drm_bridge_remove().

>  	bridge->dev = dev;
> 
>  	if (bridge->funcs->attach)
> @@ -144,6 +151,8 @@ void drm_bridge_detach(struct drm_bridge *bridge)
>  	if (bridge->funcs->detach)
>  		bridge->funcs->detach(bridge);
> 
> +	module_put(bridge->module);
> +
>  	bridge->dev = NULL;
>  }
>  EXPORT_SYMBOL(drm_bridge_detach);
> diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
> index 530a1d6..d60d5d2 100644
> --- a/include/drm/drm_bridge.h
> +++ b/include/drm/drm_bridge.h
> @@ -25,6 +25,7 @@
> 
>  #include <linux/list.h>
>  #include <linux/ctype.h>
> +#include <linux/module.h>
>  #include <drm/drm_mode_object.h>
>  #include <drm/drm_modes.h>
> 
> @@ -192,13 +193,21 @@ struct drm_bridge {
>  #ifdef CONFIG_OF
>  	struct device_node *of_node;
>  #endif
> +#ifdef MODULE
> +	struct module *module;
> +#endif
>  	struct list_head list;
> 
>  	const struct drm_bridge_funcs *funcs;
>  	void *driver_private;
>  };
> 
> -int drm_bridge_add(struct drm_bridge *bridge);
> +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module);
> +#ifdef MODULE
> +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, THIS_MODULE)
> +#else
> +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, NULL)
> +#endif
>  void drm_bridge_remove(struct drm_bridge *bridge);
>  struct drm_bridge *of_drm_find_bridge(struct device_node *np);
>  int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge);
Andrzej Hajda Nov. 30, 2016, 8:11 a.m. UTC | #3
On 29.11.2016 22:12, Jyri Sarha wrote:
> Store the module that provides the bridge and adjust its refcount
> accordingly. The bridge module unload should not be allowed while the
> bridge is attached.

Module refcounting as a way of preventing driver from unbinding
is quite popular, but as other developers said already is incorrect
solution - it does not really prevent from unbinding and is a hack.

>
> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> ---
>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
>  include/drm/drm_bridge.h     | 11 ++++++++++-
>  2 files changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> index 0ee052b..36d427b 100644
> --- a/drivers/gpu/drm/drm_bridge.c
> +++ b/drivers/gpu/drm/drm_bridge.c
> @@ -61,22 +61,26 @@
>  static LIST_HEAD(bridge_list);
>  
>  /**
> - * drm_bridge_add - add the given bridge to the global bridge list
> + * __drm_bridge_add - add the given bridge to the global bridge list
>   *
>   * @bridge: bridge control structure
> + * @module: module that provides this bridge
>   *
>   * RETURNS:
>   * Unconditionally returns Zero.
>   */
> -int drm_bridge_add(struct drm_bridge *bridge)
> +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module)
>  {
> +#ifdef MODULE
> +	bridge->module = module;
> +#endif
>  	mutex_lock(&bridge_lock);
>  	list_add_tail(&bridge->list, &bridge_list);
>  	mutex_unlock(&bridge_lock);
>  
>  	return 0;
>  }
> -EXPORT_SYMBOL(drm_bridge_add);
> +EXPORT_SYMBOL(__drm_bridge_add);
>  
>  /**
>   * drm_bridge_remove - remove the given bridge from the global bridge list
> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge)
>  	if (bridge->dev)
>  		return -EBUSY;
>  
> +	if (!try_module_get(bridge->module))
> +		return -ENOENT;
> +

module field can be missing, will it compile in such case?

>  	bridge->dev = dev;
>  
>  	if (bridge->funcs->attach)
> @@ -144,6 +151,8 @@ void drm_bridge_detach(struct drm_bridge *bridge)
>  	if (bridge->funcs->detach)
>  		bridge->funcs->detach(bridge);
>  
> +	module_put(bridge->module);
> +
>  	bridge->dev = NULL;
>  }
>  EXPORT_SYMBOL(drm_bridge_detach);
> diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
> index 530a1d6..d60d5d2 100644
> --- a/include/drm/drm_bridge.h
> +++ b/include/drm/drm_bridge.h
> @@ -25,6 +25,7 @@
>  
>  #include <linux/list.h>
>  #include <linux/ctype.h>
> +#include <linux/module.h>
>  #include <drm/drm_mode_object.h>
>  #include <drm/drm_modes.h>
>  
> @@ -192,13 +193,21 @@ struct drm_bridge {
>  #ifdef CONFIG_OF
>  	struct device_node *of_node;
>  #endif
> +#ifdef MODULE
> +	struct module *module;
> +#endif
>  	struct list_head list;
>  
>  	const struct drm_bridge_funcs *funcs;
>  	void *driver_private;
>  };
>  
> -int drm_bridge_add(struct drm_bridge *bridge);
> +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module);
> +#ifdef MODULE
> +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, THIS_MODULE)
> +#else
> +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, NULL)
> +#endif

If I remember correctly THIS_MODULE is NULL if MODULE is undefined.
So the whole #ifdef here is unnecessary.

Regards
Andrzej

>  void drm_bridge_remove(struct drm_bridge *bridge);
>  struct drm_bridge *of_drm_find_bridge(struct device_node *np);
>  int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge);
Daniel Vetter Nov. 30, 2016, 8:13 a.m. UTC | #4
On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> Hi Jyri,
> 
> Thank you for the patch.
> 
> On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> > Store the module that provides the bridge and adjust its refcount
> > accordingly. The bridge module unload should not be allowed while the
> > bridge is attached.
> > 
> > Signed-off-by: Jyri Sarha <jsarha@ti.com>
> > ---
> >  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> >  include/drm/drm_bridge.h     | 11 ++++++++++-
> >  2 files changed, 22 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> > index 0ee052b..36d427b 100644
> > --- a/drivers/gpu/drm/drm_bridge.c
> > +++ b/drivers/gpu/drm/drm_bridge.c
> > @@ -61,22 +61,26 @@
> >  static LIST_HEAD(bridge_list);
> > 
> >  /**
> > - * drm_bridge_add - add the given bridge to the global bridge list
> > + * __drm_bridge_add - add the given bridge to the global bridge list
> >   *
> >   * @bridge: bridge control structure
> > + * @module: module that provides this bridge
> >   *
> >   * RETURNS:
> >   * Unconditionally returns Zero.
> >   */
> > -int drm_bridge_add(struct drm_bridge *bridge)
> > +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module)
> >  {
> > +#ifdef MODULE
> > +	bridge->module = module;
> > +#endif
> >  	mutex_lock(&bridge_lock);
> >  	list_add_tail(&bridge->list, &bridge_list);
> >  	mutex_unlock(&bridge_lock);
> > 
> >  	return 0;
> >  }
> > -EXPORT_SYMBOL(drm_bridge_add);
> > +EXPORT_SYMBOL(__drm_bridge_add);
> > 
> >  /**
> >   * drm_bridge_remove - remove the given bridge from the global bridge list
> > @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
> > drm_bridge *bridge) if (bridge->dev)
> >  		return -EBUSY;
> > 
> > +	if (!try_module_get(bridge->module))
> > +		return -ENOENT;
> 
> Isn't this still racy ? What happens if the module is removed right before 
> this call ? Won't the bridge object be freed, and this code then try to call 
> try_module_get() on freed memory ?
> 
> To fix this properly I think we need to make the bridge object refcounted, 
> with a release callback to signal to the bridge driver that memory can be 
> freed. The refcount should be increased in of_drm_find_bridge(), and decreased 
> in a new drm_bridge_put() function (the "fun" part will be to update drivers 
> to call that :-S).
> 
> The module refcount still needs to be increased in drm_bridge_attach() like 
> you do here, but you'll need to protect it with bridge_lock to avoid a race 
> between try_module_get() and drm_bridge_remove().

Hm right, I thought _attach is the function called directly, but we do
lookup+attach. Might be good to have an of helper which does the
lookup+attach and then drops the reference. We can do that as long as
attach/detach holds onto a reference like the one below.

And I don't think we need a new bridge refcount, we can just us
try_module_get/put for that (but wrapped into helpers ofc).
-Daniel

> 
> >  	bridge->dev = dev;
> > 
> >  	if (bridge->funcs->attach)
> > @@ -144,6 +151,8 @@ void drm_bridge_detach(struct drm_bridge *bridge)
> >  	if (bridge->funcs->detach)
> >  		bridge->funcs->detach(bridge);
> > 
> > +	module_put(bridge->module);
> > +
> >  	bridge->dev = NULL;
> >  }
> >  EXPORT_SYMBOL(drm_bridge_detach);
> > diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
> > index 530a1d6..d60d5d2 100644
> > --- a/include/drm/drm_bridge.h
> > +++ b/include/drm/drm_bridge.h
> > @@ -25,6 +25,7 @@
> > 
> >  #include <linux/list.h>
> >  #include <linux/ctype.h>
> > +#include <linux/module.h>
> >  #include <drm/drm_mode_object.h>
> >  #include <drm/drm_modes.h>
> > 
> > @@ -192,13 +193,21 @@ struct drm_bridge {
> >  #ifdef CONFIG_OF
> >  	struct device_node *of_node;
> >  #endif
> > +#ifdef MODULE
> > +	struct module *module;
> > +#endif
> >  	struct list_head list;
> > 
> >  	const struct drm_bridge_funcs *funcs;
> >  	void *driver_private;
> >  };
> > 
> > -int drm_bridge_add(struct drm_bridge *bridge);
> > +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module);
> > +#ifdef MODULE
> > +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, THIS_MODULE)
> > +#else
> > +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, NULL)
> > +#endif
> >  void drm_bridge_remove(struct drm_bridge *bridge);
> >  struct drm_bridge *of_drm_find_bridge(struct device_node *np);
> >  int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge);
> 
> -- 
> Regards,
> 
> Laurent Pinchart
>
Laurent Pinchart Nov. 30, 2016, 8:16 a.m. UTC | #5
Hi Andrzej,

On Wednesday 30 Nov 2016 09:11:56 Andrzej Hajda wrote:
> On 29.11.2016 22:12, Jyri Sarha wrote:
> > Store the module that provides the bridge and adjust its refcount
> > accordingly. The bridge module unload should not be allowed while the
> > bridge is attached.
> 
> Module refcounting as a way of preventing driver from unbinding
> is quite popular, but as other developers said already is incorrect
> solution - it does not really prevent from unbinding and is a hack.

Absolutely, module refcounting must not be used as a way to prevent unbinding, 
but it's still necessary to prevent functions from disappearing. The unbinding 
race has to be fixed through reference counting to prevent objects from being 
freed, but if an object contains function pointers that refer to code part of 
a module, object refcounting won't prevent the code from being removed. Only 
module refcounting helps there. The two techniques are thus complementary.

> > Signed-off-by: Jyri Sarha <jsarha@ti.com>
> > ---
> > 
> >  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> >  include/drm/drm_bridge.h     | 11 ++++++++++-
> >  2 files changed, 22 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> > index 0ee052b..36d427b 100644
> > --- a/drivers/gpu/drm/drm_bridge.c
> > +++ b/drivers/gpu/drm/drm_bridge.c
> > @@ -61,22 +61,26 @@
> >  static LIST_HEAD(bridge_list);
> >  
> >  /**
> > - * drm_bridge_add - add the given bridge to the global bridge list
> > + * __drm_bridge_add - add the given bridge to the global bridge list
> >   *
> >   * @bridge: bridge control structure
> > + * @module: module that provides this bridge
> >   *
> >   * RETURNS:
> >   * Unconditionally returns Zero.
> >   */
> > -int drm_bridge_add(struct drm_bridge *bridge)
> > +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module)
> >  {
> > +#ifdef MODULE
> > +	bridge->module = module;
> > +#endif
> >  	mutex_lock(&bridge_lock);
> >  	list_add_tail(&bridge->list, &bridge_list);
> >  	mutex_unlock(&bridge_lock);
> >  	
> >  	return 0;
> >  }
> > -EXPORT_SYMBOL(drm_bridge_add);
> > +EXPORT_SYMBOL(__drm_bridge_add);
> > 
> >  /**
> >   * drm_bridge_remove - remove the given bridge from the global bridge
> >   list
> > @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
> > drm_bridge *bridge)
> >  	if (bridge->dev)
> >  		return -EBUSY;
> > 
> > +	if (!try_module_get(bridge->module))
> > +		return -ENOENT;
> > +
> 
> module field can be missing, will it compile in such case?
> 
> >  	bridge->dev = dev;
> >  	
> >  	if (bridge->funcs->attach)
> > @@ -144,6 +151,8 @@ void drm_bridge_detach(struct drm_bridge *bridge)
> >  	if (bridge->funcs->detach)
> >  		bridge->funcs->detach(bridge);
> > 
> > +	module_put(bridge->module);
> > +
> >  	bridge->dev = NULL;
> >  }
> >  EXPORT_SYMBOL(drm_bridge_detach);
> > diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
> > index 530a1d6..d60d5d2 100644
> > --- a/include/drm/drm_bridge.h
> > +++ b/include/drm/drm_bridge.h
> > @@ -25,6 +25,7 @@
> > 
> >  #include <linux/list.h>
> >  #include <linux/ctype.h>
> > +#include <linux/module.h>
> >  #include <drm/drm_mode_object.h>
> >  #include <drm/drm_modes.h>
> > 
> > @@ -192,13 +193,21 @@ struct drm_bridge {
> >  #ifdef CONFIG_OF
> >  	struct device_node *of_node;
> >  #endif
> > +#ifdef MODULE
> > +	struct module *module;
> > +#endif
> >  	struct list_head list;
> >  	
> >  	const struct drm_bridge_funcs *funcs;
> >  	void *driver_private;
> >  };
> > 
> > -int drm_bridge_add(struct drm_bridge *bridge);
> > +int __drm_bridge_add(struct drm_bridge *bridge, struct module *module);
> > +#ifdef MODULE
> > +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, THIS_MODULE)
> > +#else
> > +#define drm_bridge_add(bridge) __drm_bridge_add(bridge, NULL)
> > +#endif
> 
> If I remember correctly THIS_MODULE is NULL if MODULE is undefined.
> So the whole #ifdef here is unnecessary.
Laurent Pinchart Nov. 30, 2016, 8:25 a.m. UTC | #6
Hi Daniel,

On Wednesday 30 Nov 2016 09:13:00 Daniel Vetter wrote:
> On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> > On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> >> Store the module that provides the bridge and adjust its refcount
> >> accordingly. The bridge module unload should not be allowed while the
> >> bridge is attached.
> >> 
> >> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> >> ---
> >> 
> >>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> >>  include/drm/drm_bridge.h     | 11 ++++++++++-
> >>  2 files changed, 22 insertions(+), 4 deletions(-)
> >> 
> >> diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> >> index 0ee052b..36d427b 100644
> >> --- a/drivers/gpu/drm/drm_bridge.c
> >> +++ b/drivers/gpu/drm/drm_bridge.c

[snip]

> >> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
> >> drm_bridge *bridge)
> >>  	if (bridge->dev)
> >>  		return -EBUSY;
> >> 
> >> +	if (!try_module_get(bridge->module))
> >> +		return -ENOENT;
> > 
> > Isn't this still racy ? What happens if the module is removed right before
> > this call ? Won't the bridge object be freed, and this code then try to
> > call try_module_get() on freed memory ?
> > 
> > To fix this properly I think we need to make the bridge object refcounted,
> > with a release callback to signal to the bridge driver that memory can be
> > freed. The refcount should be increased in of_drm_find_bridge(), and
> > decreased in a new drm_bridge_put() function (the "fun" part will be to
> > update drivers to call that :-S).
> > 
> > The module refcount still needs to be increased in drm_bridge_attach()
> > like you do here, but you'll need to protect it with bridge_lock to avoid
> > a race between try_module_get() and drm_bridge_remove().
> 
> Hm right, I thought _attach is the function called directly, but we do
> lookup+attach. Might be good to have an of helper which does the
> lookup+attach and then drops the reference. We can do that as long as
> attach/detach holds onto a reference like the one below.
> 
> And I don't think we need a new bridge refcount, we can just us
> try_module_get/put for that (but wrapped into helpers ofc).

The bridge refcount and the module refcount serve two different purposes. The 
first one addresses the unbind race by preventing the object from being freed 
while still referenced, to avoid faulty data memory accesses (note that 
increasing a module refcount doesn't prevent unbinding the module from the 
device). The second one addresses the module unload race by preventing code 
from being unloaded while still being reachable through function pointers, 
avoiding faulty text memory accesses (as well as data memory accesses to any 
.data or .rodata section in the module, if those are referenced through 
pointers in the bridge object).

We thus need both types of refcounting, with the former tied to the lookup 
operation and the latter to the attach operation (even though we could handle 
module refcounting at lookup time as well if preferred, but in a well-behaved 
system the bridge callbacks should not be invoked before attach time).

> >>  	bridge->dev = dev;
> >>  	
> >>  	if (bridge->funcs->attach)
Jyri Sarha Nov. 30, 2016, 8:32 a.m. UTC | #7
On 11/30/16 00:06, Laurent Pinchart wrote:
>> >  /**
>> >   * drm_bridge_remove - remove the given bridge from the global bridge list
>> > @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
>> > drm_bridge *bridge) if (bridge->dev)
>> >  		return -EBUSY;
>> > 
>> > +	if (!try_module_get(bridge->module))
>> > +		return -ENOENT;
> Isn't this still racy ? What happens if the module is removed right before 
> this call ? Won't the bridge object be freed, and this code then try to call 
> try_module_get() on freed memory ?
> 
> To fix this properly I think we need to make the bridge object refcounted, 
> with a release callback to signal to the bridge driver that memory can be 
> freed. The refcount should be increased in of_drm_find_bridge(), and decreased 
> in a new drm_bridge_put() function (the "fun" part will be to update drivers 
> to call that :-S).
> 

I think another option would be to find and attach the bridge object
atomically by holding the bridge_lock until the try_module_get() has
succeeded.

AFAIU, if the module unload is triggered while holding the bridge_lock
but before the try_module_get() call, then try_module_get() would return
false and the attach call will return failure.

> The module refcount still needs to be increased in drm_bridge_attach() like 
> you do here, but you'll need to protect it with bridge_lock to avoid a race 
> between try_module_get() and drm_bridge_remove().
>
Laurent Pinchart Nov. 30, 2016, 8:37 a.m. UTC | #8
Hi Jyri,

On Wednesday 30 Nov 2016 10:32:40 Jyri Sarha wrote:
> On 11/30/16 00:06, Laurent Pinchart wrote:
> >> >  /**
> >> >   * drm_bridge_remove - remove the given bridge from the global bridge
> >> >   list
> >> > @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev,
> >> > struct drm_bridge *bridge)
> >> >  	if (bridge->dev)
> >> >  		return -EBUSY;
> >> > 
> >> > +	if (!try_module_get(bridge->module))
> >> > +		return -ENOENT;
> > 
> > Isn't this still racy ? What happens if the module is removed right before
> > this call ? Won't the bridge object be freed, and this code then try to
> > call try_module_get() on freed memory ?
> > 
> > To fix this properly I think we need to make the bridge object refcounted,
> > with a release callback to signal to the bridge driver that memory can be
> > freed. The refcount should be increased in of_drm_find_bridge(), and
> > decreased in a new drm_bridge_put() function (the "fun" part will be to
> > update drivers to call that :-S).
> 
> I think another option would be to find and attach the bridge object
> atomically by holding the bridge_lock until the try_module_get() has
> succeeded.
> 
> AFAIU, if the module unload is triggered while holding the bridge_lock
> but before the try_module_get() call, then try_module_get() would return
> false and the attach call will return failure.

This would require combining lookup and attach in all cases. I'm not sure that 
would be very convenient for drivers. Keeping the two operations separate 
would be more flexible.

> > The module refcount still needs to be increased in drm_bridge_attach()
> > like you do here, but you'll need to protect it with bridge_lock to avoid
> > a race between try_module_get() and drm_bridge_remove().
Jyri Sarha Nov. 30, 2016, 8:55 a.m. UTC | #9
On 11/30/16 10:37, Laurent Pinchart wrote:
>>> To fix this properly I think we need to make the bridge object refcounted,
>>> > > with a release callback to signal to the bridge driver that memory can be
>>> > > freed. The refcount should be increased in of_drm_find_bridge(), and
>>> > > decreased in a new drm_bridge_put() function (the "fun" part will be to
>>> > > update drivers to call that :-S).
>> > 
>> > I think another option would be to find and attach the bridge object
>> > atomically by holding the bridge_lock until the try_module_get() has
>> > succeeded.
>> > 
>> > AFAIU, if the module unload is triggered while holding the bridge_lock
>> > but before the try_module_get() call, then try_module_get() would return
>> > false and the attach call will return failure.
> This would require combining lookup and attach in all cases. I'm not sure that 
> would be very convenient for drivers. Keeping the two operations separate 
> would be more flexible.
> 

That could be avoided if of_drm_find_bridge() would copy the content of
bridge object, in stead of providing a pointer. The attach call could
then search for the bridge object again before continuing. But still in
the end the impact to individual drivers would be equal to adding a
simple drm_bridge_put() call. Probably better to go forward with your
suggestion.

Even adding the drm_bridge_put() would not be necessary in most cases if
we would add a devm version of getting the bridge object. In 99% of
cases the device probe will fail if the bridge attach fails, and bridge
object refcount would return to zero automatically.
Laurent Pinchart Nov. 30, 2016, 9:03 a.m. UTC | #10
Hi Jyri,

On Wednesday 30 Nov 2016 10:55:10 Jyri Sarha wrote:
> On 11/30/16 10:37, Laurent Pinchart wrote:
> >>> To fix this properly I think we need to make the bridge object
> >>> refcounted,
> >>> 
> >>>>> with a release callback to signal to the bridge driver that memory
> >>>>> can be freed. The refcount should be increased in
> >>>>> of_drm_find_bridge(), and decreased in a new drm_bridge_put() function
> >>>>> (the "fun" part will be to update drivers to call that :-S).
> >>> 
> >>> I think another option would be to find and attach the bridge object
> >>> atomically by holding the bridge_lock until the try_module_get() has
> >>> succeeded.
> >>> 
> >>> AFAIU, if the module unload is triggered while holding the bridge_lock
> >>> but before the try_module_get() call, then try_module_get() would
> >>> return false and the attach call will return failure.
> > 
> > This would require combining lookup and attach in all cases. I'm not sure
> > that would be very convenient for drivers. Keeping the two operations
> > separate would be more flexible.
> 
> That could be avoided if of_drm_find_bridge() would copy the content of
> bridge object, in stead of providing a pointer.

/me shivers

> The attach call could then search for the bridge object again before
> continuing. But still in the end the impact to individual drivers would be
> equal to adding a simple drm_bridge_put() call. Probably better to go
> forward with your suggestion.

Please :-)

> Even adding the drm_bridge_put() would not be necessary in most cases if
> we would add a devm version of getting the bridge object. In 99% of
> cases the device probe will fail if the bridge attach fails, and bridge
> object refcount would return to zero automatically.

devm_* is evil in most cases, especially the devm_kzalloc() function as it 
ties object lifetime to devices, releasing memory at unbind time when the 
object can still be referenced elsewhere. Regarding bridges, a 
devm_of_drm_find_bridge() might make sense as the bridge seems at first glance 
to (nearly) always be a resource that can be released at DRM unbind time.
Daniel Vetter Nov. 30, 2016, 9:12 a.m. UTC | #11
On Wed, Nov 30, 2016 at 10:25:57AM +0200, Laurent Pinchart wrote:
> Hi Daniel,
> 
> On Wednesday 30 Nov 2016 09:13:00 Daniel Vetter wrote:
> > On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> > > On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> > >> Store the module that provides the bridge and adjust its refcount
> > >> accordingly. The bridge module unload should not be allowed while the
> > >> bridge is attached.
> > >> 
> > >> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> > >> ---
> > >> 
> > >>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> > >>  include/drm/drm_bridge.h     | 11 ++++++++++-
> > >>  2 files changed, 22 insertions(+), 4 deletions(-)
> > >> 
> > >> diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
> > >> index 0ee052b..36d427b 100644
> > >> --- a/drivers/gpu/drm/drm_bridge.c
> > >> +++ b/drivers/gpu/drm/drm_bridge.c
> 
> [snip]
> 
> > >> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev, struct
> > >> drm_bridge *bridge)
> > >>  	if (bridge->dev)
> > >>  		return -EBUSY;
> > >> 
> > >> +	if (!try_module_get(bridge->module))
> > >> +		return -ENOENT;
> > > 
> > > Isn't this still racy ? What happens if the module is removed right before
> > > this call ? Won't the bridge object be freed, and this code then try to
> > > call try_module_get() on freed memory ?
> > > 
> > > To fix this properly I think we need to make the bridge object refcounted,
> > > with a release callback to signal to the bridge driver that memory can be
> > > freed. The refcount should be increased in of_drm_find_bridge(), and
> > > decreased in a new drm_bridge_put() function (the "fun" part will be to
> > > update drivers to call that :-S).
> > > 
> > > The module refcount still needs to be increased in drm_bridge_attach()
> > > like you do here, but you'll need to protect it with bridge_lock to avoid
> > > a race between try_module_get() and drm_bridge_remove().
> > 
> > Hm right, I thought _attach is the function called directly, but we do
> > lookup+attach. Might be good to have an of helper which does the
> > lookup+attach and then drops the reference. We can do that as long as
> > attach/detach holds onto a reference like the one below.
> > 
> > And I don't think we need a new bridge refcount, we can just us
> > try_module_get/put for that (but wrapped into helpers ofc).
> 
> The bridge refcount and the module refcount serve two different purposes. The 
> first one addresses the unbind race by preventing the object from being freed 
> while still referenced, to avoid faulty data memory accesses (note that 
> increasing a module refcount doesn't prevent unbinding the module from the 
> device). The second one addresses the module unload race by preventing code 
> from being unloaded while still being reachable through function pointers, 
> avoiding faulty text memory accesses (as well as data memory accesses to any 
> .data or .rodata section in the module, if those are referenced through 
> pointers in the bridge object).
> 
> We thus need both types of refcounting, with the former tied to the lookup 
> operation and the latter to the attach operation (even though we could handle 
> module refcounting at lookup time as well if preferred, but in a well-behaved 
> system the bridge callbacks should not be invoked before attach time).

So you're saying struct drm_bridge should embed a struct device? Or at
least kobj? Handling that race is one of the reasons we have them ...
-Daniel
Jyri Sarha Nov. 30, 2016, 9:13 a.m. UTC | #12
On 11/30/16 11:03, Laurent Pinchart wrote:
>>> This would require combining lookup and attach in all cases. I'm not sure
>>> > > that would be very convenient for drivers. Keeping the two operations
>>> > > separate would be more flexible.
>> > 
>> > That could be avoided if of_drm_find_bridge() would copy the content of
>> > bridge object, in stead of providing a pointer.
> /me shivers
> 
>> > The attach call could then search for the bridge object again before
>> > continuing. But still in the end the impact to individual drivers would be
>> > equal to adding a simple drm_bridge_put() call. Probably better to go
>> > forward with your suggestion.
> Please :-)
> 

Ok :). But first I'll make a pull request of all accumulated tilcdc
patches, as all problems I have found in them are to be fixed outside
tilcdc.

>> > Even adding the drm_bridge_put() would not be necessary in most cases if
>> > we would add a devm version of getting the bridge object. In 99% of
>> > cases the device probe will fail if the bridge attach fails, and bridge
>> > object refcount would return to zero automatically.
> devm_* is evil in most cases, especially the devm_kzalloc() function as it 
> ties object lifetime to devices, releasing memory at unbind time when the 
> object can still be referenced elsewhere. Regarding bridges, a 
> devm_of_drm_find_bridge() might make sense as the bridge seems at first glance 
> to (nearly) always be a resource that can be released at DRM unbind time.

But the devm is not at all as evil when the object is refcounted,
because every piece of code that wants to keep a reference to the object
can keep it.

Cheers,
Jyri
Andrzej Hajda Nov. 30, 2016, 9:26 a.m. UTC | #13
On 30.11.2016 09:16, Laurent Pinchart wrote:
> Hi Andrzej,
>
> On Wednesday 30 Nov 2016 09:11:56 Andrzej Hajda wrote:
>> On 29.11.2016 22:12, Jyri Sarha wrote:
>>> Store the module that provides the bridge and adjust its refcount
>>> accordingly. The bridge module unload should not be allowed while the
>>> bridge is attached.
>> Module refcounting as a way of preventing driver from unbinding
>> is quite popular, but as other developers said already is incorrect
>> solution - it does not really prevent from unbinding and is a hack.
> Absolutely, module refcounting must not be used as a way to prevent unbinding, 
> but it's still necessary to prevent functions from disappearing. The unbinding 
> race has to be fixed through reference counting to prevent objects from being 
> freed, but if an object contains function pointers that refer to code part of 
> a module, object refcounting won't prevent the code from being removed. Only 
> module refcounting helps there. The two techniques are thus complementary.

It is not true. There at least two existing and proper solutions, which
do not use refcounting:
1. components - if you put the bridge into component framework, it will
bring down drm device before detaching the bridge.
2. proper callbacks from the provider (bridge in this case) to the
consumer (encoder or drm device). Such callbacks exists for example in
case of MIPI-DSI panels attached to encoders. On removal
panel calls mipi_dsi_detach, which calls .detach ops in associated
encoder, encoder safely disables the video pipeline and the panel
continue its removal.

Of course both solutions have some pitfalls, the first one removes whole
drmdev instead of disabling one pipeline, the second one is specific for
DSI bus, but they clearly shows refcounting is not the only option.

Regards
Andrzej
Laurent Pinchart Nov. 30, 2016, 9:30 a.m. UTC | #14
Hi Jyri,

On Wednesday 30 Nov 2016 11:13:41 Jyri Sarha wrote:
> On 11/30/16 11:03, Laurent Pinchart wrote:
> >>> This would require combining lookup and attach in all cases. I'm not
> >>> sure
> >>> 
> >>>>> that would be very convenient for drivers. Keeping the two
> >>>>> operations separate would be more flexible.
> >>> 
> >>> That could be avoided if of_drm_find_bridge() would copy the content of
> >>> bridge object, in stead of providing a pointer.
> > 
> > /me shivers
> > 
> >>> The attach call could then search for the bridge object again before
> >>> continuing. But still in the end the impact to individual drivers would
> >>> be equal to adding a simple drm_bridge_put() call. Probably better to go
> >>> forward with your suggestion.
> > 
> > Please :-)
> 
> Ok :). But first I'll make a pull request of all accumulated tilcdc
> patches, as all problems I have found in them are to be fixed outside
> tilcdc.

Sure.

> >>> Even adding the drm_bridge_put() would not be necessary in most cases
> >>> if we would add a devm version of getting the bridge object. In 99% of
> >>> cases the device probe will fail if the bridge attach fails, and bridge
> >>> object refcount would return to zero automatically.
> > 
> > devm_* is evil in most cases, especially the devm_kzalloc() function as it
> > ties object lifetime to devices, releasing memory at unbind time when the
> > object can still be referenced elsewhere. Regarding bridges, a
> > devm_of_drm_find_bridge() might make sense as the bridge seems at first
> > glance to (nearly) always be a resource that can be released at DRM
> > unbind time.
>
> But the devm is not at all as evil when the object is refcounted,
> because every piece of code that wants to keep a reference to the object
> can keep it.

devm_* is fine when allocating a resources that is release at device unbind 
time. I/O regions, IRQ, and most probably bridges (I'd have to sleep over that 
last one) should fall in that category, at least most of the time. 
devm_kzalloc() is evil in the sense that many driver authors use it as a 
solution to world hunger without thinking twice, while memory allocated by 
drivers for objects that are registered and must outlive the lifetime of the 
device binding is not a resource consumed by the driver that can be released 
at unbind time.
Laurent Pinchart Nov. 30, 2016, 9:39 a.m. UTC | #15
Hi Andrzej,

On Wednesday 30 Nov 2016 10:26:24 Andrzej Hajda wrote:
> On 30.11.2016 09:16, Laurent Pinchart wrote:
> > On Wednesday 30 Nov 2016 09:11:56 Andrzej Hajda wrote:
> >> On 29.11.2016 22:12, Jyri Sarha wrote:
> >>> Store the module that provides the bridge and adjust its refcount
> >>> accordingly. The bridge module unload should not be allowed while the
> >>> bridge is attached.
> >> 
> >> Module refcounting as a way of preventing driver from unbinding
> >> is quite popular, but as other developers said already is incorrect
> >> solution - it does not really prevent from unbinding and is a hack.
> > 
> > Absolutely, module refcounting must not be used as a way to prevent
> > unbinding, but it's still necessary to prevent functions from
> > disappearing. The unbinding race has to be fixed through reference
> > counting to prevent objects from being freed, but if an object contains
> > function pointers that refer to code part of a module, object refcounting
> > won't prevent the code from being removed. Only module refcounting helps
> > there. The two techniques are thus complementary.
>
> It is not true. There at least two existing and proper solutions, which
> do not use refcounting:
>
> 1. components - if you put the bridge into component framework, it will
> bring down drm device before detaching the bridge.

Don't get me started on that one. The component concept is fine, its 
implementation less so. It's on my (long) to-do list of things to fix.

> 2. proper callbacks from the provider (bridge in this case) to the
> consumer (encoder or drm device). Such callbacks exists for example in
> case of MIPI-DSI panels attached to encoders. On removal
> panel calls mipi_dsi_detach, which calls .detach ops in associated
> encoder, encoder safely disables the video pipeline and the panel
> continue its removal.

*safely* is the keyword here. I have yet to see a solution based on a removal 
notification callback that is both race-free and easy to use in drivers.

> Of course both solutions have some pitfalls, the first one removes whole
> drmdev instead of disabling one pipeline,

The DRM subsystem doesn't have hotplug support (except for displayed connected 
to connectors of course), let alone hot-unplug support. That should be fixed, 
but will be a very long term goal.

Regardless of that, the component framework also relies on a removal 
notification callback, which has the same drawback as the DSI one. Handling 
this in a race-free way is complex. Seeing how drivers fail at simple things 
(such as calling to drm_bridge_detach(), which all but one driver fail to do), 
I'd be surprised if even a single driver got the component unbind handling 
right.

> the second one is specific for DSI bus, but they clearly shows refcounting
> is not the only option.

Sorry, I meant the only working and (more or less) simple option ;-)
Laurent Pinchart Nov. 30, 2016, 9:56 a.m. UTC | #16
Hi Daniel,

On Wednesday 30 Nov 2016 10:12:01 Daniel Vetter wrote:
> On Wed, Nov 30, 2016 at 10:25:57AM +0200, Laurent Pinchart wrote:
> > On Wednesday 30 Nov 2016 09:13:00 Daniel Vetter wrote:
> >> On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> >>> On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> >>>> Store the module that provides the bridge and adjust its refcount
> >>>> accordingly. The bridge module unload should not be allowed while the
> >>>> bridge is attached.
> >>>> 
> >>>> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> >>>> ---
> >>>> 
> >>>>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> >>>>  include/drm/drm_bridge.h     | 11 ++++++++++-
> >>>>  2 files changed, 22 insertions(+), 4 deletions(-)
> >>>> 
> >>>> diff --git a/drivers/gpu/drm/drm_bridge.c
> >>>> b/drivers/gpu/drm/drm_bridge.c
> >>>> index 0ee052b..36d427b 100644
> >>>> --- a/drivers/gpu/drm/drm_bridge.c
> >>>> +++ b/drivers/gpu/drm/drm_bridge.c
> > 
> > [snip]
> > 
> >>>> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev,
> >>>> struct drm_bridge *bridge)
> >>>>  	if (bridge->dev)
> >>>>  		return -EBUSY;
> >>>> 
> >>>> +	if (!try_module_get(bridge->module))
> >>>> +		return -ENOENT;
> >>> 
> >>> Isn't this still racy ? What happens if the module is removed right
> >>> before this call ? Won't the bridge object be freed, and this code then
> >>> try to call try_module_get() on freed memory ?
> >>> 
> >>> To fix this properly I think we need to make the bridge object
> >>> refcounted, with a release callback to signal to the bridge driver that
> >>> memory can be freed. The refcount should be increased in
> >>> of_drm_find_bridge(), and decreased in a new drm_bridge_put() function
> >>> (the "fun" part will be to update drivers to call that :-S).
> >>> 
> >>> The module refcount still needs to be increased in drm_bridge_attach()
> >>> like you do here, but you'll need to protect it with bridge_lock to
> >>> avoid a race between try_module_get() and drm_bridge_remove().
> >> 
> >> Hm right, I thought _attach is the function called directly, but we do
> >> lookup+attach. Might be good to have an of helper which does the
> >> lookup+attach and then drops the reference. We can do that as long as
> >> attach/detach holds onto a reference like the one below.
> >> 
> >> And I don't think we need a new bridge refcount, we can just us
> >> try_module_get/put for that (but wrapped into helpers ofc).
> > 
> > The bridge refcount and the module refcount serve two different purposes.
> > The first one addresses the unbind race by preventing the object from
> > being freed while still referenced, to avoid faulty data memory accesses
> > (note that increasing a module refcount doesn't prevent unbinding the
> > module from the device). The second one addresses the module unload race
> > by preventing code from being unloaded while still being reachable
> > through function pointers, avoiding faulty text memory accesses (as well
> > as data memory accesses to any .data or .rodata section in the module, if
> > those are referenced through pointers in the bridge object).
> > 
> > We thus need both types of refcounting, with the former tied to the lookup
> > operation and the latter to the attach operation (even though we could
> > handle module refcounting at lookup time as well if preferred, but in a
> > well-behaved system the bridge callbacks should not be invoked before
> > attach time).
>
> So you're saying struct drm_bridge should embed a struct device? Or at
> least kobj? Handling that race is one of the reasons we have them ...

I'm thinking about kref + a release callback. The really fun part to solve 
this in the long term will be to teach the DRM core about encoder hot-unplug 
:-) In the meantime the minimum we need is to allow the system to fail safely 
when a bridge is unplugged, which will involve safe recovery from 
.atomic_commit() failures.
Daniel Vetter Nov. 30, 2016, 10:55 a.m. UTC | #17
On Wed, Nov 30, 2016 at 11:56:25AM +0200, Laurent Pinchart wrote:
> Hi Daniel,
> 
> On Wednesday 30 Nov 2016 10:12:01 Daniel Vetter wrote:
> > On Wed, Nov 30, 2016 at 10:25:57AM +0200, Laurent Pinchart wrote:
> > > On Wednesday 30 Nov 2016 09:13:00 Daniel Vetter wrote:
> > >> On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> > >>> On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> > >>>> Store the module that provides the bridge and adjust its refcount
> > >>>> accordingly. The bridge module unload should not be allowed while the
> > >>>> bridge is attached.
> > >>>> 
> > >>>> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> > >>>> ---
> > >>>> 
> > >>>>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> > >>>>  include/drm/drm_bridge.h     | 11 ++++++++++-
> > >>>>  2 files changed, 22 insertions(+), 4 deletions(-)
> > >>>> 
> > >>>> diff --git a/drivers/gpu/drm/drm_bridge.c
> > >>>> b/drivers/gpu/drm/drm_bridge.c
> > >>>> index 0ee052b..36d427b 100644
> > >>>> --- a/drivers/gpu/drm/drm_bridge.c
> > >>>> +++ b/drivers/gpu/drm/drm_bridge.c
> > > 
> > > [snip]
> > > 
> > >>>> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev,
> > >>>> struct drm_bridge *bridge)
> > >>>>  	if (bridge->dev)
> > >>>>  		return -EBUSY;
> > >>>> 
> > >>>> +	if (!try_module_get(bridge->module))
> > >>>> +		return -ENOENT;
> > >>> 
> > >>> Isn't this still racy ? What happens if the module is removed right
> > >>> before this call ? Won't the bridge object be freed, and this code then
> > >>> try to call try_module_get() on freed memory ?
> > >>> 
> > >>> To fix this properly I think we need to make the bridge object
> > >>> refcounted, with a release callback to signal to the bridge driver that
> > >>> memory can be freed. The refcount should be increased in
> > >>> of_drm_find_bridge(), and decreased in a new drm_bridge_put() function
> > >>> (the "fun" part will be to update drivers to call that :-S).
> > >>> 
> > >>> The module refcount still needs to be increased in drm_bridge_attach()
> > >>> like you do here, but you'll need to protect it with bridge_lock to
> > >>> avoid a race between try_module_get() and drm_bridge_remove().
> > >> 
> > >> Hm right, I thought _attach is the function called directly, but we do
> > >> lookup+attach. Might be good to have an of helper which does the
> > >> lookup+attach and then drops the reference. We can do that as long as
> > >> attach/detach holds onto a reference like the one below.
> > >> 
> > >> And I don't think we need a new bridge refcount, we can just us
> > >> try_module_get/put for that (but wrapped into helpers ofc).
> > > 
> > > The bridge refcount and the module refcount serve two different purposes.
> > > The first one addresses the unbind race by preventing the object from
> > > being freed while still referenced, to avoid faulty data memory accesses
> > > (note that increasing a module refcount doesn't prevent unbinding the
> > > module from the device). The second one addresses the module unload race
> > > by preventing code from being unloaded while still being reachable
> > > through function pointers, avoiding faulty text memory accesses (as well
> > > as data memory accesses to any .data or .rodata section in the module, if
> > > those are referenced through pointers in the bridge object).
> > > 
> > > We thus need both types of refcounting, with the former tied to the lookup
> > > operation and the latter to the attach operation (even though we could
> > > handle module refcounting at lookup time as well if preferred, but in a
> > > well-behaved system the bridge callbacks should not be invoked before
> > > attach time).
> >
> > So you're saying struct drm_bridge should embed a struct device? Or at
> > least kobj? Handling that race is one of the reasons we have them ...
> 
> I'm thinking about kref + a release callback. The really fun part to solve 
> this in the long term will be to teach the DRM core about encoder hot-unplug 
> :-) In the meantime the minimum we need is to allow the system to fail safely 
> when a bridge is unplugged, which will involve safe recovery from 
> .atomic_commit() failures.

Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
least only making those hotpluggable will make the uabi story easier since
they're not exposed.
-Daniel
Laurent Pinchart Nov. 30, 2016, 11:03 a.m. UTC | #18
Hi Daniel,

On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
> On Wed, Nov 30, 2016 at 11:56:25AM +0200, Laurent Pinchart wrote:
> > On Wednesday 30 Nov 2016 10:12:01 Daniel Vetter wrote:
> >> On Wed, Nov 30, 2016 at 10:25:57AM +0200, Laurent Pinchart wrote:
> >>> On Wednesday 30 Nov 2016 09:13:00 Daniel Vetter wrote:
> >>>> On Wed, Nov 30, 2016 at 12:06:25AM +0200, Laurent Pinchart wrote:
> >>>>> On Tuesday 29 Nov 2016 23:12:56 Jyri Sarha wrote:
> >>>>>> Store the module that provides the bridge and adjust its refcount
> >>>>>> accordingly. The bridge module unload should not be allowed while
> >>>>>> the bridge is attached.
> >>>>>> 
> >>>>>> Signed-off-by: Jyri Sarha <jsarha@ti.com>
> >>>>>> ---
> >>>>>> 
> >>>>>>  drivers/gpu/drm/drm_bridge.c | 15 ++++++++++++---
> >>>>>>  include/drm/drm_bridge.h     | 11 ++++++++++-
> >>>>>>  2 files changed, 22 insertions(+), 4 deletions(-)
> >>>>>> 
> >>>>>> diff --git a/drivers/gpu/drm/drm_bridge.c
> >>>>>> b/drivers/gpu/drm/drm_bridge.c
> >>>>>> index 0ee052b..36d427b 100644
> >>>>>> --- a/drivers/gpu/drm/drm_bridge.c
> >>>>>> +++ b/drivers/gpu/drm/drm_bridge.c
> >>> 
> >>> [snip]
> >>> 
> >>>>>> @@ -114,6 +118,9 @@ int drm_bridge_attach(struct drm_device *dev,
> >>>>>> struct drm_bridge *bridge)
> >>>>>>  	if (bridge->dev)
> >>>>>>  		return -EBUSY;
> >>>>>> 
> >>>>>> +	if (!try_module_get(bridge->module))
> >>>>>> +		return -ENOENT;
> >>>>> 
> >>>>> Isn't this still racy ? What happens if the module is removed right
> >>>>> before this call ? Won't the bridge object be freed, and this code
> >>>>> then try to call try_module_get() on freed memory ?
> >>>>> 
> >>>>> To fix this properly I think we need to make the bridge object
> >>>>> refcounted, with a release callback to signal to the bridge driver
> >>>>> that memory can be freed. The refcount should be increased in
> >>>>> of_drm_find_bridge(), and decreased in a new drm_bridge_put()
> >>>>> function (the "fun" part will be to update drivers to call that :-S).
> >>>>> 
> >>>>> The module refcount still needs to be increased in
> >>>>> drm_bridge_attach() like you do here, but you'll need to protect it
> >>>>> with bridge_lock to avoid a race between try_module_get() and
> >>>>> drm_bridge_remove().
> >>>> 
> >>>> Hm right, I thought _attach is the function called directly, but we
> >>>> do lookup+attach. Might be good to have an of helper which does the
> >>>> lookup+attach and then drops the reference. We can do that as long as
> >>>> attach/detach holds onto a reference like the one below.
> >>>> 
> >>>> And I don't think we need a new bridge refcount, we can just us
> >>>> try_module_get/put for that (but wrapped into helpers ofc).
> >>> 
> >>> The bridge refcount and the module refcount serve two different
> >>> purposes. The first one addresses the unbind race by preventing the
> >>> object from being freed while still referenced, to avoid faulty data
> >>> memory accesses (note that increasing a module refcount doesn't
> >>> prevent unbinding the module from the device). The second one
> >>> addresses the module unload race by preventing code from being
> >>> unloaded while still being reachable through function pointers,
> >>> avoiding faulty text memory accesses (as well as data memory accesses
> >>> to any .data or .rodata section in the module, if those are referenced
> >>> through pointers in the bridge object).
> >>> 
> >>> We thus need both types of refcounting, with the former tied to the
> >>> lookup operation and the latter to the attach operation (even though we
> >>> could handle module refcounting at lookup time as well if preferred,
> >>> but in a well-behaved system the bridge callbacks should not be invoked
> >>> before attach time).
> >> 
> >> So you're saying struct drm_bridge should embed a struct device? Or at
> >> least kobj? Handling that race is one of the reasons we have them ...
> > 
> > I'm thinking about kref + a release callback. The really fun part to solve
> > this in the long term will be to teach the DRM core about encoder
> > hot-unplug :-) In the meantime the minimum we need is to allow the system
> > to fail safely when a bridge is unplugged, which will involve safe
> > recovery from .atomic_commit() failures.
> 
> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
> least only making those hotpluggable will make the uabi story easier since
> they're not exposed.

Ideally to avoid disabling the whole display engine when one encoder isn't 
available/operational. Right now we're waiting for all pieces to be available 
(using deferred probing or the component framework) before registering the DRM 
device. This means that if one bridge can't be probed successfully for any 
reason we'll end up having not display at all. This includes the case where 
the driver for the bridge is not available. If we could support dynamic 
hotplug of bridges, we could start with a display engine that supports a 
subset of the outputs, and add new outputs as they become operational.

We have a similar issue when unbinding bridge devices from their driver. They 
obviously can't be used anymore, but we have no solution to handle that apart 
from unregistering the DRM device completely, as otherwise rebinding the 
bridge to the driver later can't be handled.
Daniel Vetter Nov. 30, 2016, 1:09 p.m. UTC | #19
On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
> > Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
> > least only making those hotpluggable will make the uabi story easier since
> > they're not exposed.
> 
> Ideally to avoid disabling the whole display engine when one encoder isn't 
> available/operational. Right now we're waiting for all pieces to be available 
> (using deferred probing or the component framework) before registering the DRM 
> device. This means that if one bridge can't be probed successfully for any 
> reason we'll end up having not display at all. This includes the case where 
> the driver for the bridge is not available. If we could support dynamic 
> hotplug of bridges, we could start with a display engine that supports a 
> subset of the outputs, and add new outputs as they become operational.
> 
> We have a similar issue when unbinding bridge devices from their driver. They 
> obviously can't be used anymore, but we have no solution to handle that apart 
> from unregistering the DRM device completely, as otherwise rebinding the 
> bridge to the driver later can't be handled.

This all sounds pretty cool, but does anyone care? Like what's the
real-world use-case here? Some cosmic ray destroyed the bridge driver on
your android phone and now you want it to magically fall back to hdmi that
no one ever plugs in? Or someone misconfigures their kernel and gets
greeted with a black screen, instead of a ... black screen?

I don't even think project ARA would qualify, since I expect a pluggable
screen to show up as a device on the greybus thing, so would get it's own
drm_device and screen sharing is done through prime.

So yeah I'm a bit sarcastic, but imo this is just one item in a massive
list of fairly-theoretical things which are broken in drm and which we
might want to fix when we're all retired and bored. I'm not yet planning
for that ;-) In case you need more:
- Handle drm_device lifetimes and hot-unplug correctly. Actually somewhat
  relevant because udl.
- Add proper locking for connector_list. Atm we have none.
- Fix up the debugfs lifetime disaster.
- Fix up the prime exported buffer lifetime disaster.
- Fix up the dma_fence lifetime disaster.
- ...

Those are all fundamentally broken things which need design-level changes
throughout the subsystem. And in theory you can even hit them, if you try
hard enough unplugging udl constantly. Encoder hotplug isn't even on my
list here ... because there's a _lot_ more broken with driver unbind even
without making bridges and encoders suddenly hot-add capable.

Cheers, Daniel
Andrzej Hajda Dec. 1, 2016, 6:50 a.m. UTC | #20
On 30.11.2016 10:39, Laurent Pinchart wrote:
> Hi Andrzej,
>
> On Wednesday 30 Nov 2016 10:26:24 Andrzej Hajda wrote:
>> On 30.11.2016 09:16, Laurent Pinchart wrote:
>>> On Wednesday 30 Nov 2016 09:11:56 Andrzej Hajda wrote:
>>>> On 29.11.2016 22:12, Jyri Sarha wrote:
>>>>> Store the module that provides the bridge and adjust its refcount
>>>>> accordingly. The bridge module unload should not be allowed while the
>>>>> bridge is attached.
>>>> Module refcounting as a way of preventing driver from unbinding
>>>> is quite popular, but as other developers said already is incorrect
>>>> solution - it does not really prevent from unbinding and is a hack.
>>> Absolutely, module refcounting must not be used as a way to prevent
>>> unbinding, but it's still necessary to prevent functions from
>>> disappearing. The unbinding race has to be fixed through reference
>>> counting to prevent objects from being freed, but if an object contains
>>> function pointers that refer to code part of a module, object refcounting
>>> won't prevent the code from being removed. Only module refcounting helps
>>> there. The two techniques are thus complementary.
>> It is not true. There at least two existing and proper solutions, which
>> do not use refcounting:
>>
>> 1. components - if you put the bridge into component framework, it will
>> bring down drm device before detaching the bridge.
> Don't get me started on that one. The component concept is fine, its 
> implementation less so. It's on my (long) to-do list of things to fix.
>
>> 2. proper callbacks from the provider (bridge in this case) to the
>> consumer (encoder or drm device). Such callbacks exists for example in
>> case of MIPI-DSI panels attached to encoders. On removal
>> panel calls mipi_dsi_detach, which calls .detach ops in associated
>> encoder, encoder safely disables the video pipeline and the panel
>> continue its removal.
> *safely* is the keyword here. I have yet to see a solution based on a removal 
> notification callback that is both race-free and easy to use in drivers.
>
>> Of course both solutions have some pitfalls, the first one removes whole
>> drmdev instead of disabling one pipeline,
> The DRM subsystem doesn't have hotplug support (except for displayed connected 
> to connectors of course), let alone hot-unplug support. That should be fixed, 
> but will be a very long term goal.
>
> Regardless of that, the component framework also relies on a removal 
> notification callback, which has the same drawback as the DSI one. Handling 
> this in a race-free way is complex. Seeing how drivers fail at simple things 
> (such as calling to drm_bridge_detach(), which all but one driver fail to do), 
> I'd be surprised if even a single driver got the component unbind handling 
> right.

Could you be more precise. What exactly prevents DSI or component
framework from being safe, or where do you see big complexity?
It is of course more complex than current 'solution' but as we know
current 'solution' is broken.

On the other side, why do you think refcounting solution is better? Even
if refcounting is done properly, it will generate problems
with handling 'ghost' objects detached from the bridge, what if the
bridge re-appears, etc... Handling all this properly will complicate
code much more.
But most importantly lack of notification to upstream about broken
pipeline looks like broken design.

Regards
Andrzej
Andrzej Hajda Dec. 1, 2016, 7:07 a.m. UTC | #21
On 30.11.2016 14:09, Daniel Vetter wrote:
> On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
>> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
>>> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
>>> least only making those hotpluggable will make the uabi story easier since
>>> they're not exposed.
>> Ideally to avoid disabling the whole display engine when one encoder isn't 
>> available/operational. Right now we're waiting for all pieces to be available 
>> (using deferred probing or the component framework) before registering the DRM 
>> device. This means that if one bridge can't be probed successfully for any 
>> reason we'll end up having not display at all. This includes the case where 
>> the driver for the bridge is not available. If we could support dynamic 
>> hotplug of bridges, we could start with a display engine that supports a 
>> subset of the outputs, and add new outputs as they become operational.
>>
>> We have a similar issue when unbinding bridge devices from their driver. They 
>> obviously can't be used anymore, but we have no solution to handle that apart 
>> from unregistering the DRM device completely, as otherwise rebinding the 
>> bridge to the driver later can't be handled.
> This all sounds pretty cool, but does anyone care? Like what's the
> real-world use-case here? Some cosmic ray destroyed the bridge driver on
> your android phone and now you want it to magically fall back to hdmi that
> no one ever plugs in? Or someone misconfigures their kernel and gets
> greeted with a black screen, instead of a ... black screen?

Real use case is that we need to always load hdmi path drivers at phone
startup just in case somebody will use it.
This way we are wasting space and more importantly boot time, for code
which won't be used by 99% users of phones.
Putting them into modules an loading on MHL/HDMI cable plug-in would be
more optimal, I guess.

Regards
Andrzej
Daniel Vetter Dec. 1, 2016, 7:18 a.m. UTC | #22
On Thu, Dec 01, 2016 at 08:07:29AM +0100, Andrzej Hajda wrote:
> On 30.11.2016 14:09, Daniel Vetter wrote:
> > On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
> >> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
> >>> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
> >>> least only making those hotpluggable will make the uabi story easier since
> >>> they're not exposed.
> >> Ideally to avoid disabling the whole display engine when one encoder isn't 
> >> available/operational. Right now we're waiting for all pieces to be available 
> >> (using deferred probing or the component framework) before registering the DRM 
> >> device. This means that if one bridge can't be probed successfully for any 
> >> reason we'll end up having not display at all. This includes the case where 
> >> the driver for the bridge is not available. If we could support dynamic 
> >> hotplug of bridges, we could start with a display engine that supports a 
> >> subset of the outputs, and add new outputs as they become operational.
> >>
> >> We have a similar issue when unbinding bridge devices from their driver. They 
> >> obviously can't be used anymore, but we have no solution to handle that apart 
> >> from unregistering the DRM device completely, as otherwise rebinding the 
> >> bridge to the driver later can't be handled.
> > This all sounds pretty cool, but does anyone care? Like what's the
> > real-world use-case here? Some cosmic ray destroyed the bridge driver on
> > your android phone and now you want it to magically fall back to hdmi that
> > no one ever plugs in? Or someone misconfigures their kernel and gets
> > greeted with a black screen, instead of a ... black screen?
> 
> Real use case is that we need to always load hdmi path drivers at phone
> startup just in case somebody will use it.
> This way we are wasting space and more importantly boot time, for code
> which won't be used by 99% users of phones.
> Putting them into modules an loading on MHL/HDMI cable plug-in would be
> more optimal, I guess.

Do we have numbers for this? What part of the overhead is the edid probing
and reading, which we probably should optimize either way ... optimize as
in make sure we never ever stall anything for edid reads.

And if you never load the hdmi driver, how do you know when to load it
because the user plugged in the cable?
-Daniel
Andrzej Hajda Dec. 1, 2016, 1:22 p.m. UTC | #23
On 01.12.2016 08:18, Daniel Vetter wrote:
> On Thu, Dec 01, 2016 at 08:07:29AM +0100, Andrzej Hajda wrote:
>> On 30.11.2016 14:09, Daniel Vetter wrote:
>>> On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
>>>> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
>>>>> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
>>>>> least only making those hotpluggable will make the uabi story easier since
>>>>> they're not exposed.
>>>> Ideally to avoid disabling the whole display engine when one encoder isn't 
>>>> available/operational. Right now we're waiting for all pieces to be available 
>>>> (using deferred probing or the component framework) before registering the DRM 
>>>> device. This means that if one bridge can't be probed successfully for any 
>>>> reason we'll end up having not display at all. This includes the case where 
>>>> the driver for the bridge is not available. If we could support dynamic 
>>>> hotplug of bridges, we could start with a display engine that supports a 
>>>> subset of the outputs, and add new outputs as they become operational.
>>>>
>>>> We have a similar issue when unbinding bridge devices from their driver. They 
>>>> obviously can't be used anymore, but we have no solution to handle that apart 
>>>> from unregistering the DRM device completely, as otherwise rebinding the 
>>>> bridge to the driver later can't be handled.
>>> This all sounds pretty cool, but does anyone care? Like what's the
>>> real-world use-case here? Some cosmic ray destroyed the bridge driver on
>>> your android phone and now you want it to magically fall back to hdmi that
>>> no one ever plugs in? Or someone misconfigures their kernel and gets
>>> greeted with a black screen, instead of a ... black screen?
>> Real use case is that we need to always load hdmi path drivers at phone
>> startup just in case somebody will use it.
>> This way we are wasting space and more importantly boot time, for code
>> which won't be used by 99% users of phones.
>> Putting them into modules an loading on MHL/HDMI cable plug-in would be
>> more optimal, I guess.
> Do we have numbers for this?

For number of HDMI/MHL users in mobiles, I have no stats :)
For display boot delay due to deferring hdmi driver is 2.5-3.5 seconds
on peach-pi board for example [1].

[1]:
https://storage.kernelci.org/ulfh/v4.9-rc7-120-g38cdf7e0bfee/arm-exynos_defconfig/lab-baylibre-seattle/boot-exynos5800-peach-pi.html


>  What part of the overhead is the edid probing
> and reading, which we probably should optimize either way ... optimize as
> in make sure we never ever stall anything for edid reads.

As EDID probing should be performed only after detecting sink it seems
irrelevant here.

>
> And if you never load the hdmi driver, how do you know when to load it
> because the user plugged in the cable?

Mobiles often have detection which cable is plugged in. However I am not
sure if kernel sends such events to userspace,
but this should be simple to do.

Regards
Andrzej
Daniel Vetter Dec. 1, 2016, 3:12 p.m. UTC | #24
On Thu, Dec 01, 2016 at 02:22:18PM +0100, Andrzej Hajda wrote:
> On 01.12.2016 08:18, Daniel Vetter wrote:
> > On Thu, Dec 01, 2016 at 08:07:29AM +0100, Andrzej Hajda wrote:
> >> On 30.11.2016 14:09, Daniel Vetter wrote:
> >>> On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
> >>>> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
> >>>>> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
> >>>>> least only making those hotpluggable will make the uabi story easier since
> >>>>> they're not exposed.
> >>>> Ideally to avoid disabling the whole display engine when one encoder isn't 
> >>>> available/operational. Right now we're waiting for all pieces to be available 
> >>>> (using deferred probing or the component framework) before registering the DRM 
> >>>> device. This means that if one bridge can't be probed successfully for any 
> >>>> reason we'll end up having not display at all. This includes the case where 
> >>>> the driver for the bridge is not available. If we could support dynamic 
> >>>> hotplug of bridges, we could start with a display engine that supports a 
> >>>> subset of the outputs, and add new outputs as they become operational.
> >>>>
> >>>> We have a similar issue when unbinding bridge devices from their driver. They 
> >>>> obviously can't be used anymore, but we have no solution to handle that apart 
> >>>> from unregistering the DRM device completely, as otherwise rebinding the 
> >>>> bridge to the driver later can't be handled.
> >>> This all sounds pretty cool, but does anyone care? Like what's the
> >>> real-world use-case here? Some cosmic ray destroyed the bridge driver on
> >>> your android phone and now you want it to magically fall back to hdmi that
> >>> no one ever plugs in? Or someone misconfigures their kernel and gets
> >>> greeted with a black screen, instead of a ... black screen?
> >> Real use case is that we need to always load hdmi path drivers at phone
> >> startup just in case somebody will use it.
> >> This way we are wasting space and more importantly boot time, for code
> >> which won't be used by 99% users of phones.
> >> Putting them into modules an loading on MHL/HDMI cable plug-in would be
> >> more optimal, I guess.
> > Do we have numbers for this?
> 
> For number of HDMI/MHL users in mobiles, I have no stats :)
> For display boot delay due to deferring hdmi driver is 2.5-3.5 seconds
> on peach-pi board for example [1].

That sounds horrible. We load our entire driver in that time, and it has 3
hdmi ports. What exactly is that thing doing for 3 seconds?! Until we know
what's going on I'm not sure it's just a driver that has a dead-slow init
function ...

> [1]:
> https://storage.kernelci.org/ulfh/v4.9-rc7-120-g38cdf7e0bfee/arm-exynos_defconfig/lab-baylibre-seattle/boot-exynos5800-peach-pi.html
> 
> 
> >  What part of the overhead is the edid probing
> > and reading, which we probably should optimize either way ... optimize as
> > in make sure we never ever stall anything for edid reads.
> 
> As EDID probing should be performed only after detecting sink it seems
> irrelevant here.
> 
> >
> > And if you never load the hdmi driver, how do you know when to load it
> > because the user plugged in the cable?
> 
> Mobiles often have detection which cable is plugged in. However I am not
> sure if kernel sends such events to userspace,
> but this should be simple to do.

Well, to do that (at least with drm) you need the driver loaded, or at
least the stuff it supports registered.
-Daniel
Andrzej Hajda Dec. 2, 2016, 6:49 a.m. UTC | #25
On 01.12.2016 16:12, Daniel Vetter wrote:
> On Thu, Dec 01, 2016 at 02:22:18PM +0100, Andrzej Hajda wrote:
>> On 01.12.2016 08:18, Daniel Vetter wrote:
>>> On Thu, Dec 01, 2016 at 08:07:29AM +0100, Andrzej Hajda wrote:
>>>> On 30.11.2016 14:09, Daniel Vetter wrote:
>>>>> On Wed, Nov 30, 2016 at 01:03:20PM +0200, Laurent Pinchart wrote:
>>>>>> On Wednesday 30 Nov 2016 11:55:20 Daniel Vetter wrote:
>>>>>>> Why exactly do you want to hotplug encoders? Or bridges fwiw ... since at
>>>>>>> least only making those hotpluggable will make the uabi story easier since
>>>>>>> they're not exposed.
>>>>>> Ideally to avoid disabling the whole display engine when one encoder isn't 
>>>>>> available/operational. Right now we're waiting for all pieces to be available 
>>>>>> (using deferred probing or the component framework) before registering the DRM 
>>>>>> device. This means that if one bridge can't be probed successfully for any 
>>>>>> reason we'll end up having not display at all. This includes the case where 
>>>>>> the driver for the bridge is not available. If we could support dynamic 
>>>>>> hotplug of bridges, we could start with a display engine that supports a 
>>>>>> subset of the outputs, and add new outputs as they become operational.
>>>>>>
>>>>>> We have a similar issue when unbinding bridge devices from their driver. They 
>>>>>> obviously can't be used anymore, but we have no solution to handle that apart 
>>>>>> from unregistering the DRM device completely, as otherwise rebinding the 
>>>>>> bridge to the driver later can't be handled.
>>>>> This all sounds pretty cool, but does anyone care? Like what's the
>>>>> real-world use-case here? Some cosmic ray destroyed the bridge driver on
>>>>> your android phone and now you want it to magically fall back to hdmi that
>>>>> no one ever plugs in? Or someone misconfigures their kernel and gets
>>>>> greeted with a black screen, instead of a ... black screen?
>>>> Real use case is that we need to always load hdmi path drivers at phone
>>>> startup just in case somebody will use it.
>>>> This way we are wasting space and more importantly boot time, for code
>>>> which won't be used by 99% users of phones.
>>>> Putting them into modules an loading on MHL/HDMI cable plug-in would be
>>>> more optimal, I guess.
>>> Do we have numbers for this?
>> For number of HDMI/MHL users in mobiles, I have no stats :)
>> For display boot delay due to deferring hdmi driver is 2.5-3.5 seconds
>> on peach-pi board for example [1].
> That sounds horrible. We load our entire driver in that time, and it has 3
> hdmi ports. What exactly is that thing doing for 3 seconds?! Until we know
> what's going on I'm not sure it's just a driver that has a dead-slow init
> function ...

As I wrote it is due to deferring probe of hdmi driver. It has nothing
to do with device initialization.

>
>> [1]:
>> https://storage.kernelci.org/ulfh/v4.9-rc7-120-g38cdf7e0bfee/arm-exynos_defconfig/lab-baylibre-seattle/boot-exynos5800-peach-pi.html
>>
>>
>>>  What part of the overhead is the edid probing
>>> and reading, which we probably should optimize either way ... optimize as
>>> in make sure we never ever stall anything for edid reads.
>> As EDID probing should be performed only after detecting sink it seems
>> irrelevant here.
>>
>>> And if you never load the hdmi driver, how do you know when to load it
>>> because the user plugged in the cable?
>> Mobiles often have detection which cable is plugged in. However I am not
>> sure if kernel sends such events to userspace,
>> but this should be simple to do.
> Well, to do that (at least with drm) you need the driver loaded, or at
> least the stuff it supports registered.
> -Daniel

Fortunately, in case of mobiles, cable detection is usually done outside
of DRM
via extcon driver.

Regards
Andrzej
diff mbox

Patch

diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
index 0ee052b..36d427b 100644
--- a/drivers/gpu/drm/drm_bridge.c
+++ b/drivers/gpu/drm/drm_bridge.c
@@ -61,22 +61,26 @@ 
 static LIST_HEAD(bridge_list);
 
 /**
- * drm_bridge_add - add the given bridge to the global bridge list
+ * __drm_bridge_add - add the given bridge to the global bridge list
  *
  * @bridge: bridge control structure
+ * @module: module that provides this bridge
  *
  * RETURNS:
  * Unconditionally returns Zero.
  */
-int drm_bridge_add(struct drm_bridge *bridge)
+int __drm_bridge_add(struct drm_bridge *bridge, struct module *module)
 {
+#ifdef MODULE
+	bridge->module = module;
+#endif
 	mutex_lock(&bridge_lock);
 	list_add_tail(&bridge->list, &bridge_list);
 	mutex_unlock(&bridge_lock);
 
 	return 0;
 }
-EXPORT_SYMBOL(drm_bridge_add);
+EXPORT_SYMBOL(__drm_bridge_add);
 
 /**
  * drm_bridge_remove - remove the given bridge from the global bridge list
@@ -114,6 +118,9 @@  int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge)
 	if (bridge->dev)
 		return -EBUSY;
 
+	if (!try_module_get(bridge->module))
+		return -ENOENT;
+
 	bridge->dev = dev;
 
 	if (bridge->funcs->attach)
@@ -144,6 +151,8 @@  void drm_bridge_detach(struct drm_bridge *bridge)
 	if (bridge->funcs->detach)
 		bridge->funcs->detach(bridge);
 
+	module_put(bridge->module);
+
 	bridge->dev = NULL;
 }
 EXPORT_SYMBOL(drm_bridge_detach);
diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
index 530a1d6..d60d5d2 100644
--- a/include/drm/drm_bridge.h
+++ b/include/drm/drm_bridge.h
@@ -25,6 +25,7 @@ 
 
 #include <linux/list.h>
 #include <linux/ctype.h>
+#include <linux/module.h>
 #include <drm/drm_mode_object.h>
 #include <drm/drm_modes.h>
 
@@ -192,13 +193,21 @@  struct drm_bridge {
 #ifdef CONFIG_OF
 	struct device_node *of_node;
 #endif
+#ifdef MODULE
+	struct module *module;
+#endif
 	struct list_head list;
 
 	const struct drm_bridge_funcs *funcs;
 	void *driver_private;
 };
 
-int drm_bridge_add(struct drm_bridge *bridge);
+int __drm_bridge_add(struct drm_bridge *bridge, struct module *module);
+#ifdef MODULE
+#define drm_bridge_add(bridge) __drm_bridge_add(bridge, THIS_MODULE)
+#else
+#define drm_bridge_add(bridge) __drm_bridge_add(bridge, NULL)
+#endif
 void drm_bridge_remove(struct drm_bridge *bridge);
 struct drm_bridge *of_drm_find_bridge(struct device_node *np);
 int drm_bridge_attach(struct drm_device *dev, struct drm_bridge *bridge);