diff mbox series

[RFT,03/15] drm/ingenic: Call drm_atomic_helper_shutdown() at shutdown time

Message ID 20230901164111.RFT.3.Iea742f06d8bec41598aa40378fc625fbd7e8a3d6@changeid (mailing list archive)
State New, archived
Headers show
Series drm: non-drm-misc drivers call drm_atomic_helper_shutdown() at the right times | expand

Commit Message

Doug Anderson Sept. 1, 2023, 11:41 p.m. UTC
Based on grepping through the source code this driver appears to be
missing a call to drm_atomic_helper_shutdown() at system shutdown
time. Among other things, this means that if a panel is in use, it
won't be cleanly powered off at system shutdown time.

The fact that we should call drm_atomic_helper_shutdown() in the case
of OS shutdown/restart comes straight out of the kernel doc "driver
instance overview" in drm_drv.c.

Since this driver uses the component model and shutdown happens at the
base driver, we communicate whether we have to call
drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.

Suggested-by: Maxime Ripard <mripard@kernel.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
---
This commit is only compile-time tested.

NOTE: this patch touches a lot more than other similar patches since
the bind() function is long and we want to make sure that we unset the
drvdata if bind() fails.

While making this patch, I noticed that the bind() function of this
driver is using "devm" and thus assumes it doesn't need to do much
explicit error handling. That's actually a bug. As per kernel docs [1]
"the lifetime of the aggregate driver does not align with any of the
underlying struct device instances. Therefore devm cannot be used and
all resources acquired or allocated in this callback must be
explicitly released in the unbind callback". Fixing that is outside
the scope of this commit.

[1] https://docs.kernel.org/driver-api/component.html

 drivers/gpu/drm/ingenic/ingenic-drm-drv.c | 66 +++++++++++++++--------
 1 file changed, 44 insertions(+), 22 deletions(-)

Comments

mripard@kernel.org Sept. 4, 2023, 7:36 a.m. UTC | #1
On Fri, 1 Sep 2023 16:41:14 -0700, Douglas Anderson wrote:
> Based on grepping through the source code this driver appears to be
> missing a call to drm_atomic_helper_shutdown() at system shutdown
> time. Among other things, this means that if a panel is in use that it
> won't be cleanly powered off at system shutdown time.
> 
> 
> [ ... ]

Reviewed-by: Maxime Ripard <mripard@kernel.org>

Thanks!
Maxime
Paul Cercueil Sept. 4, 2023, 9:15 a.m. UTC | #2
Hi Douglas,

Le vendredi 01 septembre 2023 à 16:41 -0700, Douglas Anderson a écrit :
> Based on grepping through the source code this driver appears to be
> missing a call to drm_atomic_helper_shutdown() at system shutdown
> time. Among other things, this means that if a panel is in use that
> it
> won't be cleanly powered off at system shutdown time.
> 
> The fact that we should call drm_atomic_helper_shutdown() in the case
> of OS shutdown/restart comes straight out of the kernel doc "driver
> instance overview" in drm_drv.c.
> 
> Since this driver uses the component model and shutdown happens at
> the
> base driver, we communicate whether we have to call
> drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.
> 
> Suggested-by: Maxime Ripard <mripard@kernel.org>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>

LGTM.
Acked-by: Paul Cercueil <paul@crapouillou.net>

> ---
> This commit is only compile-time tested.
> 
> NOTE: this patch touches a lot more than other similar patches since
> the bind() function is long and we want to make sure that we unset
> the
> drvdata if bind() fails.
> 
> While making this patch, I noticed that the bind() function of this
> driver is using "devm" and thus assumes it doesn't need to do much
> explicit error handling. That's actually a bug. As per kernel docs
> [1]
> "the lifetime of the aggregate driver does not align with any of the
> underlying struct device instances. Therefore devm cannot be used and
> all resources acquired or allocated in this callback must be
> explicitly released in the unbind callback". Fixing that is outside
> the scope of this commit.
> 
> [1] https://docs.kernel.org/driver-api/component.html
> 

Noted, thanks.

Cheers,
-Paul

>  drivers/gpu/drm/ingenic/ingenic-drm-drv.c | 66 +++++++++++++++------
> --
>  1 file changed, 44 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
> b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
> index 8dbd4847d3a6..51995a0cd568 100644
> --- a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
> +++ b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
> @@ -1130,7 +1130,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>  
>         ret = drmm_mode_config_init(drm);
>         if (ret)
> -               return ret;
> +               goto err_drvdata;
>  
>         drm->mode_config.min_width = 0;
>         drm->mode_config.min_height = 0;
> @@ -1142,7 +1142,8 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>         base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
>         if (IS_ERR(base)) {
>                 dev_err(dev, "Failed to get memory resource\n");
> -               return PTR_ERR(base);
> +               ret = PTR_ERR(base);
> +               goto err_drvdata;
>         }
>  
>         regmap_config = ingenic_drm_regmap_config;
> @@ -1151,33 +1152,40 @@ static int ingenic_drm_bind(struct device
> *dev, bool has_components)
>                                           &regmap_config);
>         if (IS_ERR(priv->map)) {
>                 dev_err(dev, "Failed to create regmap\n");
> -               return PTR_ERR(priv->map);
> +               ret = PTR_ERR(priv->map);
> +               goto err_drvdata;
>         }
>  
>         irq = platform_get_irq(pdev, 0);
> -       if (irq < 0)
> -               return irq;
> +       if (irq < 0) {
> +               ret = irq;
> +               goto err_drvdata;
> +       }
>  
>         if (soc_info->needs_dev_clk) {
>                 priv->lcd_clk = devm_clk_get(dev, "lcd");
>                 if (IS_ERR(priv->lcd_clk)) {
>                         dev_err(dev, "Failed to get lcd clock\n");
> -                       return PTR_ERR(priv->lcd_clk);
> +                       ret = PTR_ERR(priv->lcd_clk);
> +                       goto err_drvdata;
>                 }
>         }
>  
>         priv->pix_clk = devm_clk_get(dev, "lcd_pclk");
>         if (IS_ERR(priv->pix_clk)) {
>                 dev_err(dev, "Failed to get pixel clock\n");
> -               return PTR_ERR(priv->pix_clk);
> +               ret = PTR_ERR(priv->pix_clk);
> +               goto err_drvdata;
>         }
>  
>         priv->dma_hwdescs = dmam_alloc_coherent(dev,
>                                                 sizeof(*priv-
> >dma_hwdescs),
>                                                 &priv-
> >dma_hwdescs_phys,
>                                                 GFP_KERNEL);
> -       if (!priv->dma_hwdescs)
> -               return -ENOMEM;
> +       if (!priv->dma_hwdescs) {
> +               ret = -ENOMEM;
> +               goto err_drvdata;
> +       }
>  
>         /* Configure DMA hwdesc for foreground0 plane */
>         ingenic_drm_configure_hwdesc_plane(priv, 0);
> @@ -1199,7 +1207,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                                        NULL, DRM_PLANE_TYPE_PRIMARY,
> NULL);
>         if (ret) {
>                 dev_err(dev, "Failed to register plane: %i\n", ret);
> -               return ret;
> +               goto err_drvdata;
>         }
>  
>         if (soc_info->map_noncoherent)
> @@ -1211,7 +1219,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                                         NULL,
> &ingenic_drm_crtc_funcs, NULL);
>         if (ret) {
>                 dev_err(dev, "Failed to init CRTC: %i\n", ret);
> -               return ret;
> +               goto err_drvdata;
>         }
>  
>         drm_crtc_enable_color_mgmt(&priv->crtc, 0, false,
> @@ -1230,7 +1238,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                 if (ret) {
>                         dev_err(dev, "Failed to register overlay
> plane: %i\n",
>                                 ret);
> -                       return ret;
> +                       goto err_drvdata;
>                 }
>  
>                 if (soc_info->map_noncoherent)
> @@ -1241,17 +1249,18 @@ static int ingenic_drm_bind(struct device
> *dev, bool has_components)
>                         if (ret) {
>                                 if (ret != -EPROBE_DEFER)
>                                         dev_err(dev, "Failed to bind
> components: %i\n", ret);
> -                               return ret;
> +                               goto err_drvdata;
>                         }
>  
>                         ret = devm_add_action_or_reset(dev,
> ingenic_drm_unbind_all, priv);
>                         if (ret)
> -                               return ret;
> +                               goto err_drvdata;
>  
>                         priv->ipu_plane = drm_plane_from_index(drm,
> 2);
>                         if (!priv->ipu_plane) {
>                                 dev_err(dev, "Failed to retrieve IPU
> plane\n");
> -                               return -EINVAL;
> +                               ret = -EINVAL;
> +                               goto err_drvdata;
>                         }
>                 }
>         }
> @@ -1263,7 +1272,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                                 break; /* we're done */
>                         if (ret != -EPROBE_DEFER)
>                                 dev_err(dev, "Failed to get bridge
> handle\n");
> -                       return ret;
> +                       goto err_drvdata;
>                 }
>  
>                 if (panel)
> @@ -1275,7 +1284,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                 if (IS_ERR(ib)) {
>                         ret = PTR_ERR(ib);
>                         dev_err(dev, "Failed to init encoder: %d\n",
> ret);
> -                       return ret;
> +                       goto err_drvdata;
>                 }
>  
>                 encoder = &ib->encoder;
> @@ -1290,13 +1299,14 @@ static int ingenic_drm_bind(struct device
> *dev, bool has_components)
>                                         DRM_BRIDGE_ATTACH_NO_CONNECTO
> R);
>                 if (ret) {
>                         dev_err(dev, "Unable to attach bridge\n");
> -                       return ret;
> +                       goto err_drvdata;
>                 }
>  
>                 connector = drm_bridge_connector_init(drm, encoder);
>                 if (IS_ERR(connector)) {
>                         dev_err(dev, "Unable to init connector\n");
> -                       return PTR_ERR(connector);
> +                       ret = PTR_ERR(connector);
> +                       goto err_drvdata;
>                 }
>  
>                 drm_connector_attach_encoder(connector, encoder);
> @@ -1313,13 +1323,13 @@ static int ingenic_drm_bind(struct device
> *dev, bool has_components)
>         ret = devm_request_irq(dev, irq, ingenic_drm_irq_handler, 0,
> drm->driver->name, drm);
>         if (ret) {
>                 dev_err(dev, "Unable to install IRQ handler\n");
> -               return ret;
> +               goto err_drvdata;
>         }
>  
>         ret = drm_vblank_init(drm, 1);
>         if (ret) {
>                 dev_err(dev, "Failed calling drm_vblank_init()\n");
> -               return ret;
> +               goto err_drvdata;
>         }
>  
>         drm_mode_config_reset(drm);
> @@ -1327,7 +1337,7 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>         ret = clk_prepare_enable(priv->pix_clk);
>         if (ret) {
>                 dev_err(dev, "Unable to start pixel clock\n");
> -               return ret;
> +               goto err_drvdata;
>         }
>  
>         if (priv->lcd_clk) {
> @@ -1402,6 +1412,8 @@ static int ingenic_drm_bind(struct device *dev,
> bool has_components)
>                 clk_disable_unprepare(priv->lcd_clk);
>  err_pixclk_disable:
>         clk_disable_unprepare(priv->pix_clk);
> +err_drvdata:
> +       platform_set_drvdata(pdev, NULL);
>         return ret;
>  }
>  
> @@ -1422,6 +1434,7 @@ static void ingenic_drm_unbind(struct device
> *dev)
>  
>         drm_dev_unregister(&priv->drm);
>         drm_atomic_helper_shutdown(&priv->drm);
> +       dev_set_drvdata(dev, NULL);
>  }
>  
>  static const struct component_master_ops ingenic_master_ops = {
> @@ -1461,6 +1474,14 @@ static int ingenic_drm_remove(struct
> platform_device *pdev)
>         return 0;
>  }
>  
> +static void ingenic_drm_shutdown(struct platform_device *pdev)
> +{
> +       struct ingenic_drm *priv = platform_get_drvdata(pdev);
> +
> +       if (priv)
> +               drm_atomic_helper_shutdown(&priv->drm);
> +}
> +
>  static int ingenic_drm_suspend(struct device *dev)
>  {
>         struct ingenic_drm *priv = dev_get_drvdata(dev);
> @@ -1612,6 +1633,7 @@ static struct platform_driver
> ingenic_drm_driver = {
>         },
>         .probe = ingenic_drm_probe,
>         .remove = ingenic_drm_remove,
> +       .shutdown = ingenic_drm_shutdown,
>  };
>  
>  static int ingenic_drm_init(void)
Doug Anderson Sept. 5, 2023, 8:16 p.m. UTC | #3
Paul,

On Mon, Sep 4, 2023 at 2:15 AM Paul Cercueil <paul@crapouillou.net> wrote:
>
> Hi Douglas,
>
> Le vendredi 01 septembre 2023 à 16:41 -0700, Douglas Anderson a écrit :
> > Based on grepping through the source code this driver appears to be
> > missing a call to drm_atomic_helper_shutdown() at system shutdown
> > time. Among other things, this means that if a panel is in use that
> > it
> > won't be cleanly powered off at system shutdown time.
> >
> > The fact that we should call drm_atomic_helper_shutdown() in the case
> > of OS shutdown/restart comes straight out of the kernel doc "driver
> > instance overview" in drm_drv.c.
> >
> > Since this driver uses the component model and shutdown happens at
> > the
> > base driver, we communicate whether we have to call
> > drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.
> >
> > Suggested-by: Maxime Ripard <mripard@kernel.org>
> > Signed-off-by: Douglas Anderson <dianders@chromium.org>
>
> LGTM.
> Acked-by: Paul Cercueil <paul@crapouillou.net>

Thanks for the Ack! Would you expect this patch to land through
"drm-misc", or do you expect it to go through some other tree?
Running:

./scripts/get_maintainer.pl --scm -f drivers/gpu/drm/ingenic/ingenic-drm-drv.c

...does not show that this driver normally goes through drm-misc, but
it also doesn't show that it goes through any other tree so maybe it's
just an artifact of the way it's tagged in the MAINTAINERS file? If
it's fine for this to go through drm-misc, I'll probably land it (with
your Ack and Maxime's Review) sooner rather than later just to make
this patch series less unwieldy.


> > ---
> > This commit is only compile-time tested.
> >
> > NOTE: this patch touches a lot more than other similar patches since
> > the bind() function is long and we want to make sure that we unset
> > the
> > drvdata if bind() fails.
> >
> > While making this patch, I noticed that the bind() function of this
> > driver is using "devm" and thus assumes it doesn't need to do much
> > explicit error handling. That's actually a bug. As per kernel docs
> > [1]
> > "the lifetime of the aggregate driver does not align with any of the
> > underlying struct device instances. Therefore devm cannot be used and
> > all resources acquired or allocated in this callback must be
> > explicitly released in the unbind callback". Fixing that is outside
> > the scope of this commit.
> >
> > [1] https://docs.kernel.org/driver-api/component.html
> >
>
> Noted, thanks.

FWIW, I think that at least a few other DRM drivers handle this by
doing some of their resource allocation / acquiring in the probe()
function and then only doing things in the bind() that absolutely need
to be in the bind. ;-)


-Doug
mripard@kernel.org Sept. 6, 2023, 8:39 a.m. UTC | #4
On Tue, Sep 05, 2023 at 01:16:08PM -0700, Doug Anderson wrote:
> > > ---
> > > This commit is only compile-time tested.
> > >
> > > NOTE: this patch touches a lot more than other similar patches since
> > > the bind() function is long and we want to make sure that we unset
> > > the
> > > drvdata if bind() fails.
> > >
> > > While making this patch, I noticed that the bind() function of this
> > > driver is using "devm" and thus assumes it doesn't need to do much
> > > explicit error handling. That's actually a bug. As per kernel docs
> > > [1]
> > > "the lifetime of the aggregate driver does not align with any of the
> > > underlying struct device instances. Therefore devm cannot be used and
> > > all resources acquired or allocated in this callback must be
> > > explicitly released in the unbind callback". Fixing that is outside
> > > the scope of this commit.
> > >
> > > [1] https://docs.kernel.org/driver-api/component.html
> > >
> >
> > Noted, thanks.
> 
> FWIW, I think that at least a few other DRM drivers handle this by
> doing some of their resource allocation / acquiring in the probe()
> function and then only doing things in the bind() that absolutely need
> to be in the bind. ;-)

That doesn't change much. The fundamental issue is that the DRM device
sticks around until the last application that has an open fd to it
closes it.

So it doesn't have any relationship with the unbind/remove timing, and
for all we know it can be there indefinitely, while the application
continues to interact with the driver.

Maxime
Doug Anderson Sept. 13, 2023, 4:23 p.m. UTC | #5
Hi,

On Wed, Sep 6, 2023 at 1:39 AM Maxime Ripard <mripard@kernel.org> wrote:
>
> On Tue, Sep 05, 2023 at 01:16:08PM -0700, Doug Anderson wrote:
> > > > ---
> > > > This commit is only compile-time tested.
> > > >
> > > > NOTE: this patch touches a lot more than other similar patches since
> > > > the bind() function is long and we want to make sure that we unset
> > > > the
> > > > drvdata if bind() fails.
> > > >
> > > > While making this patch, I noticed that the bind() function of this
> > > > driver is using "devm" and thus assumes it doesn't need to do much
> > > > explicit error handling. That's actually a bug. As per kernel docs
> > > > [1]
> > > > "the lifetime of the aggregate driver does not align with any of the
> > > > underlying struct device instances. Therefore devm cannot be used and
> > > > all resources acquired or allocated in this callback must be
> > > > explicitly released in the unbind callback". Fixing that is outside
> > > > the scope of this commit.
> > > >
> > > > [1] https://docs.kernel.org/driver-api/component.html
> > > >
> > >
> > > Noted, thanks.
> >
> > FWIW, I think that at least a few other DRM drivers handle this by
> > doing some of their resource allocation / acquiring in the probe()
> > function and then only doing things in the bind() that absolutely need
> > to be in the bind. ;-)
>
> That doesn't change much. The fundamental issue is that the DRM device
> sticks around until the last application that has an open fd to it
> closes it.
>
> So it doesn't have any relationship with the unbind/remove timing, and
> for all we know it can be there indefinitely, while the application
> continues to interact with the driver.

I spent some time thinking about similar issues recently and, assuming
my understanding is correct, I'd at least partially disagree.

Specifically, I _think_ the only thing that's truly required to remain
valid until userspace closes the last open "fd" is the memory for the
"struct drm_device" itself, right? My understanding is that this is
similar to how "struct device" works. The memory backing a "struct
device" has to live until the last client releases a reference to it
even if everything else about a device has gone away. So if it was all
working perfectly then if the Linux driver backing the "struct
drm_device" goes away then we'd release resources and NULL out a bunch
of stuff in the "struct drm_device" but still keep the actual "struct
drm_device" around since userspace still has a reference. Pretty much
all userspace calls would fail, but at least they wouldn't crash. Is
that roughly the gist?

Assuming that's correct, then _most_ of the resource acquiring /
memory allocation can still happen in the device probe() routine and
can still use devm as long as we do something to ensure that any
resources released are no longer pointed to by anything in the "struct
drm_device".

To make it concrete, I think we want this (feel free to correct). For
simplicity, I'm assuming a driver that _doesn't_ use the component
framework:

a) Linux driver probe() happens. The "struct drm_device" is allocated
in probe() by devm_drm_dev_alloc(). This takes a reference to the
"struct drm_device". The device also acquires resources / allocates
memory.

b) Userspace acquires a reference to the "struct drm_device". Refcount
is now 2 (one from userspace, one from the Linux driver).

c) The Linux driver unbinds, presumably because userspace requested
it. From earlier I think we decided that we can't (by design) block
unbind. Once unbind happens then we shouldn't try to keep operating
the device and the driver should stop running. As part of the unbind,
the remove() is called and also "devm" resources are deallocated. If
any of the things freed are pointed to by the "struct drm_device" then
the code needs to NULL them out at this time. Also we should make sure
that any callback functions that userspace could cause to be invoked
return errors. Our code could go away at any point here since
userspace could "rmmod" our module.

d) Eventually userspace releases the reference and the "struct
drm_device" memory gets automatically freed because it was allocated
by devm_drm_dev_alloc()


NOTE: potentially some things could be allocated / managed by
drmm_xyz() functions, like drmm_kmalloc(), and that could simplify some
things. However, it's not a panacea for everything. Specifically once
the Linux driver unbind finishes then the device isn't functional
anymore.



-Doug
Doug Anderson Sept. 13, 2023, 4:25 p.m. UTC | #6
Hi,

On Tue, Sep 5, 2023 at 1:16 PM Doug Anderson <dianders@chromium.org> wrote:
>
> Paul,
>
> On Mon, Sep 4, 2023 at 2:15 AM Paul Cercueil <paul@crapouillou.net> wrote:
> >
> > Hi Douglas,
> >
> > Le vendredi 01 septembre 2023 à 16:41 -0700, Douglas Anderson a écrit :
> > > Based on grepping through the source code this driver appears to be
> > > missing a call to drm_atomic_helper_shutdown() at system shutdown
> > > time. Among other things, this means that if a panel is in use that
> > > it
> > > won't be cleanly powered off at system shutdown time.
> > >
> > > The fact that we should call drm_atomic_helper_shutdown() in the case
> > > of OS shutdown/restart comes straight out of the kernel doc "driver
> > > instance overview" in drm_drv.c.
> > >
> > > Since this driver uses the component model and shutdown happens at
> > > the
> > > base driver, we communicate whether we have to call
> > > drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.
> > >
> > > Suggested-by: Maxime Ripard <mripard@kernel.org>
> > > Signed-off-by: Douglas Anderson <dianders@chromium.org>
> >
> > LGTM.
> > Acked-by: Paul Cercueil <paul@crapouillou.net>
>
> Thanks for the Ack! Would you expect this patch to land through
> "drm-misc", or do you expect it to go through some other tree?
> Running:
>
> ./scripts/get_maintainer.pl --scm -f drivers/gpu/drm/ingenic/ingenic-drm-drv.c
>
> ...does not show that this driver normally goes through drm-misc, but
> it also doesn't show that it goes through any other tree so maybe it's
> just an artifact of the way it's tagged in the MAINTAINERS file? If
> it's fine for this to go through drm-misc, I'll probably land it (with
> your Ack and Maxime's Review) sooner rather than later just to make
> this patch series less unwieldy.
>
>
> > > ---
> > > This commit is only compile-time tested.
> > >
> > > NOTE: this patch touches a lot more than other similar patches since
> > > the bind() function is long and we want to make sure that we unset
> > > the
> > > drvdata if bind() fails.
> > >
> > > While making this patch, I noticed that the bind() function of this
> > > driver is using "devm" and thus assumes it doesn't need to do much
> > > explicit error handling. That's actually a bug. As per kernel docs
> > > [1]
> > > "the lifetime of the aggregate driver does not align with any of the
> > > underlying struct device instances. Therefore devm cannot be used and
> > > all resources acquired or allocated in this callback must be
> > > explicitly released in the unbind callback". Fixing that is outside
> > > the scope of this commit.
> > >
> > > [1] https://docs.kernel.org/driver-api/component.html
> > >
> >
> > Noted, thanks.
>
> FWIW, I think that at least a few other DRM drivers handle this by
> doing some of their resource allocation / acquiring in the probe()
> function and then only doing things in the bind() that absolutely need
> to be in the bind. ;-)

I've been collecting patches that are ready to land in drm-misc but,
right now, I'm not taking this patch since I didn't get any
clarification of whether it should land through drm-misc or somewhere
else.

-Doug
Paul Cercueil Sept. 13, 2023, 6 p.m. UTC | #7
Hi Doug,

Le mercredi 13 septembre 2023 à 09:25 -0700, Doug Anderson a écrit :
> Hi,
> 
> On Tue, Sep 5, 2023 at 1:16 PM Doug Anderson <dianders@chromium.org>
> wrote:
> > 
> > Paul,
> > 
> > On Mon, Sep 4, 2023 at 2:15 AM Paul Cercueil <paul@crapouillou.net>
> > wrote:
> > > 
> > > Hi Douglas,
> > > 
> > > Le vendredi 01 septembre 2023 à 16:41 -0700, Douglas Anderson a
> > > écrit :
> > > > Based on grepping through the source code this driver appears
> > > > to be
> > > > missing a call to drm_atomic_helper_shutdown() at system
> > > > shutdown
> > > > time. Among other things, this means that if a panel is in use
> > > > that
> > > > it
> > > > won't be cleanly powered off at system shutdown time.
> > > > 
> > > > The fact that we should call drm_atomic_helper_shutdown() in
> > > > the case
> > > > of OS shutdown/restart comes straight out of the kernel doc
> > > > "driver
> > > > instance overview" in drm_drv.c.
> > > > 
> > > > Since this driver uses the component model and shutdown happens
> > > > at
> > > > the
> > > > base driver, we communicate whether we have to call
> > > > drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.
> > > > 
> > > > Suggested-by: Maxime Ripard <mripard@kernel.org>
> > > > Signed-off-by: Douglas Anderson <dianders@chromium.org>
> > > 
> > > LGTM.
> > > Acked-by: Paul Cercueil <paul@crapouillou.net>
> > 
> > Thanks for the Ack! Would you expect this patch to land through
> > "drm-misc", or do you expect it to go through some other tree?
> > Running:
> > 
> > ./scripts/get_maintainer.pl --scm -f
> > drivers/gpu/drm/ingenic/ingenic-drm-drv.c
> > 
> > ...does not show that this driver normally goes through drm-misc,
> > but
> > it also doesn't show that it goes through any other tree so maybe
> > it's
> > just an artifact of the way it's tagged in the MAINTAINERS file? If
> > it's fine for this to go through drm-misc, I'll probably land it
> > (with
> > your Ack and Maxime's Review) sooner rather than later just to make
> > this patch series less unwieldy.
> > 
> > 
> > > > ---
> > > > This commit is only compile-time tested.
> > > > 
> > > > NOTE: this patch touches a lot more than other similar patches
> > > > since
> > > > the bind() function is long and we want to make sure that we
> > > > unset
> > > > the
> > > > drvdata if bind() fails.
> > > > 
> > > > While making this patch, I noticed that the bind() function of
> > > > this
> > > > driver is using "devm" and thus assumes it doesn't need to do
> > > > much
> > > > explicit error handling. That's actually a bug. As per kernel
> > > > docs
> > > > [1]
> > > > "the lifetime of the aggregate driver does not align with any
> > > > of the
> > > > underlying struct device instances. Therefore devm cannot be
> > > > used and
> > > > all resources acquired or allocated in this callback must be
> > > > explicitly released in the unbind callback". Fixing that is
> > > > outside
> > > > the scope of this commit.
> > > > 
> > > > [1] https://docs.kernel.org/driver-api/component.html
> > > > 
> > > 
> > > Noted, thanks.
> > 
> > FWIW, I think that at least a few other DRM drivers handle this by
> > doing some of their resource allocation / acquiring in the probe()
> > function and then only doing things in the bind() that absolutely
> > need
> > to be in the bind. ;-)
> 
> I've been collecting patches that are ready to land in drm-misc but,
> right now, I'm not taking this patch since I didn't get any
> clarification of whether it should land through drm-misc or somewhere
> else.

Sorry, you can take it in drm-misc, yes.

Cheers,
-Paul
Doug Anderson Sept. 13, 2023, 6:21 p.m. UTC | #8
Hi,

On Fri, Sep 1, 2023 at 4:42 PM Douglas Anderson <dianders@chromium.org> wrote:
>
> Based on grepping through the source code this driver appears to be
> missing a call to drm_atomic_helper_shutdown() at system shutdown
> time. Among other things, this means that if a panel is in use that it
> won't be cleanly powered off at system shutdown time.
>
> The fact that we should call drm_atomic_helper_shutdown() in the case
> of OS shutdown/restart comes straight out of the kernel doc "driver
> instance overview" in drm_drv.c.
>
> Since this driver uses the component model and shutdown happens at the
> base driver, we communicate whether we have to call
> drm_atomic_helper_shutdown() by seeing if drvdata is non-NULL.
>
> Suggested-by: Maxime Ripard <mripard@kernel.org>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>
> ---
> This commit is only compile-time tested.
>
> NOTE: this patch touches a lot more than other similar patches since
> the bind() function is long and we want to make sure that we unset the
> drvdata if bind() fails.
>
> While making this patch, I noticed that the bind() function of this
> driver is using "devm" and thus assumes it doesn't need to do much
> explicit error handling. That's actually a bug. As per kernel docs [1]
> "the lifetime of the aggregate driver does not align with any of the
> underlying struct device instances. Therefore devm cannot be used and
> all resources acquired or allocated in this callback must be
> explicitly released in the unbind callback". Fixing that is outside
> the scope of this commit.
>
> [1] https://docs.kernel.org/driver-api/component.html
>
>  drivers/gpu/drm/ingenic/ingenic-drm-drv.c | 66 +++++++++++++++--------
>  1 file changed, 44 insertions(+), 22 deletions(-)

[ ... cut ... ]

> @@ -1612,6 +1633,7 @@ static struct platform_driver ingenic_drm_driver = {
>         },
>         .probe = ingenic_drm_probe,
>         .remove = ingenic_drm_remove,
> +       .shutdown = ingenic_drm_shutdown,

I resolved the trivial conflict with commit 2b9b0a9fc548
("drm/ingenic: Convert to platform remove callback returning void"),
then pushed to drm-misc-next:

c3ca98396ffa (HEAD -> drm-misc-next) drm/ingenic: Call
drm_atomic_helper_shutdown() at shutdown time
mripard@kernel.org Sept. 14, 2023, 8:14 a.m. UTC | #9
Hi,

On Wed, Sep 13, 2023 at 09:23:29AM -0700, Doug Anderson wrote:
> On Wed, Sep 6, 2023 at 1:39 AM Maxime Ripard <mripard@kernel.org> wrote:
> > On Tue, Sep 05, 2023 at 01:16:08PM -0700, Doug Anderson wrote:
> > > > > This commit is only compile-time tested.
> > > > >
> > > > > NOTE: this patch touches a lot more than other similar patches since
> > > > > the bind() function is long and we want to make sure that we unset
> > > > > the
> > > > > drvdata if bind() fails.
> > > > >
> > > > > While making this patch, I noticed that the bind() function of this
> > > > > driver is using "devm" and thus assumes it doesn't need to do much
> > > > > explicit error handling. That's actually a bug. As per kernel docs
> > > > > [1]
> > > > > "the lifetime of the aggregate driver does not align with any of the
> > > > > underlying struct device instances. Therefore devm cannot be used and
> > > > > all resources acquired or allocated in this callback must be
> > > > > explicitly released in the unbind callback". Fixing that is outside
> > > > > the scope of this commit.
> > > > >
> > > > > [1] https://docs.kernel.org/driver-api/component.html
> > > > >
> > > >
> > > > Noted, thanks.
> > >
> > > FWIW, I think that at least a few other DRM drivers handle this by
> > > doing some of their resource allocation / acquiring in the probe()
> > > function and then only doing things in the bind() that absolutely need
> > > to be in the bind. ;-)
> >
> > That doesn't change much. The fundamental issue is that the DRM device
> > sticks around until the last application that has an open fd to it
> > closes it.
> >
> > So it doesn't have any relationship with the unbind/remove timing, and
> > for all we know it can be there indefinitely, while the application
> > continues to interact with the driver.
> 
> I spent some time thinking about similar issues recently and, assuming
> my understanding is correct, I'd at least partially disagree.
> 
> Specifically, I _think_ the only thing that's truly required to remain
> valid until userspace closes the last open "fd" is the memory for the
> "struct drm_device" itself, right? My understanding is that this is
> similar to how "struct device" works. The memory backing a "struct
> device" has to live until the last client releases a reference to it
> even if everything else about a device has gone away. So if it was all
> working perfectly then if the Linux driver backing the "struct
> drm_device" goes away then we'd release resources and NULL out a bunch
> of stuff in the "struct drm_device" but still keep the actual "struct
> drm_device" around since userspace still has a reference. Pretty much
> all userspace calls would fail, but at least they wouldn't crash. Is
> that roughly the gist?

Yes, but also, no.

In spirit, you're right. However, there are three things interfering
here:

  - You don't always have a match between device and KMS entity. Display
    pipelines are usually multiple devices working together, and while
    you probably have a 1:1 relationship with bridges and panels (and to
    some extent encoders/connectors), the planes and framebuffers for
    example are a mess :) So, if the device backing the planes is to be
    removed, what are you removing exactly? All of the planes and
    framebuffers? Do you free the buffers allocated by the userspace
    (that it might still use?)?

  - In addition to that, KMS doesn't deal with individual entities being
    hotplugged so neither the subsystem nor the application expect to
    have a connector being removed.

  - ioctl's aren't filtered once the device is starting to get removed
    on most drivers.

So due to 1 and 2, we can't really partially remove components unless
the application is aware of it, and it doesn't expect to. And most
drivers still allow (probably unwillingly though) the application to
call ioctls once the DRM device has lost at least one of its backing
devices.

> Assuming that's correct, then _most_ of the resource acquiring /
> memory allocation can still happen in the device probe() routine and
> can still use devm as long as we do something to ensure that any
> resources released are no longer pointed to by anything in the "struct
> drm_device".
> 
> To make it concrete, I think we want this (feel free to correct). For
> simplicity, I'm assuming a driver that _doesn't_ use the component
> framework:
> 
> a) Linux driver probe() happens. The "struct drm_device" is allocated
> in probe() by devm_drm_dev_alloc(). This takes a reference to the
> "struct drm_device". The device also acquires resources / allocates
> memory.

You need to differentiate resources and allocations there. Resources can
be expected to go away at the same time as the device, so using devm
is fine. Allocations are largely disconnected from the device lifetime,
and using devm for them leads to use-after-free (UAF).

> b) Userspace acquires a reference to the "struct drm_device". Refcount
> is now 2 (one from userspace, one from the Linux driver).
> 
> c) The Linux driver unbinds, presumably because userspace requested
> it. From earlier I think we decided that we can't (by design) block
> unbind. Once unbind happens then we shouldn't try to keep operating
> the device

That part is correct, because the resources aren't there anymore.

> the driver should stop running.

But for the reasons above, the driver needs to still operate (in a
degraded mode).

> As part of the unbind, the remove() is called and also "devm"
> resources are deallocated. If any of the things freed are pointed to
> by the "struct drm_device" then the code needs to NULL them out at
> this time.

Right, we also need to make sure we don't access any of the resources
that got freed. This is typically done by protecting all the accesses
with drm_dev_enter/drm_dev_exit.

> Also we should make sure that any callback functions that userspace
> could cause to be invoked return errors.

That would prevent any new ioctl from occurring after the device has been
removed, but that doesn't fix the race condition if it's removed while
there's a commit happening. This is further complicated by the fact that
commits can be queued (so you would have multiple submitted already) or
made asynchronous.

> Our code could go away at any point here since userspace could "rmmod"
> our module.

Yeah, we probably have a bug there. Boris also reported something like
that recently where if you add an action with drmm_add_action, and then
remove the module, the function would have been free'd by the time it
executes.

> d) Eventually userspace releases the reference and the "struct
> drm_device" memory gets automatically freed because it was allocated
> by devm_drm_dev_alloc()

It was allocated by devm_drm_dev_alloc() but wasn't by devm_kzalloc().
devm_drm_dev_alloc() will "only" register an action to put back its
reference, but any application that opens the DRM device file will take
a reference as well (through drm_minor_acquire()).

So it's not freed at device_release_all() time, but when the last
reference is given back which could happen much later.

> NOTE: potentially some things could be allocated / managed by
> drmm_xyz() function, like drmm_kmalloc() and that could simplify some
> things.

The general rule is that any allocation needed for the framework
interactions needs to be allocated by drmm, while any allocation/resource
needed to operate the device needs to be allocated by devm.

> However, it's not a panacea for everything. Specifically once
> the Linux driver unbind finishes then the device isn't functional
> anymore.

What's wrong with it then?

Maxime
Doug Anderson Sept. 14, 2023, 10:29 p.m. UTC | #10
Hi,

On Thu, Sep 14, 2023 at 1:14 AM Maxime Ripard <mripard@kernel.org> wrote:
>
> > > So it doesn't have any relationship with the unbind/remove timing, and
> > > for all we know it can be there indefinitely, while the application
> > > continues to interact with the driver.
> >
> > I spent some time thinking about similar issues recently and, assuming
> > my understanding is correct, I'd at least partially disagree.
> >
> > Specifically, I _think_ the only thing that's truly required to remain
> > valid until userspace closes the last open "fd" is the memory for the
> > "struct drm_device" itself, right? My understanding is that this is
> > similar to how "struct device" works. The memory backing a "struct
> > device" has to live until the last client releases a reference to it
> > even if everything else about a device has gone away. So if it was all
> > working perfectly then if the Linux driver backing the "struct
> > drm_device" goes away then we'd release resources and NULL out a bunch
> > of stuff in the "struct drm_device" but still keep the actual "struct
> > drm_device" around since userspace still has a reference. Pretty much
> > all userspace calls would fail, but at least they wouldn't crash. Is
> > that roughly the gist?
>
> Yes, but also, no.
>
> In the spirit, you're right. However, there's three things interfering
> here:
>
>   - You don't always have a match between device and KMS entity. Display
>     pipelines are usually multiple devices working together, and while
>     you probably have a 1:1 relationship with bridges and panels (and to
>     some extent encoders/connectors), the planes and framebuffers for
>     example are a mess :) So, if the device backing the planes is to be
>     removed, what are you removing exactly? All of the planes and
>     framebuffers? Do you free the buffers allocated by the userspace
>     (that it might still use?)?
>
>   - In addition to that, KMS doesn't deal with individual entities being
>     hotplugged so neither the subsystem nor the application expect to
>     have a connector being removed.
>
>   - ioctl's aren't filtered once the device is starting to get removed
>     on most drivers.
>
> So due to 1 and 2, we can't really partially remove components unless
> the application is aware of it, and it doesn't expect to. And most
> drivers still allow (probably unwillingly though) the application to
> call ioctls once the DRM device has lost at least one of its backing
> devices.

We "can't", but we "can", right? Userspace can freely unbind a driver.
Unless you want to dig into if the community would allow a driver to
block "unbind" then we have to, at the very least, not crash the
kernel when userspace does this. Ideally we'd have something more
elegant than just "don't crash the kernel", but at least we shouldn't
crash.


> > Assuming that's correct, then _most_ of the resource acquiring /
> > memory allocation can still happen in the device probe() routine and
> > can still use devm as long as we do something to ensure that any
> > resources released are no longer pointed to by anything in the "struct
> > drm_device".
> >
> > To make it concrete, I think we want this (feel free to correct). For
> > simplicity, I'm assuming a driver that _doesn't_ use the component
> > framework:
> >
> > a) Linux driver probe() happens. The "struct drm_device" is allocated
> > in probe() by devm_drm_dev_alloc(). This takes a reference to the
> > "struct drm_device". The device also acquires resources / allocates
> > memory.
>
> You need to differentiate resources and allocations there. Resources can
> be expected to go away at the same time than the device, so using devm
> is fine. Allocations are largely disconnected from the device lifetime,
> and using devm leads to UAF.

Right. I think my original point was looking at "ingenic-drm-drv.c".
Much of the "devm" stuff there is resources and those specific things
could be moved to probe() instead of bind(), right?

For allocations, I think you'd have to look at each allocation. If the
allocation needed to live as long as the "struct drm_device" then devm
is clearly the wrong choice. ...but not every allocation needs to live
that long. Also, even if in the "simple" case allocations need to live
as long as a "struct drm_device", it's possible that there are some
cases where there's only an indirect reference to the memory. In that
case, you could NULL out the indirect reference and then free it.
Obviously someone would need to take care here.


> > b) Userspace acquires a reference to the "struct drm_device". Refcount
> > is now 2 (one from userspace, one from the Linux driver).
> >
> > c) The Linux driver unbinds, presumably because userspace requested
> > it. From earlier I think we decided that we can't (by design) block
> > unbind. Once unbind happens then we shouldn't try to keep operating
> > the device
>
> That part is correct, because the resources aren't there anymore.
>
> > the driver should stop running.
>
> But for the reasons above, the driver needs to still operate (in a
> degraded mode).

So I think here is where the disconnect is from our viewpoints. IMO
when a Linux driver is unbound then it makes no sense to try to
operate the device in "a degraded mode". When a Linux driver is
unbound then it should be releasing all of the resources from the
device (iomaps, IRQs, regulators, GPIOs, etc). That's just what
unbinding a driver is supposed to do.

I understand what you're saying above about display pipelines being
multiple Linux drivers working together and that it doesn't make lots
of sense to just unbind a random Linux device driver in the middle of
things. ...and I don't really have a simple/great answer for how to do
something super elegant if userspace tries to just randomly unbind one
of the many drivers in an active display pipeline.


> > As part of the unbind, the remove() is called and also "devm"
> > resources are deallocated. If any of the things freed are pointed to
> > by the "struct drm_device" then the code needs to NULL them out at
> > this time.
>
> Right, we also need to make sure we don't access any of the resources
> that got freed. This is typically done by protecting all the accesses
> with drm_dev_enter/drm_dev_exit.
>
> > Also we should make sure that any callback functions that userspace
> > could cause to be invoked return errors.
>
> That would prevent any new ioctl from occuring after the device has been
> removed, but that doesn't fix the race condition if it's removed while
> there's a commit happening. This is further complicated by the fact that
> commits can be queued (so you would have multiple submitted already) or
> made asynchronous.

I guess I would have expected that the remove() callback in the device
would prevent new commits from starting and then block waiting until
any in-progress commits were finished? ...kinda like how drivers call
del_timer_sync() in their remove functions...


> > Our code could go away at any point here since userspace could "rmmod"
> > our module.
>
> Yeah, we probably have a bug there. Boris also reported something like
> that recently where if you add an action with drmm_add_action, and then
> remove the module, the function would have been free'd by the time it
> executes.

I'm fairly certain that you can prevent a module from being unloaded
by just grabbing a refcount to it. However, I'm not sure that's the
right solution. If we're trying to run driver code after a driver has
been unbound then, IMO, that's the bug.


> > However, it's not a panacea for everything. Specifically once
> > the Linux driver unbind finishes then the device isn't functional
> > anymore.
>
> What's wrong with it then?

I'm mostly just saying don't just search-and-replace "devm" with
"drmm" in your driver and call it done. You need to think carefully
about which things have which lifetime.

---

Ironically, while digging into this I'm tempted to take back my
original request. Despite the kernel docs I pointed at [1], it
actually looks like it might be fine to use "devm" within a
component's bind() function. In try_to_bring_up_aggregate_device() it
seems like the code is opening up a nested "devres" group specifically
to allow this to work. A little bit of testing that I did with this
shows that, indeed, the nesting seems to be working. Am I missing
something here?

[1] https://docs.kernel.org/driver-api/component.html
mripard@kernel.org Sept. 19, 2023, 9:33 a.m. UTC | #11
On Thu, Sep 14, 2023 at 03:29:16PM -0700, Doug Anderson wrote:
> Hi,
> 
> On Thu, Sep 14, 2023 at 1:14 AM Maxime Ripard <mripard@kernel.org> wrote:
> >
> > > > So it doesn't have any relationship with the unbind/remove timing, and
> > > > for all we know it can be there indefinitely, while the application
> > > > continues to interact with the driver.
> > >
> > > I spent some time thinking about similar issues recently and, assuming
> > > my understanding is correct, I'd at least partially disagree.
> > >
> > > Specifically, I _think_ the only thing that's truly required to remain
> > > valid until userspace closes the last open "fd" is the memory for the
> > > "struct drm_device" itself, right? My understanding is that this is
> > > similar to how "struct device" works. The memory backing a "struct
> > > device" has to live until the last client releases a reference to it
> > > even if everything else about a device has gone away. So if it was all
> > > working perfectly then if the Linux driver backing the "struct
> > > drm_device" goes away then we'd release resources and NULL out a bunch
> > > of stuff in the "struct drm_device" but still keep the actual "struct
> > > drm_device" around since userspace still has a reference. Pretty much
> > > all userspace calls would fail, but at least they wouldn't crash. Is
> > > that roughly the gist?
> >
> > Yes, but also, no.
> >
> > In the spirit, you're right. However, there's three things interfering
> > here:
> >
> >   - You don't always have a match between device and KMS entity. Display
> >     pipelines are usually multiple devices working together, and while
> >     you probably have a 1:1 relationship with bridges and panels (and to
> >     some extent encoders/connectors), the planes and framebuffers for
> >     example are a mess :) So, if the device backing the planes is to be
> >     removed, what are you removing exactly? All of the planes and
> >     framebuffers? Do you free the buffers allocated by the userspace
> >     (that it might still use?)?
> >
> >   - In addition to that, KMS doesn't deal with individual entities being
> >     hotplugged so neither the subsystem nor the application expect to
> >     have a connector being removed.
> >
> >   - ioctl's aren't filtered once the device is starting to get removed
> >     on most drivers.
> >
> > So due to 1 and 2, we can't really partially remove components unless
> > the application is aware of it, and it doesn't expect to. And most
> > drivers still allow (probably unwillingly though) the application to
> > call ioctls once the DRM device has lost at least one of its backing
> > devices.
> 
> We "can't", but we "can", right? Userspace can freely unbind a driver.
> Unless you want to dig into if the community would allow a driver to
> block "unbind" then we have to, at the very least, not crash the
> kernel when userspace does this. Ideally we'd have something more
> elegant than just "don't crash the kernel", but at least we shouldn't
> crash.

I'm not sure what you mean here, sorry

> > > Assuming that's correct, then _most_ of the resource acquiring /
> > > memory allocation can still happen in the device probe() routine and
> > > can still use devm as long as we do something to ensure that any
> > > resources released are no longer pointed to by anything in the "struct
> > > drm_device".
> > >
> > > To make it concrete, I think we want this (feel free to correct). For
> > > simplicity, I'm assuming a driver that _doesn't_ use the component
> > > framework:
> > >
> > > a) Linux driver probe() happens. The "struct drm_device" is allocated
> > > in probe() by devm_drm_dev_alloc(). This takes a reference to the
> > > "struct drm_device". The device also acquires resources / allocates
> > > memory.
> >
> > You need to differentiate resources and allocations there. Resources can
> > be expected to go away at the same time than the device, so using devm
> > is fine. Allocations are largely disconnected from the device lifetime,
> > and using devm leads to UAF.
> 
> Right. I think my original point was looking at "ingenic-drm-drv.c".
> Much of the "devm" stuff there is resources and those specific things
> could be moved to probe() instead of bind(), right?

It depends. The registers, clock, regmap allocations are fine. The panel
isn't for example.

> For allocations, I think you'd have to look at each allocation. If the
> allocation needed to live as long as the "struct drm_device" then devm
> is clearly the wrong choice. ...but not every allocation needs to live
> that long.

Most of the allocations are in a KMS driver though? At least all the
structures that store either planes, crtcs, encoders, connectors, panels
or bridges (plus their state) need to be allocated through drmm.

> Also, even if in the "simple" case allocations need to live as long as
> a "struct drm_device", it's possible that there are some cases where
> there's only an indirect reference to the memory. In that case, you
> could NULL out the indirect reference and then free it. Obviously
> someone would need to take care here.

I guess we could, but it would be fairly hard to do since if we clear a
connector, we would need to clear that particular allocation, but also
from all the states that reference it, and the entities that store a
pointer to it somehow (some of them possibly in drivers). It's not super
valuable anyway since the current expectation is that it's all or
nothing, if you remove one connector you are expected to remove the
whole KMS driver.

> > > b) Userspace acquires a reference to the "struct drm_device". Refcount
> > > is now 2 (one from userspace, one from the Linux driver).
> > >
> > > c) The Linux driver unbinds, presumably because userspace requested
> > > it. From earlier I think we decided that we can't (by design) block
> > > unbind. Once unbind happens then we shouldn't try to keep operating
> > > the device
> >
> > That part is correct, because the resources aren't there anymore.
> >
> > > the driver should stop running.
> >
> > But for the reasons above, the driver needs to still operate (in a
> > degraded mode).
> 
> So I think here is where the disconnect is from our viewpoints. IMO
> when a Linux driver is unbound then it makes no sense to try to
> operate the device in "a degraded mode". When a Linux driver is
> unbound then it should be releasing all of the resources from the
> device (iomaps, IRQs, regulators, GPIOs, etc). That's just what
> unbinding a driver is supposed to do.

I guess we agree on that part.

> I understand what you're saying above about display pipelines being
> multiple Linux drivers working together and that it doesn't make lots
> of sense to just unbind a random Linux device driver in the middle of
> things.

That's not what I'm saying though. What I'm saying is that if we remove
one, everything must go. And that's what ingenic is doing btw. But all
the allocations still need to stay until the last fd is closed.

> ...and I don't really have a simple/great answer for how to do
> something super elegant if userspace tries to just randomly unbind one
> of the many drivers in an active display pipeline.

That's not a concern. I know vc4 handles that just fine, probably others
too.

> > > As part of the unbind, the remove() is called and also "devm"
> > > resources are deallocated. If any of the things freed are pointed to
> > > by the "struct drm_device" then the code needs to NULL them out at
> > > this time.
> >
> > Right, we also need to make sure we don't access any of the resources
> > that got freed. This is typically done by protecting all the accesses
> > with drm_dev_enter/drm_dev_exit.
> >
> > > Also we should make sure that any callback functions that userspace
> > > could cause to be invoked return errors.
> >
> > That would prevent any new ioctl from occuring after the device has been
> > removed, but that doesn't fix the race condition if it's removed while
> > there's a commit happening. This is further complicated by the fact that
> > commits can be queued (so you would have multiple submitted already) or
> > made asynchronous.
> 
> I guess I would have expected that the remove() callback in the device
> would prevent new commits from starting and then block waiting until
> any in-progress commits were finished? ...kinda like how drivers call
> del_timer_sync() in their remove functions...
> 
> 
> > > Our code could go away at any point here since userspace could "rmmod"
> > > our module.
> >
> > Yeah, we probably have a bug there. Boris also reported something like
> > that recently where if you add an action with drmm_add_action, and then
> > remove the module, the function would have been free'd by the time it
> > executes.
> 
> I'm fairly certain that you can prevent a module from being unloaded
> by just grabbing a refcount to it. However, I'm not sure that's the
> right solution. If we're trying to run driver code after a driver has
> been unbound then, IMO, that's the bug.

init, exit and probe run while the device is unbound.

> > > However, it's not a panacea for everything. Specifically once
> > > the Linux driver unbind finishes then the device isn't functional
> > > anymore.
> >
> > What's wrong with it then?
> 
> I'm mostly just saying don't just search-and-replace "devm" with
> "drmm" in your driver and call it done. You need to think carefully
> about which things are which lifetime.

Sure, where did I say anything different? For vc4, it took me ~60
patches to do the conversion, so yeah, it's not just a sed call.

> Ironically, while digging into this I'm tempted to take back my
> original request. Despite the kernel docs I pointed at [1], it
> actually looks like it might be fine to use "devm" within a
> component's bind() function. In try_to_bring_up_aggregate_device() it
> seems like the code is opening up a nested "devres" group specifically
> to allow this to work. A little bit of testing that I did with this
> shows that, indeed, the nesting seems to be working. Am I missing
> something here?

I don't think we're on the same page, because I also don't know why
that's relevant in that particular context?

Sure, you can use devm in a component framework driver. The limitations
I'm talking about have nothing to do with the component framework but
rather between devm and KMS. So all the issues I brought up are still
very much relevant for a single device doing devm_ allocations at probe.

Maxime
diff mbox series

Patch

diff --git a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
index 8dbd4847d3a6..51995a0cd568 100644
--- a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
+++ b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c
@@ -1130,7 +1130,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 
 	ret = drmm_mode_config_init(drm);
 	if (ret)
-		return ret;
+		goto err_drvdata;
 
 	drm->mode_config.min_width = 0;
 	drm->mode_config.min_height = 0;
@@ -1142,7 +1142,8 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 	base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
 	if (IS_ERR(base)) {
 		dev_err(dev, "Failed to get memory resource\n");
-		return PTR_ERR(base);
+		ret = PTR_ERR(base);
+		goto err_drvdata;
 	}
 
 	regmap_config = ingenic_drm_regmap_config;
@@ -1151,33 +1152,40 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 					  &regmap_config);
 	if (IS_ERR(priv->map)) {
 		dev_err(dev, "Failed to create regmap\n");
-		return PTR_ERR(priv->map);
+		ret = PTR_ERR(priv->map);
+		goto err_drvdata;
 	}
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return irq;
+	if (irq < 0) {
+		ret = irq;
+		goto err_drvdata;
+	}
 
 	if (soc_info->needs_dev_clk) {
 		priv->lcd_clk = devm_clk_get(dev, "lcd");
 		if (IS_ERR(priv->lcd_clk)) {
 			dev_err(dev, "Failed to get lcd clock\n");
-			return PTR_ERR(priv->lcd_clk);
+			ret = PTR_ERR(priv->lcd_clk);
+			goto err_drvdata;
 		}
 	}
 
 	priv->pix_clk = devm_clk_get(dev, "lcd_pclk");
 	if (IS_ERR(priv->pix_clk)) {
 		dev_err(dev, "Failed to get pixel clock\n");
-		return PTR_ERR(priv->pix_clk);
+		ret = PTR_ERR(priv->pix_clk);
+		goto err_drvdata;
 	}
 
 	priv->dma_hwdescs = dmam_alloc_coherent(dev,
 						sizeof(*priv->dma_hwdescs),
 						&priv->dma_hwdescs_phys,
 						GFP_KERNEL);
-	if (!priv->dma_hwdescs)
-		return -ENOMEM;
+	if (!priv->dma_hwdescs) {
+		ret = -ENOMEM;
+		goto err_drvdata;
+	}
 
 	/* Configure DMA hwdesc for foreground0 plane */
 	ingenic_drm_configure_hwdesc_plane(priv, 0);
@@ -1199,7 +1207,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 				       NULL, DRM_PLANE_TYPE_PRIMARY, NULL);
 	if (ret) {
 		dev_err(dev, "Failed to register plane: %i\n", ret);
-		return ret;
+		goto err_drvdata;
 	}
 
 	if (soc_info->map_noncoherent)
@@ -1211,7 +1219,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 					NULL, &ingenic_drm_crtc_funcs, NULL);
 	if (ret) {
 		dev_err(dev, "Failed to init CRTC: %i\n", ret);
-		return ret;
+		goto err_drvdata;
 	}
 
 	drm_crtc_enable_color_mgmt(&priv->crtc, 0, false,
@@ -1230,7 +1238,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 		if (ret) {
 			dev_err(dev, "Failed to register overlay plane: %i\n",
 				ret);
-			return ret;
+			goto err_drvdata;
 		}
 
 		if (soc_info->map_noncoherent)
@@ -1241,17 +1249,18 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 			if (ret) {
 				if (ret != -EPROBE_DEFER)
 					dev_err(dev, "Failed to bind components: %i\n", ret);
-				return ret;
+				goto err_drvdata;
 			}
 
 			ret = devm_add_action_or_reset(dev, ingenic_drm_unbind_all, priv);
 			if (ret)
-				return ret;
+				goto err_drvdata;
 
 			priv->ipu_plane = drm_plane_from_index(drm, 2);
 			if (!priv->ipu_plane) {
 				dev_err(dev, "Failed to retrieve IPU plane\n");
-				return -EINVAL;
+				ret = -EINVAL;
+				goto err_drvdata;
 			}
 		}
 	}
@@ -1263,7 +1272,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 				break; /* we're done */
 			if (ret != -EPROBE_DEFER)
 				dev_err(dev, "Failed to get bridge handle\n");
-			return ret;
+			goto err_drvdata;
 		}
 
 		if (panel)
@@ -1275,7 +1284,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 		if (IS_ERR(ib)) {
 			ret = PTR_ERR(ib);
 			dev_err(dev, "Failed to init encoder: %d\n", ret);
-			return ret;
+			goto err_drvdata;
 		}
 
 		encoder = &ib->encoder;
@@ -1290,13 +1299,14 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 					DRM_BRIDGE_ATTACH_NO_CONNECTOR);
 		if (ret) {
 			dev_err(dev, "Unable to attach bridge\n");
-			return ret;
+			goto err_drvdata;
 		}
 
 		connector = drm_bridge_connector_init(drm, encoder);
 		if (IS_ERR(connector)) {
 			dev_err(dev, "Unable to init connector\n");
-			return PTR_ERR(connector);
+			ret = PTR_ERR(connector);
+			goto err_drvdata;
 		}
 
 		drm_connector_attach_encoder(connector, encoder);
@@ -1313,13 +1323,13 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 	ret = devm_request_irq(dev, irq, ingenic_drm_irq_handler, 0, drm->driver->name, drm);
 	if (ret) {
 		dev_err(dev, "Unable to install IRQ handler\n");
-		return ret;
+		goto err_drvdata;
 	}
 
 	ret = drm_vblank_init(drm, 1);
 	if (ret) {
 		dev_err(dev, "Failed calling drm_vblank_init()\n");
-		return ret;
+		goto err_drvdata;
 	}
 
 	drm_mode_config_reset(drm);
@@ -1327,7 +1337,7 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 	ret = clk_prepare_enable(priv->pix_clk);
 	if (ret) {
 		dev_err(dev, "Unable to start pixel clock\n");
-		return ret;
+		goto err_drvdata;
 	}
 
 	if (priv->lcd_clk) {
@@ -1402,6 +1412,8 @@  static int ingenic_drm_bind(struct device *dev, bool has_components)
 		clk_disable_unprepare(priv->lcd_clk);
 err_pixclk_disable:
 	clk_disable_unprepare(priv->pix_clk);
+err_drvdata:
+	platform_set_drvdata(pdev, NULL);
 	return ret;
 }
 
@@ -1422,6 +1434,7 @@  static void ingenic_drm_unbind(struct device *dev)
 
 	drm_dev_unregister(&priv->drm);
 	drm_atomic_helper_shutdown(&priv->drm);
+	dev_set_drvdata(dev, NULL);
 }
 
 static const struct component_master_ops ingenic_master_ops = {
@@ -1461,6 +1474,14 @@  static int ingenic_drm_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static void ingenic_drm_shutdown(struct platform_device *pdev)
+{
+	struct ingenic_drm *priv = platform_get_drvdata(pdev);
+
+	if (priv)
+		drm_atomic_helper_shutdown(&priv->drm);
+}
+
 static int ingenic_drm_suspend(struct device *dev)
 {
 	struct ingenic_drm *priv = dev_get_drvdata(dev);
@@ -1612,6 +1633,7 @@  static struct platform_driver ingenic_drm_driver = {
 	},
 	.probe = ingenic_drm_probe,
 	.remove = ingenic_drm_remove,
+	.shutdown = ingenic_drm_shutdown,
 };
 
 static int ingenic_drm_init(void)