diff mbox series

[v7,5/9] drm/fb_dma: Add generic get_scanout_buffer() for drm_panic

Message ID 20240104160301.185915-6-jfalempe@redhat.com (mailing list archive)
State New, archived
Headers show
Series drm/panic: Add a drm panic handler | expand

Commit Message

Jocelyn Falempe Jan. 4, 2024, 4 p.m. UTC
This was initialy done for imx6, but should work on most drivers
using drm_fb_dma_helper.

Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
---
 drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
 include/drm/drm_fb_dma_helper.h     |  4 +++
 2 files changed, 59 insertions(+)

Comments

kernel test robot Jan. 6, 2024, 2:54 a.m. UTC | #1
Hi Jocelyn,

kernel test robot noticed the following build errors:

[auto build test ERROR on 50a3c772bd927dd409c484832ddd9f6bf00b7389]

url:    https://github.com/intel-lab-lkp/linux/commits/Jocelyn-Falempe/drm-format-helper-Add-drm_fb_blit_from_r1-and-drm_fb_fill/20240105-001038
base:   50a3c772bd927dd409c484832ddd9f6bf00b7389
patch link:    https://lore.kernel.org/r/20240104160301.185915-6-jfalempe%40redhat.com
patch subject: [PATCH v7 5/9] drm/fb_dma: Add generic get_scanout_buffer() for drm_panic
config: mips-gcw0_defconfig (https://download.01.org/0day-ci/archive/20240106/202401061003.lNwFS0D6-lkp@intel.com/config)
compiler: mipsel-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240106/202401061003.lNwFS0D6-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401061003.lNwFS0D6-lkp@intel.com/

All errors (new ones prefixed by >>):

   mipsel-linux-ld: drivers/gpu/drm/drm_fb_dma_helper.o: in function `drm_panic_is_format_supported':
>> drm_fb_dma_helper.c:(.text+0x2c8): multiple definition of `drm_panic_is_format_supported'; drivers/gpu/drm/drm_drv.o:drm_drv.c:(.text+0x1104): first defined here
Maxime Ripard Jan. 8, 2024, 10:20 a.m. UTC | #2
Hi,

On Thu, Jan 04, 2024 at 05:00:49PM +0100, Jocelyn Falempe wrote:
> This was initialy done for imx6, but should work on most drivers
> using drm_fb_dma_helper.
> 
> Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
> ---
>  drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
>  include/drm/drm_fb_dma_helper.h     |  4 +++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
> index 3b535ad1b07c..caed2935df4f 100644
> --- a/drivers/gpu/drm/drm_fb_dma_helper.c
> +++ b/drivers/gpu/drm/drm_fb_dma_helper.c
> @@ -15,6 +15,7 @@
>  #include <drm/drm_framebuffer.h>
>  #include <drm/drm_gem_dma_helper.h>
>  #include <drm/drm_gem_framebuffer_helper.h>
> +#include <drm/drm_panic.h>
>  #include <drm/drm_plane.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/module.h>
> @@ -148,3 +149,57 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
>  	}
>  }
>  EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
> +
> +#if defined(CONFIG_DRM_PANIC)
> +/**
> + * @dev: DRM device
> + * @drm_scanout_buffer: scanout buffer for the panic handler
> + * Returns: 0 or negative error code
> + *
> + * Generic get_scanout_buffer() implementation, for drivers that uses the
> + * drm_fb_dma_helper.
> + */
> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> +				     struct drm_scanout_buffer *sb)
> +{
> +	struct drm_plane *plane;
> +	struct drm_gem_dma_object *dma_obj;
> +	struct drm_framebuffer *fb;
> +
> +	drm_for_each_primary_visible_plane(plane, dev) {
> +		fb = plane->state->fb;
> +		/* Only support linear modifier */
> +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
> +			continue;
> +
> +		/* Check if color format is supported */
> +		if (!drm_panic_is_format_supported(fb->format->format))
> +			continue;
> +
> +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
> +
> +		/* Buffer should be accessible from the CPU */
> +		if (dma_obj->base.import_attach)
> +			continue;
> +
> +		/* Buffer should be already mapped to CPU */
> +		if (!dma_obj->vaddr)
> +			continue;
> +
> +		iosys_map_set_vaddr(&sb->map, dma_obj->vaddr);
> +		sb->format = fb->format;
> +		sb->height = fb->height;
> +		sb->width = fb->width;
> +		sb->pitch = fb->pitches[0];
> +		return 0;
> +	}
> +	return -ENODEV;
> +}
> +#else
> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> +				     struct drm_scanout_buffer *sb)
> +{
> +	return 0;
> +}
> +#endif
> +EXPORT_SYMBOL(drm_panic_gem_get_scanout_buffer);

Looks much better, thanks :)

I think we should be more vocal about the failure cases too. Maybe log
it through warn/pr_crit or whatever so that at least we have an idea
what went wrong in a post mortem.

Maxime
Jocelyn Falempe Jan. 12, 2024, 12:29 p.m. UTC | #3
On 08/01/2024 11:20, Maxime Ripard wrote:
> Hi,
> 
> On Thu, Jan 04, 2024 at 05:00:49PM +0100, Jocelyn Falempe wrote:
>> This was initialy done for imx6, but should work on most drivers
>> using drm_fb_dma_helper.
>>
>> Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
>> ---
>>   drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
>>   include/drm/drm_fb_dma_helper.h     |  4 +++
>>   2 files changed, 59 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
>> index 3b535ad1b07c..caed2935df4f 100644
>> --- a/drivers/gpu/drm/drm_fb_dma_helper.c
>> +++ b/drivers/gpu/drm/drm_fb_dma_helper.c
>> @@ -15,6 +15,7 @@
>>   #include <drm/drm_framebuffer.h>
>>   #include <drm/drm_gem_dma_helper.h>
>>   #include <drm/drm_gem_framebuffer_helper.h>
>> +#include <drm/drm_panic.h>
>>   #include <drm/drm_plane.h>
>>   #include <linux/dma-mapping.h>
>>   #include <linux/module.h>
>> @@ -148,3 +149,57 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
>>   	}
>>   }
>>   EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
>> +
>> +#if defined(CONFIG_DRM_PANIC)
>> +/**
>> + * @dev: DRM device
>> + * @drm_scanout_buffer: scanout buffer for the panic handler
>> + * Returns: 0 or negative error code
>> + *
>> + * Generic get_scanout_buffer() implementation, for drivers that uses the
>> + * drm_fb_dma_helper.
>> + */
>> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
>> +				     struct drm_scanout_buffer *sb)
>> +{
>> +	struct drm_plane *plane;
>> +	struct drm_gem_dma_object *dma_obj;
>> +	struct drm_framebuffer *fb;
>> +
>> +	drm_for_each_primary_visible_plane(plane, dev) {
>> +		fb = plane->state->fb;
>> +		/* Only support linear modifier */
>> +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
>> +			continue;
>> +
>> +		/* Check if color format is supported */
>> +		if (!drm_panic_is_format_supported(fb->format->format))
>> +			continue;
>> +
>> +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
>> +
>> +		/* Buffer should be accessible from the CPU */
>> +		if (dma_obj->base.import_attach)
>> +			continue;
>> +
>> +		/* Buffer should be already mapped to CPU */
>> +		if (!dma_obj->vaddr)
>> +			continue;
>> +
>> +		iosys_map_set_vaddr(&sb->map, dma_obj->vaddr);
>> +		sb->format = fb->format;
>> +		sb->height = fb->height;
>> +		sb->width = fb->width;
>> +		sb->pitch = fb->pitches[0];
>> +		return 0;
>> +	}
>> +	return -ENODEV;
>> +}
>> +#else
>> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
>> +				     struct drm_scanout_buffer *sb)
>> +{
>> +	return 0;
>> +}
>> +#endif
>> +EXPORT_SYMBOL(drm_panic_gem_get_scanout_buffer);
> 
> Looks much better, thanks :)
> 
> I think we should be more vocal about the failure cases too. Maybe log
> it through warn/pr_crit or whatever so that at least we have an idea
> what went wrong in a post mortem.

Thanks for the review.
Yes I can add an error message when it fails to find a scanout buffer.
> 
> Maxime
Daniel Vetter Jan. 12, 2024, 1:41 p.m. UTC | #4
On Thu, Jan 04, 2024 at 05:00:49PM +0100, Jocelyn Falempe wrote:
> This was initialy done for imx6, but should work on most drivers
> using drm_fb_dma_helper.
> 
> Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
> ---
>  drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
>  include/drm/drm_fb_dma_helper.h     |  4 +++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
> index 3b535ad1b07c..caed2935df4f 100644
> --- a/drivers/gpu/drm/drm_fb_dma_helper.c
> +++ b/drivers/gpu/drm/drm_fb_dma_helper.c
> @@ -15,6 +15,7 @@
>  #include <drm/drm_framebuffer.h>
>  #include <drm/drm_gem_dma_helper.h>
>  #include <drm/drm_gem_framebuffer_helper.h>
> +#include <drm/drm_panic.h>
>  #include <drm/drm_plane.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/module.h>
> @@ -148,3 +149,57 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
>  	}
>  }
>  EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
> +
> +#if defined(CONFIG_DRM_PANIC)
> +/**
> + * @dev: DRM device
> + * @drm_scanout_buffer: scanout buffer for the panic handler
> + * Returns: 0 or negative error code
> + *
> + * Generic get_scanout_buffer() implementation, for drivers that uses the
> + * drm_fb_dma_helper.
> + */
> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> +				     struct drm_scanout_buffer *sb)
> +{
> +	struct drm_plane *plane;
> +	struct drm_gem_dma_object *dma_obj;
> +	struct drm_framebuffer *fb;
> +
> +	drm_for_each_primary_visible_plane(plane, dev) {

Ok that's not enough locking by far. You can't just hope that nothing
disappears while you're in a panic handler. We've been there and ended up
reliably oopsing in the panic handler itself. So you _have_ to follow the
full set of locking rules for all drm structures, or things will just get
worse at the worst possible moment.

But also, you're not allowed to do anything else than trylock, because a
panic handler might run from nmi context, and so you cannot even acquire
irq-safe spinlocks reliably.

Which means:

- You need to be safe against concurrent drm_dev_unregister. Using the
  atomic panic notifier directly for each device should take care of that
  (but maybe that stuff is still not nmi safe, not sure).

- You _have_ to use all the locks. Luckily iterating over the plane list
  doesn't need one, but you have to trylock the plane's modeset lock.
  Which means your nice iterator macro is already toast, because that
  already looks at state it's not allowed to look at without a lock. Or
  well, the plane->state pointer is no-go already.

Given the locking issues I'm not sure whether the
drm_for_each_primary_visible_plane iterator is going to work, you'd need
something like iter_init/next/end we have for walking the connector list.
Plus it would be very panic specific due to the trylock, so maybe

drm_for_each_visible_plane_in_panic_handler()

or something like that.

One thing I was wondering is whether we should lift this iteration over
all planes into the shared code, and move the ->get_scanout_buffer
function to the drm_plane_funcs structure instead?

> +		fb = plane->state->fb;
> +		/* Only support linear modifier */
> +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
> +			continue;
> +
> +		/* Check if color format is supported */
> +		if (!drm_panic_is_format_supported(fb->format->format))
> +			continue;
> +
> +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
> +
> +		/* Buffer should be accessible from the CPU */
> +		if (dma_obj->base.import_attach)

This might be a bit too restrictive, since some drivers import dma-buf
including a vmap. So just checking for ->vaddr might be better. But can be
changed later on.

> +			continue;
> +
> +		/* Buffer should be already mapped to CPU */

I'd clarify this comment to state that vaddr is invariant over the
lifetime of the buffer and therefore needs no locking. Correct locking
that a) takes all the locks b) never ever stalls for one is absolutely
crucial for a panic handler that won't make the situation worse.

> +		if (!dma_obj->vaddr)


> +			continue;
> +
> +		iosys_map_set_vaddr(&sb->map, dma_obj->vaddr);
> +		sb->format = fb->format;
> +		sb->height = fb->height;
> +		sb->width = fb->width;
> +		sb->pitch = fb->pitches[0];
> +		return 0;

Otherwise this lgtm.
-Sima

> +	}
> +	return -ENODEV;
> +}
> +#else
> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> +				     struct drm_scanout_buffer *sb)
> +{
> +	return 0;
> +}
> +#endif
> +EXPORT_SYMBOL(drm_panic_gem_get_scanout_buffer);
> diff --git a/include/drm/drm_fb_dma_helper.h b/include/drm/drm_fb_dma_helper.h
> index d5e036c57801..2ae432865079 100644
> --- a/include/drm/drm_fb_dma_helper.h
> +++ b/include/drm/drm_fb_dma_helper.h
> @@ -7,6 +7,7 @@
>  struct drm_device;
>  struct drm_framebuffer;
>  struct drm_plane_state;
> +struct drm_scanout_buffer;
>  
>  struct drm_gem_dma_object *drm_fb_dma_get_gem_obj(struct drm_framebuffer *fb,
>  	unsigned int plane);
> @@ -19,5 +20,8 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
>  				  struct drm_plane_state *old_state,
>  				  struct drm_plane_state *state);
>  
> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> +				     struct drm_scanout_buffer *sb);
> +
>  #endif
>  
> -- 
> 2.43.0
>
Maxime Ripard Jan. 12, 2024, 1:56 p.m. UTC | #5
On Fri, Jan 12, 2024 at 02:41:53PM +0100, Daniel Vetter wrote:
> > +		fb = plane->state->fb;
> > +		/* Only support linear modifier */
> > +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
> > +			continue;
> > +
> > +		/* Check if color format is supported */
> > +		if (!drm_panic_is_format_supported(fb->format->format))
> > +			continue;
> > +
> > +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
> > +
> > +		/* Buffer should be accessible from the CPU */
> > +		if (dma_obj->base.import_attach)
> 
> This might be a bit too restrictive, since some drivers import dma-buf
> including a vmap. So just checking for ->vaddr might be better. But can be
> changed later on.
> 
> > +			continue;
> > +
> > +		/* Buffer should be already mapped to CPU */
> 
> I'd clarify this comment to state that vaddr is invariant over the
> lifetime of the buffer and therefore needs no locking. Correct locking
> that a) takes all the locks b) never ever stalls for one is absolutely
> crucial for a panic handler that won't make the situation worse.

I think this comment was made to address buffers that are accessible to
the CPU but don't have a CPU mapping (ie, created with
DMA_ATTR_NO_KERNEL_MAPPING for example).

Maxime
Jocelyn Falempe Jan. 17, 2024, 2:28 p.m. UTC | #6
On 12/01/2024 14:41, Daniel Vetter wrote:
> On Thu, Jan 04, 2024 at 05:00:49PM +0100, Jocelyn Falempe wrote:
>> This was initialy done for imx6, but should work on most drivers
>> using drm_fb_dma_helper.
>>
>> Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
>> ---
>>   drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
>>   include/drm/drm_fb_dma_helper.h     |  4 +++
>>   2 files changed, 59 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
>> index 3b535ad1b07c..caed2935df4f 100644
>> --- a/drivers/gpu/drm/drm_fb_dma_helper.c
>> +++ b/drivers/gpu/drm/drm_fb_dma_helper.c
>> @@ -15,6 +15,7 @@
>>   #include <drm/drm_framebuffer.h>
>>   #include <drm/drm_gem_dma_helper.h>
>>   #include <drm/drm_gem_framebuffer_helper.h>
>> +#include <drm/drm_panic.h>
>>   #include <drm/drm_plane.h>
>>   #include <linux/dma-mapping.h>
>>   #include <linux/module.h>
>> @@ -148,3 +149,57 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
>>   	}
>>   }
>>   EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
>> +
>> +#if defined(CONFIG_DRM_PANIC)
>> +/**
>> + * @dev: DRM device
>> + * @drm_scanout_buffer: scanout buffer for the panic handler
>> + * Returns: 0 or negative error code
>> + *
>> + * Generic get_scanout_buffer() implementation, for drivers that uses the
>> + * drm_fb_dma_helper.
>> + */
>> +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
>> +				     struct drm_scanout_buffer *sb)
>> +{
>> +	struct drm_plane *plane;
>> +	struct drm_gem_dma_object *dma_obj;
>> +	struct drm_framebuffer *fb;
>> +
>> +	drm_for_each_primary_visible_plane(plane, dev) {
> 
> Ok that's not enough locking by far. You can't just hope that nothing
> disappears while you're in a panic handler. We've been there and ended up
> reliably oopsing in the panic handler itself. So you _have_ to follow the
> full set of locking rules for all drm structures, or things will just get
> worse at the worst possible moment.
> 
> But also, you're not allowed to do anything else than trylock, because a
> panic handler might run from nmi context, and so you cannot even acquire
> irq-safe spinlocks reliably.
> 
> Which means:
> 
> - You need to be safe against concurrent drm_dev_unregister. Using the
>    atomic panic notifier directly for each device should take care of that
>    (but maybe that stuff is still not nmi safe, not sure).
> 
> - You _have_ to use all the locks. Luckily iterating over the plane list
>    doesn't need one, but you have to trylock the plane's modeset lock.
>    Which means your nice iterator macro is already toast, because that
>    already looks at state it's not allowed to look at without a lock. Or
>    well, the plane->state pointer is no-go already.

mutex_trylock() shouldn't be called from interrupt context, and as the 
panic may occurs in irq, I can't use that.

But the panic context should guarantee that only one CPU is still running:
https://elixir.bootlin.com/linux/latest/source/kernel/panic.c#L310

So I think using mutex_is_locked() should be safe: 
https://elixir.bootlin.com/linux/latest/source/include/linux/mutex.h#L128

This will only check if the lock is not taken, but as it's not possible 
for another task to run at the same time, I think that should be good 
enough ?

The drawback, is if we want to test without crashing the kernel, then we 
need to take the locks with trylock(), (and it's safe this time), but 
the code path would be slightly different.
Daniel Vetter Jan. 18, 2024, 1:38 p.m. UTC | #7
On Fri, Jan 12, 2024 at 02:56:17PM +0100, Maxime Ripard wrote:
> On Fri, Jan 12, 2024 at 02:41:53PM +0100, Daniel Vetter wrote:
> > > +		fb = plane->state->fb;
> > > +		/* Only support linear modifier */
> > > +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
> > > +			continue;
> > > +
> > > +		/* Check if color format is supported */
> > > +		if (!drm_panic_is_format_supported(fb->format->format))
> > > +			continue;
> > > +
> > > +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
> > > +
> > > +		/* Buffer should be accessible from the CPU */
> > > +		if (dma_obj->base.import_attach)
> > 
> > This might be a bit too restrictive, since some drivers import dma-buf
> > including a vmap. So just checking for ->vaddr might be better. But can be
> > changed later on.
> > 
> > > +			continue;
> > > +
> > > +		/* Buffer should be already mapped to CPU */
> > 
> > I'd clarify this comment to state that vaddr is invariant over the
> > lifetime of the buffer and therefore needs no locking. Correct locking
> > that a) takes all the locks b) never ever stalls for one is absolutely
> > crucial for a panic handler that won't make the situation worse.
> 
> I think this comment was made to address buffers that are accessible to
> the CPU but don't have a CPU mapping (ie, created with
> DMA_ATTR_NO_KERNEL_MAPPING for example).

But then the NULL value of vaddr would also be invariant ...

My emphasis is more that we need to be really careful with all the locking
rules here in the panic handler and not just assume it's going to be safe.
I've stitched some tricky design together, but need to still move it from
the whiteboard to a patch.
-Sima
Daniel Vetter Jan. 18, 2024, 1:51 p.m. UTC | #8
On Wed, Jan 17, 2024 at 03:28:09PM +0100, Jocelyn Falempe wrote:
> 
> 
> On 12/01/2024 14:41, Daniel Vetter wrote:
> > On Thu, Jan 04, 2024 at 05:00:49PM +0100, Jocelyn Falempe wrote:
> > > This was initialy done for imx6, but should work on most drivers
> > > using drm_fb_dma_helper.
> > > 
> > > Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
> > > ---
> > >   drivers/gpu/drm/drm_fb_dma_helper.c | 55 +++++++++++++++++++++++++++++
> > >   include/drm/drm_fb_dma_helper.h     |  4 +++
> > >   2 files changed, 59 insertions(+)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
> > > index 3b535ad1b07c..caed2935df4f 100644
> > > --- a/drivers/gpu/drm/drm_fb_dma_helper.c
> > > +++ b/drivers/gpu/drm/drm_fb_dma_helper.c
> > > @@ -15,6 +15,7 @@
> > >   #include <drm/drm_framebuffer.h>
> > >   #include <drm/drm_gem_dma_helper.h>
> > >   #include <drm/drm_gem_framebuffer_helper.h>
> > > +#include <drm/drm_panic.h>
> > >   #include <drm/drm_plane.h>
> > >   #include <linux/dma-mapping.h>
> > >   #include <linux/module.h>
> > > @@ -148,3 +149,57 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
> > >   	}
> > >   }
> > >   EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
> > > +
> > > +#if defined(CONFIG_DRM_PANIC)
> > > +/**
> > > + * @dev: DRM device
> > > + * @drm_scanout_buffer: scanout buffer for the panic handler
> > > + * Returns: 0 or negative error code
> > > + *
> > > + * Generic get_scanout_buffer() implementation, for drivers that uses the
> > > + * drm_fb_dma_helper.
> > > + */
> > > +int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
> > > +				     struct drm_scanout_buffer *sb)
> > > +{
> > > +	struct drm_plane *plane;
> > > +	struct drm_gem_dma_object *dma_obj;
> > > +	struct drm_framebuffer *fb;
> > > +
> > > +	drm_for_each_primary_visible_plane(plane, dev) {
> > 
> > Ok that's not enough locking by far. You can't just hope that nothing
> > disappears while you're in a panic handler. We've been there and ended up
> > reliably oopsing in the panic handler itself. So you _have_ to follow the
> > full set of locking rules for all drm structures, or things will just get
> > worse at the worst possible moment.
> > 
> > But also, you're not allowed to do anything else than trylock, because a
> > panic handler might run from nmi context, and so you cannot even acquire
> > irq-safe spinlocks reliably.
> > 
> > Which means:
> > 
> > - You need to be safe against concurrent drm_dev_unregister. Using the
> >    atomic panic notifier directly for each device should take care of that
> >    (but maybe that stuff is still not nmi safe, not sure).
> > 
> > - You _have_ to use all the locks. Luckily iterating over the plane list
> >    doesn't need one, but you have to trylock the plane's modeset lock.
> >    Which means your nice iterator macro is already toast, because that
> >    already looks at state it's not allowed to look at without a lock. Or
> >    well, the plane->state pointer is no-go already.
> 
> mutex_trylock() shouldn't be called from interrupt context, and as the panic
> may occurs in irq, I can't use that.

Yeah I learned that later that day too, disappointing :-/

What's worse, spin_trylock is also no-go, because on r-t kernel they're
essentially sleeping locks. And so spin_trylock can retry in a loop, and
if we interrupt another thread at /just/ the right moment where the
spinlock is in an inconsistent state, the spin_trylock in the panic
handler will loop forever.

> But the panic context should guarantee that only one CPU is still running:
> https://elixir.bootlin.com/linux/latest/source/kernel/panic.c#L310

Uh no, panics can nest so you might panic while a panic is going on and
interrupt it in the middle. So just making sure no other code is running
isn't enough unfortunately.

Also the ipi stuff that should ensure the other cpu is stopped might not
work, afaiui in nmi panic context you cannot send around ipi.

> So I think using mutex_is_locked() should be safe:
> https://elixir.bootlin.com/linux/latest/source/include/linux/mutex.h#L128
> 
> This will only check if the lock is not taken, but as it's not possible for
> another task to run at the same time, I think that should be good enough ?
> 
> The drawback, is if we want to test without crashing the kernel, then we
> need to take the locks with trylock(), (and it's safe this time), but the
> code path would be slightly different.

So I think this predates some of irc chat, and I haven't typed up what I
designed together with John Ogness. Haven't done the patch yet, but rought
sketch:

- we use a raw_spin_lock. That's the _only_ lock type where trylock is ok

- we protect the drm_plane->state update with that lock. That's an
  extremely tiny section of code, so should be fine.

- For just that pointer update we could also use rcu, but using a spinlock
  has the advantage that drivers with peek/poke mmio support (for
  accessing unmapping vram or similar) could wrap these mmio access with
  that spinlock, and so would be able to use peek/poke in their panic
  handler. As soon as we need to protect against mmio we really need a
  lock.

- Because the lock also provides a barrier there's a lot of things you can
  safely assume, and so don't need additional locking for that in the
  panic handler:

  - Anything that's invariant over the lifetime of a drm_device (as
    delineated by drm_dev_register/unregister) is safe to access, like the
    plane list or plane structure

  - Anything in drm_plane_state that's only computed in atomic_check and
    not touched in atomic_commit code anymore is safe to access. This
    means we can get at the bo and everything else without any locks, and
    anything that is invariant for the lifetime of the bo like vaddr for
    dma-helper bo is also fine.

  - Furthermore because ->prepare_fb is called _before_ this point, and we
    stall at the right places before we call ->cleanup_fb. Unfortunately
    the same isn't true for ->begin/end_fb_access (by design), so we
    cannot rely on everything, but we can rely on the fact that the
    framebuffer is correctly pinned for drivers using dynamic buffer
    managers. The kernel vmap might not be around though, I need to think
    more how we best solve that issue. But that's a problem for when we
    add panic support to the first such driver.

I think with this design we'd cover 95% of all cases by simply doing the
raw_spin_trylock before we call into the driver's callback.

Cheers, Sima
Maxime Ripard Jan. 26, 2024, 12:39 p.m. UTC | #9
On Thu, Jan 18, 2024 at 02:38:53PM +0100, Daniel Vetter wrote:
> On Fri, Jan 12, 2024 at 02:56:17PM +0100, Maxime Ripard wrote:
> > On Fri, Jan 12, 2024 at 02:41:53PM +0100, Daniel Vetter wrote:
> > > > +		fb = plane->state->fb;
> > > > +		/* Only support linear modifier */
> > > > +		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
> > > > +			continue;
> > > > +
> > > > +		/* Check if color format is supported */
> > > > +		if (!drm_panic_is_format_supported(fb->format->format))
> > > > +			continue;
> > > > +
> > > > +		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
> > > > +
> > > > +		/* Buffer should be accessible from the CPU */
> > > > +		if (dma_obj->base.import_attach)
> > > 
> > > This might be a bit too restrictive, since some drivers import dma-buf
> > > including a vmap. So just checking for ->vaddr might be better. But can be
> > > changed later on.
> > > 
> > > > +			continue;
> > > > +
> > > > +		/* Buffer should be already mapped to CPU */
> > > 
> > > I'd clarify this comment to state that vaddr is invariant over the
> > > lifetime of the buffer and therefore needs no locking. Correct locking
> > > that a) takes all the locks b) never ever stalls for one is absolutely
> > > crucial for a panic handler that won't make the situation worse.
> > 
> > I think this comment was made to address buffers that are accessible to
> > the CPU but don't have a CPU mapping (ie, created with
> > DMA_ATTR_NO_KERNEL_MAPPING for example).
> 
> But then the NULL value of vaddr would also be invariant ...
> 
> My emphasis is more that we need to be really careful with all the locking
> rules here in the panic handler and not just assume it's going to be safe.
> I've stitched some tricky design together, but need to still move it from
> the whiteboard to a patch.

Oh, sorry.

I misinterpreted what you were going for with that review. Sorry for the noise.

Maxime
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
index 3b535ad1b07c..caed2935df4f 100644
--- a/drivers/gpu/drm/drm_fb_dma_helper.c
+++ b/drivers/gpu/drm/drm_fb_dma_helper.c
@@ -15,6 +15,7 @@ 
 #include <drm/drm_framebuffer.h>
 #include <drm/drm_gem_dma_helper.h>
 #include <drm/drm_gem_framebuffer_helper.h>
+#include <drm/drm_panic.h>
 #include <drm/drm_plane.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
@@ -148,3 +149,57 @@  void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
 	}
 }
 EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
+
+#if defined(CONFIG_DRM_PANIC)
+/**
+ * @dev: DRM device
+ * @drm_scanout_buffer: scanout buffer for the panic handler
+ * Returns: 0 or negative error code
+ *
+ * Generic get_scanout_buffer() implementation, for drivers that uses the
+ * drm_fb_dma_helper.
+ */
+int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
+				     struct drm_scanout_buffer *sb)
+{
+	struct drm_plane *plane;
+	struct drm_gem_dma_object *dma_obj;
+	struct drm_framebuffer *fb;
+
+	drm_for_each_primary_visible_plane(plane, dev) {
+		fb = plane->state->fb;
+		/* Only support linear modifier */
+		if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
+			continue;
+
+		/* Check if color format is supported */
+		if (!drm_panic_is_format_supported(fb->format->format))
+			continue;
+
+		dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
+
+		/* Buffer should be accessible from the CPU */
+		if (dma_obj->base.import_attach)
+			continue;
+
+		/* Buffer should be already mapped to CPU */
+		if (!dma_obj->vaddr)
+			continue;
+
+		iosys_map_set_vaddr(&sb->map, dma_obj->vaddr);
+		sb->format = fb->format;
+		sb->height = fb->height;
+		sb->width = fb->width;
+		sb->pitch = fb->pitches[0];
+		return 0;
+	}
+	return -ENODEV;
+}
+#else
+int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
+				     struct drm_scanout_buffer *sb)
+{
+	return 0;
+}
+#endif
+EXPORT_SYMBOL(drm_panic_gem_get_scanout_buffer);
diff --git a/include/drm/drm_fb_dma_helper.h b/include/drm/drm_fb_dma_helper.h
index d5e036c57801..2ae432865079 100644
--- a/include/drm/drm_fb_dma_helper.h
+++ b/include/drm/drm_fb_dma_helper.h
@@ -7,6 +7,7 @@ 
 struct drm_device;
 struct drm_framebuffer;
 struct drm_plane_state;
+struct drm_scanout_buffer;
 
 struct drm_gem_dma_object *drm_fb_dma_get_gem_obj(struct drm_framebuffer *fb,
 	unsigned int plane);
@@ -19,5 +20,8 @@  void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
 				  struct drm_plane_state *old_state,
 				  struct drm_plane_state *state);
 
+int drm_panic_gem_get_scanout_buffer(struct drm_device *dev,
+				     struct drm_scanout_buffer *sb);
+
 #endif