diff mbox series

[4/7] drm/ttm: move LRU walk defines into new internal header

Message ID 20240710124301.1628-5-christian.koenig@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/7] dma-buf/dma-resv: Introduce dma_resv_trylock_ctx() | expand

Commit Message

Christian König July 10, 2024, 12:42 p.m. UTC
That is something drivers really shouldn't mess with.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
 drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
 drivers/gpu/drm/ttm/ttm_bo_util.h | 67 +++++++++++++++++++++++++++++++
 include/drm/ttm/ttm_bo.h          | 35 ----------------
 4 files changed, 70 insertions(+), 35 deletions(-)
 create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h

Comments

Matthew Brost July 10, 2024, 6:19 p.m. UTC | #1
On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König wrote:
> That is something drivers really shouldn't mess with.
> 

Thomas uses this in Xe to implement a shrinker [1]. Seems to need to
remain available for drivers.

Matt

[1] https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6

> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
>  drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
>  drivers/gpu/drm/ttm/ttm_bo_util.h | 67 +++++++++++++++++++++++++++++++
>  include/drm/ttm/ttm_bo.h          | 35 ----------------
>  4 files changed, 70 insertions(+), 35 deletions(-)
>  create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
> 
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 0131ec802066..41bee8696e69 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -45,6 +45,7 @@
>  #include <linux/dma-resv.h>
>  
>  #include "ttm_module.h"
> +#include "ttm_bo_util.h"
>  
>  static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
>  					struct ttm_placement *placement)
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index 3c07f4712d5c..03e28e3d0d03 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -37,6 +37,8 @@
>  
>  #include <drm/drm_cache.h>
>  
> +#include "ttm_bo_util.h"
> +
>  struct ttm_transfer_obj {
>  	struct ttm_buffer_object base;
>  	struct ttm_buffer_object *bo;
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h b/drivers/gpu/drm/ttm/ttm_bo_util.h
> new file mode 100644
> index 000000000000..c19b50809208
> --- /dev/null
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
> @@ -0,0 +1,67 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/**************************************************************************
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + **************************************************************************/
> +#ifndef _TTM_BO_UTIL_H_
> +#define _TTM_BO_UTIL_H_
> +
> +struct ww_acquire_ctx;
> +
> +struct ttm_buffer_object;
> +struct ttm_operation_ctx;
> +struct ttm_lru_walk;
> +
> +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> +struct ttm_lru_walk_ops {
> +	/**
> +	 * process_bo - Process this bo.
> +	 * @walk: struct ttm_lru_walk describing the walk.
> +	 * @bo: A locked and referenced buffer object.
> +	 *
> +	 * Return: Negative error code on error, User-defined positive value
> +	 * (typically, but not always, size of the processed bo) on success.
> +	 * On success, the returned values are summed by the walk and the
> +	 * walk exits when its target is met.
> +	 * 0 also indicates success, -EBUSY means this bo was skipped.
> +	 */
> +	s64 (*process_bo)(struct ttm_lru_walk *walk,
> +			  struct ttm_buffer_object *bo);
> +};
> +
> +/**
> + * struct ttm_lru_walk - Structure describing a LRU walk.
> + */
> +struct ttm_lru_walk {
> +	/** @ops: Pointer to the ops structure. */
> +	const struct ttm_lru_walk_ops *ops;
> +	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> +	struct ttm_operation_ctx *ctx;
> +	/** @ticket: The struct ww_acquire_ctx if any. */
> +	struct ww_acquire_ctx *ticket;
> +	/** @tryock_only: Only use trylock for locking. */
> +	bool trylock_only;
> +};
> +
> +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
> +			   struct ttm_resource_manager *man, s64 target);
> +
> +#endif
> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> index d1a732d56259..5f7c967222a2 100644
> --- a/include/drm/ttm/ttm_bo.h
> +++ b/include/drm/ttm/ttm_bo.h
> @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
>  	uint64_t bytes_moved;
>  };
>  
> -struct ttm_lru_walk;
> -
> -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> -struct ttm_lru_walk_ops {
> -	/**
> -	 * process_bo - Process this bo.
> -	 * @walk: struct ttm_lru_walk describing the walk.
> -	 * @bo: A locked and referenced buffer object.
> -	 *
> -	 * Return: Negative error code on error, User-defined positive value
> -	 * (typically, but not always, size of the processed bo) on success.
> -	 * On success, the returned values are summed by the walk and the
> -	 * walk exits when its target is met.
> -	 * 0 also indicates success, -EBUSY means this bo was skipped.
> -	 */
> -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo);
> -};
> -
> -/**
> - * struct ttm_lru_walk - Structure describing a LRU walk.
> - */
> -struct ttm_lru_walk {
> -	/** @ops: Pointer to the ops structure. */
> -	const struct ttm_lru_walk_ops *ops;
> -	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> -	struct ttm_operation_ctx *ctx;
> -	/** @ticket: The struct ww_acquire_ctx if any. */
> -	struct ww_acquire_ctx *ticket;
> -	/** @tryock_only: Only use trylock for locking. */
> -	bool trylock_only;
> -};
> -
> -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
> -			   struct ttm_resource_manager *man, s64 target);
> -
>  /**
>   * ttm_bo_get - reference a struct ttm_buffer_object
>   *
> -- 
> 2.34.1
>
Christian König July 11, 2024, 12:01 p.m. UTC | #2
Am 10.07.24 um 20:19 schrieb Matthew Brost:
> On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König wrote:
>> That is something drivers really shouldn't mess with.
>>
> Thomas uses this in Xe to implement a shrinker [1]. Seems to need to
> remain available for drivers.

No, that is exactly what I try to prevent with that.

This is an internally thing of TTM and drivers should never use it directly.

Regards,
Christian.

>
> Matt
>
> [1] https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6
>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
>>   drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
>>   drivers/gpu/drm/ttm/ttm_bo_util.h | 67 +++++++++++++++++++++++++++++++
>>   include/drm/ttm/ttm_bo.h          | 35 ----------------
>>   4 files changed, 70 insertions(+), 35 deletions(-)
>>   create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 0131ec802066..41bee8696e69 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -45,6 +45,7 @@
>>   #include <linux/dma-resv.h>
>>   
>>   #include "ttm_module.h"
>> +#include "ttm_bo_util.h"
>>   
>>   static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
>>   					struct ttm_placement *placement)
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
>> index 3c07f4712d5c..03e28e3d0d03 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
>> @@ -37,6 +37,8 @@
>>   
>>   #include <drm/drm_cache.h>
>>   
>> +#include "ttm_bo_util.h"
>> +
>>   struct ttm_transfer_obj {
>>   	struct ttm_buffer_object base;
>>   	struct ttm_buffer_object *bo;
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h b/drivers/gpu/drm/ttm/ttm_bo_util.h
>> new file mode 100644
>> index 000000000000..c19b50809208
>> --- /dev/null
>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
>> @@ -0,0 +1,67 @@
>> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
>> +/**************************************************************************
>> + * Copyright 2024 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + *
>> + **************************************************************************/
>> +#ifndef _TTM_BO_UTIL_H_
>> +#define _TTM_BO_UTIL_H_
>> +
>> +struct ww_acquire_ctx;
>> +
>> +struct ttm_buffer_object;
>> +struct ttm_operation_ctx;
>> +struct ttm_lru_walk;
>> +
>> +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
>> +struct ttm_lru_walk_ops {
>> +	/**
>> +	 * process_bo - Process this bo.
>> +	 * @walk: struct ttm_lru_walk describing the walk.
>> +	 * @bo: A locked and referenced buffer object.
>> +	 *
>> +	 * Return: Negative error code on error, User-defined positive value
>> +	 * (typically, but not always, size of the processed bo) on success.
>> +	 * On success, the returned values are summed by the walk and the
>> +	 * walk exits when its target is met.
>> +	 * 0 also indicates success, -EBUSY means this bo was skipped.
>> +	 */
>> +	s64 (*process_bo)(struct ttm_lru_walk *walk,
>> +			  struct ttm_buffer_object *bo);
>> +};
>> +
>> +/**
>> + * struct ttm_lru_walk - Structure describing a LRU walk.
>> + */
>> +struct ttm_lru_walk {
>> +	/** @ops: Pointer to the ops structure. */
>> +	const struct ttm_lru_walk_ops *ops;
>> +	/** @ctx: Pointer to the struct ttm_operation_ctx. */
>> +	struct ttm_operation_ctx *ctx;
>> +	/** @ticket: The struct ww_acquire_ctx if any. */
>> +	struct ww_acquire_ctx *ticket;
>> +	/** @tryock_only: Only use trylock for locking. */
>> +	bool trylock_only;
>> +};
>> +
>> +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
>> +			   struct ttm_resource_manager *man, s64 target);
>> +
>> +#endif
>> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
>> index d1a732d56259..5f7c967222a2 100644
>> --- a/include/drm/ttm/ttm_bo.h
>> +++ b/include/drm/ttm/ttm_bo.h
>> @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
>>   	uint64_t bytes_moved;
>>   };
>>   
>> -struct ttm_lru_walk;
>> -
>> -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
>> -struct ttm_lru_walk_ops {
>> -	/**
>> -	 * process_bo - Process this bo.
>> -	 * @walk: struct ttm_lru_walk describing the walk.
>> -	 * @bo: A locked and referenced buffer object.
>> -	 *
>> -	 * Return: Negative error code on error, User-defined positive value
>> -	 * (typically, but not always, size of the processed bo) on success.
>> -	 * On success, the returned values are summed by the walk and the
>> -	 * walk exits when its target is met.
>> -	 * 0 also indicates success, -EBUSY means this bo was skipped.
>> -	 */
>> -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo);
>> -};
>> -
>> -/**
>> - * struct ttm_lru_walk - Structure describing a LRU walk.
>> - */
>> -struct ttm_lru_walk {
>> -	/** @ops: Pointer to the ops structure. */
>> -	const struct ttm_lru_walk_ops *ops;
>> -	/** @ctx: Pointer to the struct ttm_operation_ctx. */
>> -	struct ttm_operation_ctx *ctx;
>> -	/** @ticket: The struct ww_acquire_ctx if any. */
>> -	struct ww_acquire_ctx *ticket;
>> -	/** @tryock_only: Only use trylock for locking. */
>> -	bool trylock_only;
>> -};
>> -
>> -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
>> -			   struct ttm_resource_manager *man, s64 target);
>> -
>>   /**
>>    * ttm_bo_get - reference a struct ttm_buffer_object
>>    *
>> -- 
>> 2.34.1
>>
Matthew Brost July 11, 2024, 4 p.m. UTC | #3
On Thu, Jul 11, 2024 at 02:01:00PM +0200, Christian König wrote:
> Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König wrote:
> > > That is something drivers really shouldn't mess with.
> > > 
> > Thomas uses this in Xe to implement a shrinker [1]. Seems to need to
> > remain available for drivers.
> 
> No, that is exactly what I try to prevent with that.
> 
> This is an internally thing of TTM and drivers should never use it directly.
> 

Admittedly having fully gotten through Thomas's shrinker patches so I
can't reasonably defend how Thomas is using the LRU walker in the Xe
driver but I will say Thomas is generally correct wrt layering code.

I'd say since he for the next month, table this patch until he returns
and we can all discuss options.

Matt

> Regards,
> Christian.
> 
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6
> > 
> > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > ---
> > >   drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
> > >   drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
> > >   drivers/gpu/drm/ttm/ttm_bo_util.h | 67 +++++++++++++++++++++++++++++++
> > >   include/drm/ttm/ttm_bo.h          | 35 ----------------
> > >   4 files changed, 70 insertions(+), 35 deletions(-)
> > >   create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
> > > 
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> > > index 0131ec802066..41bee8696e69 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_bo.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> > > @@ -45,6 +45,7 @@
> > >   #include <linux/dma-resv.h>
> > >   #include "ttm_module.h"
> > > +#include "ttm_bo_util.h"
> > >   static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
> > >   					struct ttm_placement *placement)
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > index 3c07f4712d5c..03e28e3d0d03 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > @@ -37,6 +37,8 @@
> > >   #include <drm/drm_cache.h>
> > > +#include "ttm_bo_util.h"
> > > +
> > >   struct ttm_transfer_obj {
> > >   	struct ttm_buffer_object base;
> > >   	struct ttm_buffer_object *bo;
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > new file mode 100644
> > > index 000000000000..c19b50809208
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > @@ -0,0 +1,67 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> > > +/**************************************************************************
> > > + * Copyright 2024 Advanced Micro Devices, Inc.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person obtaining a
> > > + * copy of this software and associated documentation files (the "Software"),
> > > + * to deal in the Software without restriction, including without limitation
> > > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > > + * and/or sell copies of the Software, and to permit persons to whom the
> > > + * Software is furnished to do so, subject to the following conditions:
> > > + *
> > > + * The above copyright notice and this permission notice shall be included in
> > > + * all copies or substantial portions of the Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> > > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > > + * OTHER DEALINGS IN THE SOFTWARE.
> > > + *
> > > + **************************************************************************/
> > > +#ifndef _TTM_BO_UTIL_H_
> > > +#define _TTM_BO_UTIL_H_
> > > +
> > > +struct ww_acquire_ctx;
> > > +
> > > +struct ttm_buffer_object;
> > > +struct ttm_operation_ctx;
> > > +struct ttm_lru_walk;
> > > +
> > > +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > +struct ttm_lru_walk_ops {
> > > +	/**
> > > +	 * process_bo - Process this bo.
> > > +	 * @walk: struct ttm_lru_walk describing the walk.
> > > +	 * @bo: A locked and referenced buffer object.
> > > +	 *
> > > +	 * Return: Negative error code on error, User-defined positive value
> > > +	 * (typically, but not always, size of the processed bo) on success.
> > > +	 * On success, the returned values are summed by the walk and the
> > > +	 * walk exits when its target is met.
> > > +	 * 0 also indicates success, -EBUSY means this bo was skipped.
> > > +	 */
> > > +	s64 (*process_bo)(struct ttm_lru_walk *walk,
> > > +			  struct ttm_buffer_object *bo);
> > > +};
> > > +
> > > +/**
> > > + * struct ttm_lru_walk - Structure describing a LRU walk.
> > > + */
> > > +struct ttm_lru_walk {
> > > +	/** @ops: Pointer to the ops structure. */
> > > +	const struct ttm_lru_walk_ops *ops;
> > > +	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> > > +	struct ttm_operation_ctx *ctx;
> > > +	/** @ticket: The struct ww_acquire_ctx if any. */
> > > +	struct ww_acquire_ctx *ticket;
> > > +	/** @tryock_only: Only use trylock for locking. */
> > > +	bool trylock_only;
> > > +};
> > > +
> > > +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
> > > +			   struct ttm_resource_manager *man, s64 target);
> > > +
> > > +#endif
> > > diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> > > index d1a732d56259..5f7c967222a2 100644
> > > --- a/include/drm/ttm/ttm_bo.h
> > > +++ b/include/drm/ttm/ttm_bo.h
> > > @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
> > >   	uint64_t bytes_moved;
> > >   };
> > > -struct ttm_lru_walk;
> > > -
> > > -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > -struct ttm_lru_walk_ops {
> > > -	/**
> > > -	 * process_bo - Process this bo.
> > > -	 * @walk: struct ttm_lru_walk describing the walk.
> > > -	 * @bo: A locked and referenced buffer object.
> > > -	 *
> > > -	 * Return: Negative error code on error, User-defined positive value
> > > -	 * (typically, but not always, size of the processed bo) on success.
> > > -	 * On success, the returned values are summed by the walk and the
> > > -	 * walk exits when its target is met.
> > > -	 * 0 also indicates success, -EBUSY means this bo was skipped.
> > > -	 */
> > > -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo);
> > > -};
> > > -
> > > -/**
> > > - * struct ttm_lru_walk - Structure describing a LRU walk.
> > > - */
> > > -struct ttm_lru_walk {
> > > -	/** @ops: Pointer to the ops structure. */
> > > -	const struct ttm_lru_walk_ops *ops;
> > > -	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> > > -	struct ttm_operation_ctx *ctx;
> > > -	/** @ticket: The struct ww_acquire_ctx if any. */
> > > -	struct ww_acquire_ctx *ticket;
> > > -	/** @tryock_only: Only use trylock for locking. */
> > > -	bool trylock_only;
> > > -};
> > > -
> > > -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
> > > -			   struct ttm_resource_manager *man, s64 target);
> > > -
> > >   /**
> > >    * ttm_bo_get - reference a struct ttm_buffer_object
> > >    *
> > > -- 
> > > 2.34.1
> > > 
>
Thomas Hellström Aug. 6, 2024, 8:29 a.m. UTC | #4
Hi, Christian.

On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
> Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König wrote:
> > > That is something drivers really shouldn't mess with.
> > > 
> > Thomas uses this in Xe to implement a shrinker [1]. Seems to need
> > to
> > remain available for drivers.
> 
> No, that is exactly what I try to prevent with that.
> 
> This is an internally thing of TTM and drivers should never use it
> directly.

That driver-facing LRU walker is a direct response/solution to this
comment that you made in the first shrinker series:

https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9


That is also mentioned in the cover letter of the recent shrinker
series, and this walker has been around in that shrinker series for
more than half a year now so if you think it's not the correct driver
facing API IMHO that should be addressed by a review comment in that
series rather than in posting a conflicting patch?

So assuming that we still want the driver to register the shrinker,
IMO that helper abstracts away all the nasty locking and pitfalls for a
driver-registered shrinker, and is similar in structure to for example
the pagewalk helper in mm/pagewalk.c.

An alternative that could be tried as a driver-facing API is to provide
a for_each_bo_in_lru_lock() macro where the driver open-codes
"process_bo()" inside the for loop but I tried this and found it quite
fragile since the driver might exit the loop without performing the
necessary cleanup.

However with using scoped_guard() in linux/cleanup.h that could
probably be mitigated to some exent, but I still think that a walker
helper like this is the safer choice and given the complexity of a for_
macro involving scoped_guard(), I think the walker helper is the
easiest-to-maintain solution moving forward.

But open to suggestions.

Thanks
Thomas


> 
> Regards,
> Christian.
> 
> > 
> > Matt
> > 
> > [1]
> > https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6
> > 
> > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > ---
> > >   drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
> > >   drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
> > >   drivers/gpu/drm/ttm/ttm_bo_util.h | 67
> > > +++++++++++++++++++++++++++++++
> > >   include/drm/ttm/ttm_bo.h          | 35 ----------------
> > >   4 files changed, 70 insertions(+), 35 deletions(-)
> > >   create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
> > > 
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
> > > b/drivers/gpu/drm/ttm/ttm_bo.c
> > > index 0131ec802066..41bee8696e69 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_bo.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> > > @@ -45,6 +45,7 @@
> > >   #include <linux/dma-resv.h>
> > >   
> > >   #include "ttm_module.h"
> > > +#include "ttm_bo_util.h"
> > >   
> > >   static void ttm_bo_mem_space_debug(struct ttm_buffer_object
> > > *bo,
> > >   					struct ttm_placement
> > > *placement)
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > index 3c07f4712d5c..03e28e3d0d03 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > @@ -37,6 +37,8 @@
> > >   
> > >   #include <drm/drm_cache.h>
> > >   
> > > +#include "ttm_bo_util.h"
> > > +
> > >   struct ttm_transfer_obj {
> > >   	struct ttm_buffer_object base;
> > >   	struct ttm_buffer_object *bo;
> > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > new file mode 100644
> > > index 000000000000..c19b50809208
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > @@ -0,0 +1,67 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> > > +/***************************************************************
> > > ***********
> > > + * Copyright 2024 Advanced Micro Devices, Inc.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person
> > > obtaining a
> > > + * copy of this software and associated documentation files (the
> > > "Software"),
> > > + * to deal in the Software without restriction, including
> > > without limitation
> > > + * the rights to use, copy, modify, merge, publish, distribute,
> > > sublicense,
> > > + * and/or sell copies of the Software, and to permit persons to
> > > whom the
> > > + * Software is furnished to do so, subject to the following
> > > conditions:
> > > + *
> > > + * The above copyright notice and this permission notice shall
> > > be included in
> > > + * all copies or substantial portions of the Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> > > KIND, EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > > MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> > > EVENT SHALL
> > > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> > > DAMAGES OR
> > > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> > > OTHERWISE,
> > > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> > > THE USE OR
> > > + * OTHER DEALINGS IN THE SOFTWARE.
> > > + *
> > > +
> > > *****************************************************************
> > > *********/
> > > +#ifndef _TTM_BO_UTIL_H_
> > > +#define _TTM_BO_UTIL_H_
> > > +
> > > +struct ww_acquire_ctx;
> > > +
> > > +struct ttm_buffer_object;
> > > +struct ttm_operation_ctx;
> > > +struct ttm_lru_walk;
> > > +
> > > +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > +struct ttm_lru_walk_ops {
> > > +	/**
> > > +	 * process_bo - Process this bo.
> > > +	 * @walk: struct ttm_lru_walk describing the walk.
> > > +	 * @bo: A locked and referenced buffer object.
> > > +	 *
> > > +	 * Return: Negative error code on error, User-defined
> > > positive value
> > > +	 * (typically, but not always, size of the processed bo)
> > > on success.
> > > +	 * On success, the returned values are summed by the
> > > walk and the
> > > +	 * walk exits when its target is met.
> > > +	 * 0 also indicates success, -EBUSY means this bo was
> > > skipped.
> > > +	 */
> > > +	s64 (*process_bo)(struct ttm_lru_walk *walk,
> > > +			  struct ttm_buffer_object *bo);
> > > +};
> > > +
> > > +/**
> > > + * struct ttm_lru_walk - Structure describing a LRU walk.
> > > + */
> > > +struct ttm_lru_walk {
> > > +	/** @ops: Pointer to the ops structure. */
> > > +	const struct ttm_lru_walk_ops *ops;
> > > +	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> > > +	struct ttm_operation_ctx *ctx;
> > > +	/** @ticket: The struct ww_acquire_ctx if any. */
> > > +	struct ww_acquire_ctx *ticket;
> > > +	/** @tryock_only: Only use trylock for locking. */
> > > +	bool trylock_only;
> > > +};
> > > +
> > > +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
> > > ttm_device *bdev,
> > > +			   struct ttm_resource_manager *man, s64
> > > target);
> > > +
> > > +#endif
> > > diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> > > index d1a732d56259..5f7c967222a2 100644
> > > --- a/include/drm/ttm/ttm_bo.h
> > > +++ b/include/drm/ttm/ttm_bo.h
> > > @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
> > >   	uint64_t bytes_moved;
> > >   };
> > >   
> > > -struct ttm_lru_walk;
> > > -
> > > -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > -struct ttm_lru_walk_ops {
> > > -	/**
> > > -	 * process_bo - Process this bo.
> > > -	 * @walk: struct ttm_lru_walk describing the walk.
> > > -	 * @bo: A locked and referenced buffer object.
> > > -	 *
> > > -	 * Return: Negative error code on error, User-defined
> > > positive value
> > > -	 * (typically, but not always, size of the processed bo)
> > > on success.
> > > -	 * On success, the returned values are summed by the
> > > walk and the
> > > -	 * walk exits when its target is met.
> > > -	 * 0 also indicates success, -EBUSY means this bo was
> > > skipped.
> > > -	 */
> > > -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct
> > > ttm_buffer_object *bo);
> > > -};
> > > -
> > > -/**
> > > - * struct ttm_lru_walk - Structure describing a LRU walk.
> > > - */
> > > -struct ttm_lru_walk {
> > > -	/** @ops: Pointer to the ops structure. */
> > > -	const struct ttm_lru_walk_ops *ops;
> > > -	/** @ctx: Pointer to the struct ttm_operation_ctx. */
> > > -	struct ttm_operation_ctx *ctx;
> > > -	/** @ticket: The struct ww_acquire_ctx if any. */
> > > -	struct ww_acquire_ctx *ticket;
> > > -	/** @tryock_only: Only use trylock for locking. */
> > > -	bool trylock_only;
> > > -};
> > > -
> > > -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
> > > ttm_device *bdev,
> > > -			   struct ttm_resource_manager *man, s64
> > > target);
> > > -
> > >   /**
> > >    * ttm_bo_get - reference a struct ttm_buffer_object
> > >    *
> > > -- 
> > > 2.34.1
> > > 
>
Christian König Aug. 19, 2024, 11:03 a.m. UTC | #5
Am 06.08.24 um 10:29 schrieb Thomas Hellström:
> Hi, Christian.
>
> On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
>> Am 10.07.24 um 20:19 schrieb Matthew Brost:
>>> On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König wrote:
>>>> That is something drivers really shouldn't mess with.
>>>>
>>> Thomas uses this in Xe to implement a shrinker [1]. Seems to need
>>> to
>>> remain available for drivers.
>> No, that is exactly what I try to prevent with that.
>>
>> This is an internally thing of TTM and drivers should never use it
>> directly.
> That driver-facing LRU walker is a direct response/solution to this
> comment that you made in the first shrinker series:
>
> https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9

Ah, yeah that was about how we are should be avoiding middle layer design.

But a function which returns the next candidate for eviction and a 
function which calls a callback for eviction is exactly the opposite.

> That is also mentioned in the cover letter of the recent shrinker
> series, and this walker has been around in that shrinker series for
> more than half a year now so if you think it's not the correct driver
> facing API IMHO that should be addressed by a review comment in that
> series rather than in posting a conflicting patch?

I actually outlined that in the review comments for the patch series. 
E.g. a walker function with a callback is basically a middle layer.

What outlined in the link above is that a function which returns the 
next eviction candidate is a better approach than a callback.

> So assuming that we still want the driver to register the shrinker,
> IMO that helper abstracts away all the nasty locking and pitfalls for a
> driver-registered shrinker, and is similar in structure to for example
> the pagewalk helper in mm/pagewalk.c.
>
> An alternative that could be tried as a driver-facing API is to provide
> a for_each_bo_in_lru_lock() macro where the driver open-codes
> "process_bo()" inside the for loop but I tried this and found it quite
> fragile since the driver might exit the loop without performing the
> necessary cleanup.

The point is that the shrinker should *never* need to have context. E.g. 
a walker which allows going over multiple BOs for eviction is exactly 
the wrong approach for that.

The shrinker should evict always only exactly one BO and the next 
invocation of a shrinker should not depend on the result of the previous 
one.

Or am I missing something vital here?

Regards,
Christian.

>
> However with using scoped_guard() in linux/cleanup.h that could
> probably be mitigated to some exent, but I still think that a walker
> helper like this is the safer choice and given the complexity of a for_
> macro involving scoped_guard(), I think the walker helper is the
> easiest-to-maintain solution moving forward.
>
> But open to suggestions.
>
> Thanks
> Thomas
>
>
>> Regards,
>> Christian.
>>
>>> Matt
>>>
>>> [1]
>>> https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6
>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
>>>>    drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
>>>>    drivers/gpu/drm/ttm/ttm_bo_util.h | 67
>>>> +++++++++++++++++++++++++++++++
>>>>    include/drm/ttm/ttm_bo.h          | 35 ----------------
>>>>    4 files changed, 70 insertions(+), 35 deletions(-)
>>>>    create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
>>>>
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> index 0131ec802066..41bee8696e69 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> @@ -45,6 +45,7 @@
>>>>    #include <linux/dma-resv.h>
>>>>    
>>>>    #include "ttm_module.h"
>>>> +#include "ttm_bo_util.h"
>>>>    
>>>>    static void ttm_bo_mem_space_debug(struct ttm_buffer_object
>>>> *bo,
>>>>    					struct ttm_placement
>>>> *placement)
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c
>>>> b/drivers/gpu/drm/ttm/ttm_bo_util.c
>>>> index 3c07f4712d5c..03e28e3d0d03 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
>>>> @@ -37,6 +37,8 @@
>>>>    
>>>>    #include <drm/drm_cache.h>
>>>>    
>>>> +#include "ttm_bo_util.h"
>>>> +
>>>>    struct ttm_transfer_obj {
>>>>    	struct ttm_buffer_object base;
>>>>    	struct ttm_buffer_object *bo;
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h
>>>> b/drivers/gpu/drm/ttm/ttm_bo_util.h
>>>> new file mode 100644
>>>> index 000000000000..c19b50809208
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
>>>> @@ -0,0 +1,67 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
>>>> +/***************************************************************
>>>> ***********
>>>> + * Copyright 2024 Advanced Micro Devices, Inc.
>>>> + *
>>>> + * Permission is hereby granted, free of charge, to any person
>>>> obtaining a
>>>> + * copy of this software and associated documentation files (the
>>>> "Software"),
>>>> + * to deal in the Software without restriction, including
>>>> without limitation
>>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>>> sublicense,
>>>> + * and/or sell copies of the Software, and to permit persons to
>>>> whom the
>>>> + * Software is furnished to do so, subject to the following
>>>> conditions:
>>>> + *
>>>> + * The above copyright notice and this permission notice shall
>>>> be included in
>>>> + * all copies or substantial portions of the Software.
>>>> + *
>>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
>>>> KIND, EXPRESS OR
>>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>>> MERCHANTABILITY,
>>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>>>> EVENT SHALL
>>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>>> DAMAGES OR
>>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>>> OTHERWISE,
>>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
>>>> THE USE OR
>>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>>> + *
>>>> +
>>>> *****************************************************************
>>>> *********/
>>>> +#ifndef _TTM_BO_UTIL_H_
>>>> +#define _TTM_BO_UTIL_H_
>>>> +
>>>> +struct ww_acquire_ctx;
>>>> +
>>>> +struct ttm_buffer_object;
>>>> +struct ttm_operation_ctx;
>>>> +struct ttm_lru_walk;
>>>> +
>>>> +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
>>>> +struct ttm_lru_walk_ops {
>>>> +	/**
>>>> +	 * process_bo - Process this bo.
>>>> +	 * @walk: struct ttm_lru_walk describing the walk.
>>>> +	 * @bo: A locked and referenced buffer object.
>>>> +	 *
>>>> +	 * Return: Negative error code on error, User-defined
>>>> positive value
>>>> +	 * (typically, but not always, size of the processed bo)
>>>> on success.
>>>> +	 * On success, the returned values are summed by the
>>>> walk and the
>>>> +	 * walk exits when its target is met.
>>>> +	 * 0 also indicates success, -EBUSY means this bo was
>>>> skipped.
>>>> +	 */
>>>> +	s64 (*process_bo)(struct ttm_lru_walk *walk,
>>>> +			  struct ttm_buffer_object *bo);
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct ttm_lru_walk - Structure describing a LRU walk.
>>>> + */
>>>> +struct ttm_lru_walk {
>>>> +	/** @ops: Pointer to the ops structure. */
>>>> +	const struct ttm_lru_walk_ops *ops;
>>>> +	/** @ctx: Pointer to the struct ttm_operation_ctx. */
>>>> +	struct ttm_operation_ctx *ctx;
>>>> +	/** @ticket: The struct ww_acquire_ctx if any. */
>>>> +	struct ww_acquire_ctx *ticket;
>>>> +	/** @tryock_only: Only use trylock for locking. */
>>>> +	bool trylock_only;
>>>> +};
>>>> +
>>>> +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
>>>> ttm_device *bdev,
>>>> +			   struct ttm_resource_manager *man, s64
>>>> target);
>>>> +
>>>> +#endif
>>>> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
>>>> index d1a732d56259..5f7c967222a2 100644
>>>> --- a/include/drm/ttm/ttm_bo.h
>>>> +++ b/include/drm/ttm/ttm_bo.h
>>>> @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
>>>>    	uint64_t bytes_moved;
>>>>    };
>>>>    
>>>> -struct ttm_lru_walk;
>>>> -
>>>> -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
>>>> -struct ttm_lru_walk_ops {
>>>> -	/**
>>>> -	 * process_bo - Process this bo.
>>>> -	 * @walk: struct ttm_lru_walk describing the walk.
>>>> -	 * @bo: A locked and referenced buffer object.
>>>> -	 *
>>>> -	 * Return: Negative error code on error, User-defined
>>>> positive value
>>>> -	 * (typically, but not always, size of the processed bo)
>>>> on success.
>>>> -	 * On success, the returned values are summed by the
>>>> walk and the
>>>> -	 * walk exits when its target is met.
>>>> -	 * 0 also indicates success, -EBUSY means this bo was
>>>> skipped.
>>>> -	 */
>>>> -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct
>>>> ttm_buffer_object *bo);
>>>> -};
>>>> -
>>>> -/**
>>>> - * struct ttm_lru_walk - Structure describing a LRU walk.
>>>> - */
>>>> -struct ttm_lru_walk {
>>>> -	/** @ops: Pointer to the ops structure. */
>>>> -	const struct ttm_lru_walk_ops *ops;
>>>> -	/** @ctx: Pointer to the struct ttm_operation_ctx. */
>>>> -	struct ttm_operation_ctx *ctx;
>>>> -	/** @ticket: The struct ww_acquire_ctx if any. */
>>>> -	struct ww_acquire_ctx *ticket;
>>>> -	/** @tryock_only: Only use trylock for locking. */
>>>> -	bool trylock_only;
>>>> -};
>>>> -
>>>> -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
>>>> ttm_device *bdev,
>>>> -			   struct ttm_resource_manager *man, s64
>>>> target);
>>>> -
>>>>    /**
>>>>     * ttm_bo_get - reference a struct ttm_buffer_object
>>>>     *
>>>> -- 
>>>> 2.34.1
>>>>
Thomas Hellström Aug. 19, 2024, 11:38 a.m. UTC | #6
Hi, Christian,

On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
> Am 06.08.24 um 10:29 schrieb Thomas Hellström:
> > Hi, Christian.
> > 
> > On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
> > > Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > > > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König
> > > > wrote:
> > > > > That is something drivers really shouldn't mess with.
> > > > > 
> > > > Thomas uses this in Xe to implement a shrinker [1]. Seems to
> > > > need
> > > > to
> > > > remain available for drivers.
> > > No, that is exactly what I try to prevent with that.
> > > 
> > > This is an internally thing of TTM and drivers should never use
> > > it
> > > directly.
> > That driver-facing LRU walker is a direct response/solution to this
> > comment that you made in the first shrinker series:
> > 
> > https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
> 
> Ah, yeah that was about how we are should be avoiding middle layer
> design.
> 
> But a function which returns the next candidate for eviction and a 
> function which calls a callback for eviction is exactly the opposite.
> 
> > That is also mentioned in the cover letter of the recent shrinker
> > series, and this walker has been around in that shrinker series for
> > more than half a year now so if you think it's not the correct
> > driver
> > facing API IMHO that should be addressed by a review comment in
> > that
> > series rather than in posting a conflicting patch?
> 
> I actually outlined that in the review comments for the patch series.
> E.g. a walker function with a callback is basically a middle layer.
> 
> What outlined in the link above is that a function which returns the 
> next eviction candidate is a better approach than a callback.
> 
> > So assuming that we still want the driver to register the shrinker,
> > IMO that helper abstracts away all the nasty locking and pitfalls
> > for a
> > driver-registered shrinker, and is similar in structure to for
> > example
> > the pagewalk helper in mm/pagewalk.c.
> > 
> > An alternative that could be tried as a driver-facing API is to
> > provide
> > a for_each_bo_in_lru_lock() macro where the driver open-codes
> > "process_bo()" inside the for loop but I tried this and found it
> > quite
> > fragile since the driver might exit the loop without performing the
> > necessary cleanup.
> 
> The point is that the shrinker should *never* need to have context.
> E.g. 
> a walker which allows going over multiple BOs for eviction is exactly
> the wrong approach for that.
> 
> The shrinker should evict always only exactly one BO and the next 
> invocation of a shrinker should not depend on the result of the
> previous 
> one.
> 
> Or am I missing something vital here?

A couple of things,

1) I'd like to think of the middle-layer vs helper like the helper has
its own ops, and can be used optionally from the driver. IIRC, the
atomic modesetting / pageflip ops are implemented in exactly this way.

Sometimes a certain loop operation can't be easily or at least robustly
implemented using a for_each_.. approach. Like for example
mm/pagewalk.c. In this shrinking case I think it's probably possible
using the scoped_guard() in cleanup.h. This way we could get rid of
this middle layer discussion by turning the interface inside-out:

for_each_bo_on_lru_locked(xxx)
	driver_shrink();

But I do think the currently suggested approach is less fragile and
prone to driver error.

FWIW in addition to the above examples, also drm_gem_lru_scan works
like this.

2) The shrinkers suggest to shrink a number of pages, not a single bo,
again drm_gem_lru_scan works like this. i915 works like this. I think
we should align with those.

3) Even if we had a function to obtain the driver to shrink, the driver
needs to have its say about the suitability for shrinking, so a
callback is needed anyway (eviction_valuable).
In addition, if shrinking fails for some reason, what would stop that
function to return the same bo, again and again just like the problem
we previously had in TTM?

So basically all the restartable LRU work was motivated by this use-
case in mind, so making that private I must say comes as a pretty major
surprise.

I could have a look at the 

for_each_bo_on_lru_locked(xxx)
	driver_shrink();

approach, but having TTM just blindly return a single bo without
context will not work IMO.

Thanks,
/Thomas






> 
> Regards,
> Christian.
> 
> > 
> > However with using scoped_guard() in linux/cleanup.h that could
> > probably be mitigated to some exent, but I still think that a
> > walker
> > helper like this is the safer choice and given the complexity of a
> > for_
> > macro involving scoped_guard(), I think the walker helper is the
> > easiest-to-maintain solution moving forward.
> > 
> > But open to suggestions.
> > 
> > Thanks
> > Thomas
> > 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > Matt
> > > > 
> > > > [1]
> > > > https://patchwork.freedesktop.org/patch/602165/?series=131815&rev=6
> > > > 
> > > > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > > > ---
> > > > >    drivers/gpu/drm/ttm/ttm_bo.c      |  1 +
> > > > >    drivers/gpu/drm/ttm/ttm_bo_util.c |  2 +
> > > > >    drivers/gpu/drm/ttm/ttm_bo_util.h | 67
> > > > > +++++++++++++++++++++++++++++++
> > > > >    include/drm/ttm/ttm_bo.h          | 35 ----------------
> > > > >    4 files changed, 70 insertions(+), 35 deletions(-)
> > > > >    create mode 100644 drivers/gpu/drm/ttm/ttm_bo_util.h
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
> > > > > b/drivers/gpu/drm/ttm/ttm_bo.c
> > > > > index 0131ec802066..41bee8696e69 100644
> > > > > --- a/drivers/gpu/drm/ttm/ttm_bo.c
> > > > > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> > > > > @@ -45,6 +45,7 @@
> > > > >    #include <linux/dma-resv.h>
> > > > >    
> > > > >    #include "ttm_module.h"
> > > > > +#include "ttm_bo_util.h"
> > > > >    
> > > > >    static void ttm_bo_mem_space_debug(struct
> > > > > ttm_buffer_object
> > > > > *bo,
> > > > >    					struct ttm_placement
> > > > > *placement)
> > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > > > b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > > > index 3c07f4712d5c..03e28e3d0d03 100644
> > > > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > > > > @@ -37,6 +37,8 @@
> > > > >    
> > > > >    #include <drm/drm_cache.h>
> > > > >    
> > > > > +#include "ttm_bo_util.h"
> > > > > +
> > > > >    struct ttm_transfer_obj {
> > > > >    	struct ttm_buffer_object base;
> > > > >    	struct ttm_buffer_object *bo;
> > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > > > b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > > > new file mode 100644
> > > > > index 000000000000..c19b50809208
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
> > > > > @@ -0,0 +1,67 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> > > > > +/***********************************************************
> > > > > ****
> > > > > ***********
> > > > > + * Copyright 2024 Advanced Micro Devices, Inc.
> > > > > + *
> > > > > + * Permission is hereby granted, free of charge, to any
> > > > > person
> > > > > obtaining a
> > > > > + * copy of this software and associated documentation files
> > > > > (the
> > > > > "Software"),
> > > > > + * to deal in the Software without restriction, including
> > > > > without limitation
> > > > > + * the rights to use, copy, modify, merge, publish,
> > > > > distribute,
> > > > > sublicense,
> > > > > + * and/or sell copies of the Software, and to permit persons
> > > > > to
> > > > > whom the
> > > > > + * Software is furnished to do so, subject to the following
> > > > > conditions:
> > > > > + *
> > > > > + * The above copyright notice and this permission notice
> > > > > shall
> > > > > be included in
> > > > > + * all copies or substantial portions of the Software.
> > > > > + *
> > > > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> > > > > KIND, EXPRESS OR
> > > > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > > > > MERCHANTABILITY,
> > > > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN
> > > > > NO
> > > > > EVENT SHALL
> > > > > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY
> > > > > CLAIM,
> > > > > DAMAGES OR
> > > > > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
> > > > > OR
> > > > > OTHERWISE,
> > > > > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
> > > > > OR
> > > > > THE USE OR
> > > > > + * OTHER DEALINGS IN THE SOFTWARE.
> > > > > + *
> > > > > +
> > > > > *************************************************************
> > > > > ****
> > > > > *********/
> > > > > +#ifndef _TTM_BO_UTIL_H_
> > > > > +#define _TTM_BO_UTIL_H_
> > > > > +
> > > > > +struct ww_acquire_ctx;
> > > > > +
> > > > > +struct ttm_buffer_object;
> > > > > +struct ttm_operation_ctx;
> > > > > +struct ttm_lru_walk;
> > > > > +
> > > > > +/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > > > +struct ttm_lru_walk_ops {
> > > > > +	/**
> > > > > +	 * process_bo - Process this bo.
> > > > > +	 * @walk: struct ttm_lru_walk describing the walk.
> > > > > +	 * @bo: A locked and referenced buffer object.
> > > > > +	 *
> > > > > +	 * Return: Negative error code on error, User-
> > > > > defined
> > > > > positive value
> > > > > +	 * (typically, but not always, size of the processed
> > > > > bo)
> > > > > on success.
> > > > > +	 * On success, the returned values are summed by the
> > > > > walk and the
> > > > > +	 * walk exits when its target is met.
> > > > > +	 * 0 also indicates success, -EBUSY means this bo
> > > > > was
> > > > > skipped.
> > > > > +	 */
> > > > > +	s64 (*process_bo)(struct ttm_lru_walk *walk,
> > > > > +			  struct ttm_buffer_object *bo);
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct ttm_lru_walk - Structure describing a LRU walk.
> > > > > + */
> > > > > +struct ttm_lru_walk {
> > > > > +	/** @ops: Pointer to the ops structure. */
> > > > > +	const struct ttm_lru_walk_ops *ops;
> > > > > +	/** @ctx: Pointer to the struct ttm_operation_ctx.
> > > > > */
> > > > > +	struct ttm_operation_ctx *ctx;
> > > > > +	/** @ticket: The struct ww_acquire_ctx if any. */
> > > > > +	struct ww_acquire_ctx *ticket;
> > > > > +	/** @tryock_only: Only use trylock for locking. */
> > > > > +	bool trylock_only;
> > > > > +};
> > > > > +
> > > > > +s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
> > > > > ttm_device *bdev,
> > > > > +			   struct ttm_resource_manager *man,
> > > > > s64
> > > > > target);
> > > > > +
> > > > > +#endif
> > > > > diff --git a/include/drm/ttm/ttm_bo.h
> > > > > b/include/drm/ttm/ttm_bo.h
> > > > > index d1a732d56259..5f7c967222a2 100644
> > > > > --- a/include/drm/ttm/ttm_bo.h
> > > > > +++ b/include/drm/ttm/ttm_bo.h
> > > > > @@ -194,41 +194,6 @@ struct ttm_operation_ctx {
> > > > >    	uint64_t bytes_moved;
> > > > >    };
> > > > >    
> > > > > -struct ttm_lru_walk;
> > > > > -
> > > > > -/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
> > > > > -struct ttm_lru_walk_ops {
> > > > > -	/**
> > > > > -	 * process_bo - Process this bo.
> > > > > -	 * @walk: struct ttm_lru_walk describing the walk.
> > > > > -	 * @bo: A locked and referenced buffer object.
> > > > > -	 *
> > > > > -	 * Return: Negative error code on error, User-
> > > > > defined
> > > > > positive value
> > > > > -	 * (typically, but not always, size of the processed
> > > > > bo)
> > > > > on success.
> > > > > -	 * On success, the returned values are summed by the
> > > > > walk and the
> > > > > -	 * walk exits when its target is met.
> > > > > -	 * 0 also indicates success, -EBUSY means this bo
> > > > > was
> > > > > skipped.
> > > > > -	 */
> > > > > -	s64 (*process_bo)(struct ttm_lru_walk *walk, struct
> > > > > ttm_buffer_object *bo);
> > > > > -};
> > > > > -
> > > > > -/**
> > > > > - * struct ttm_lru_walk - Structure describing a LRU walk.
> > > > > - */
> > > > > -struct ttm_lru_walk {
> > > > > -	/** @ops: Pointer to the ops structure. */
> > > > > -	const struct ttm_lru_walk_ops *ops;
> > > > > -	/** @ctx: Pointer to the struct ttm_operation_ctx.
> > > > > */
> > > > > -	struct ttm_operation_ctx *ctx;
> > > > > -	/** @ticket: The struct ww_acquire_ctx if any. */
> > > > > -	struct ww_acquire_ctx *ticket;
> > > > > -	/** @tryock_only: Only use trylock for locking. */
> > > > > -	bool trylock_only;
> > > > > -};
> > > > > -
> > > > > -s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct
> > > > > ttm_device *bdev,
> > > > > -			   struct ttm_resource_manager *man,
> > > > > s64
> > > > > target);
> > > > > -
> > > > >    /**
> > > > >     * ttm_bo_get - reference a struct ttm_buffer_object
> > > > >     *
> > > > > -- 
> > > > > 2.34.1
> > > > > 
>
Daniel Vetter Aug. 19, 2024, 2:14 p.m. UTC | #7
On Mon, Aug 19, 2024 at 01:38:56PM +0200, Thomas Hellström wrote:
> Hi, Christian,
> 
> On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
> > Am 06.08.24 um 10:29 schrieb Thomas Hellström:
> > > Hi, Christian.
> > > 
> > > On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
> > > > Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > > > > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König
> > > > > wrote:
> > > > > > That is something drivers really shouldn't mess with.
> > > > > > 
> > > > > Thomas uses this in Xe to implement a shrinker [1]. Seems to
> > > > > need
> > > > > to
> > > > > remain available for drivers.
> > > > No, that is exactly what I try to prevent with that.
> > > > 
> > > > This is an internally thing of TTM and drivers should never use
> > > > it
> > > > directly.
> > > That driver-facing LRU walker is a direct response/solution to this
> > > comment that you made in the first shrinker series:
> > > 
> > > https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
> > 
> > Ah, yeah that was about how we are should be avoiding middle layer
> > design.
> > 
> > But a function which returns the next candidate for eviction and a 
> > function which calls a callback for eviction is exactly the opposite.
> > 
> > > That is also mentioned in the cover letter of the recent shrinker
> > > series, and this walker has been around in that shrinker series for
> > > more than half a year now so if you think it's not the correct
> > > driver
> > > facing API IMHO that should be addressed by a review comment in
> > > that
> > > series rather than in posting a conflicting patch?
> > 
> > I actually outlined that in the review comments for the patch series.
> > E.g. a walker function with a callback is basically a middle layer.
> > 
> > What outlined in the link above is that a function which returns the 
> > next eviction candidate is a better approach than a callback.
> > 
> > > So assuming that we still want the driver to register the shrinker,
> > > IMO that helper abstracts away all the nasty locking and pitfalls
> > > for a
> > > driver-registered shrinker, and is similar in structure to for
> > > example
> > > the pagewalk helper in mm/pagewalk.c.
> > > 
> > > An alternative that could be tried as a driver-facing API is to
> > > provide
> > > a for_each_bo_in_lru_lock() macro where the driver open-codes
> > > "process_bo()" inside the for loop but I tried this and found it
> > > quite
> > > fragile since the driver might exit the loop without performing the
> > > necessary cleanup.
> > 
> > The point is that the shrinker should *never* need to have context.
> > E.g. 
> > a walker which allows going over multiple BOs for eviction is exactly
> > the wrong approach for that.
> > 
> > The shrinker should evict always only exactly one BO and the next 
> > invocation of a shrinker should not depend on the result of the
> > previous 
> > one.
> > 
> > Or am I missing something vital here?
> 
> A couple of things,
> 
> 1) I'd like to think of the middle-layer vs helper like the helper has
> its own ops, and can be used optionally from the driver. IIRC, the
> atomic modesetting / pageflip ops are implemented in exactly this way.
> 
> Sometimes a certain loop operation can't be easily or at least robustly
> implemented using a for_each_.. approach. Like for example
> mm/pagewalk.c. In this shrinking case I think it's probably possible
> using the scoped_guard() in cleanup.h. This way we could get rid of
> this middle layer discussion by turning the interface inside-out:
> 
> for_each_bo_on_lru_locked(xxx)
> 	driver_shrink();
> 
> But I do think the currently suggested approach is less fragile and
> prone to driver error.
> 
> FWIW in addition to the above examples, also drm_gem_lru_scan works
> like this.

a iteration state structure (like drm_connector_iter) plus then a macro
for the common case that uses cleanup.h should get the job done.

> 2) The shrinkers suggest to shrink a number of pages, not a single bo,
> again drm_gem_lru_scan works like this. i915 works like this. I think
> we should align with those.

Yeah that's how shrinkers work, so if we demidlayer then it realls needs
to be a loop in the driver, not a "here's the next bo to nuke" I think.

> 3) Even if we had a function to obtain the driver to shrink, the driver
> needs to have its say about the suitability for shrinking, so a
> callback is needed anyway (eviction_valuable).
> In addition, if shrinking fails for some reason, what would stop that
> function to return the same bo, again and again just like the problem
> we previously had in TTM?

Yeah I think if consensus moves back to drivers not looking at ttm lru
directly, then that entire shrinker looping needs to be as some kind of
midlayer in ttm itself. Otherwise I don't think it works, so agreeing with
Thomas here.

> So basically all the restartable LRU work was motivated by this use-
> case in mind, so making that private I must say comes as a pretty major
> surprise.
> 
> I could have a look at the 
> 
> for_each_bo_on_lru_locked(xxx)
> 	driver_shrink();
> 
> approach, but having TTM just blindly return a single bo without
> context will not work IMO.

Another thing to keep in mind is that at least experience from really
resource-starved igpu platforms says that cpu consumption for shrinking
matters. So really need to not toss list walk state, and also at least
from I think msm experience and maybe also i915 (i915-gem's a bit ... too
complex to really understand it anymore) is that parallelism matters too.
Eventually under memory pressures multiple cpu cores just aboslutely
hammer the shrinkers, so being stuck on locks is no good.

But maybe let's get this off the ground first.
-Sima
Christian König Aug. 19, 2024, 3:26 p.m. UTC | #8
Am 19.08.24 um 16:14 schrieb Daniel Vetter:
> On Mon, Aug 19, 2024 at 01:38:56PM +0200, Thomas Hellström wrote:
>> Hi, Christian,
>>
>> On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
>>> Am 06.08.24 um 10:29 schrieb Thomas Hellström:
>>>> Hi, Christian.
>>>>
>>>> On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
>>>>> Am 10.07.24 um 20:19 schrieb Matthew Brost:
>>>>>> On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König
>>>>>> wrote:
>>>>>>> That is something drivers really shouldn't mess with.
>>>>>>>
>>>>>> Thomas uses this in Xe to implement a shrinker [1]. Seems to
>>>>>> need
>>>>>> to
>>>>>> remain available for drivers.
>>>>> No, that is exactly what I try to prevent with that.
>>>>>
>>>>> This is an internally thing of TTM and drivers should never use
>>>>> it
>>>>> directly.
>>>> That driver-facing LRU walker is a direct response/solution to this
>>>> comment that you made in the first shrinker series:
>>>>
>>>> https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
>>> Ah, yeah that was about how we are should be avoiding middle layer
>>> design.
>>>
>>> But a function which returns the next candidate for eviction and a
>>> function which calls a callback for eviction is exactly the opposite.
>>>
>>>> That is also mentioned in the cover letter of the recent shrinker
>>>> series, and this walker has been around in that shrinker series for
>>>> more than half a year now so if you think it's not the correct
>>>> driver
>>>> facing API IMHO that should be addressed by a review comment in
>>>> that
>>>> series rather than in posting a conflicting patch?
>>> I actually outlined that in the review comments for the patch series.
>>> E.g. a walker function with a callback is basically a middle layer.
>>>
>>> What outlined in the link above is that a function which returns the
>>> next eviction candidate is a better approach than a callback.
>>>
>>>> So assuming that we still want the driver to register the shrinker,
>>>> IMO that helper abstracts away all the nasty locking and pitfalls
>>>> for a
>>>> driver-registered shrinker, and is similar in structure to for
>>>> example
>>>> the pagewalk helper in mm/pagewalk.c.
>>>>
>>>> An alternative that could be tried as a driver-facing API is to
>>>> provide
>>>> a for_each_bo_in_lru_lock() macro where the driver open-codes
>>>> "process_bo()" inside the for loop but I tried this and found it
>>>> quite
>>>> fragile since the driver might exit the loop without performing the
>>>> necessary cleanup.
>>> The point is that the shrinker should *never* need to have context.
>>> E.g.
>>> a walker which allows going over multiple BOs for eviction is exactly
>>> the wrong approach for that.
>>>
>>> The shrinker should evict always only exactly one BO and the next
>>> invocation of a shrinker should not depend on the result of the
>>> previous
>>> one.
>>>
>>> Or am I missing something vital here?
>> A couple of things,
>>
>> 1) I'd like to think of the middle-layer vs helper like the helper has
>> its own ops, and can be used optionally from the driver. IIRC, the
>> atomic modesetting / pageflip ops are implemented in exactly this way.
>>
>> Sometimes a certain loop operation can't be easily or at least robustly
>> implemented using a for_each_.. approach. Like for example
>> mm/pagewalk.c. In this shrinking case I think it's probably possible
>> using the scoped_guard() in cleanup.h. This way we could get rid of
>> this middle layer discussion by turning the interface inside-out:
>>
>> for_each_bo_on_lru_locked(xxx)
>> 	driver_shrink();
>>
>> But I do think the currently suggested approach is less fragile and
>> prone to driver error.
>>
>> FWIW in addition to the above examples, also drm_gem_lru_scan works
>> like this.
> a iteration state structure (like drm_connector_iter) plus then a macro
> for the common case that uses cleanup.h should get the job done.

Yeah, completely agree. It basically boils down to which one needs to 
pack a state bag.

In a mid-layer design it's the driver or the caller of functions, e.g. 
the much hated init callback in the DRM layer was a perfect example of that.

In a non mid-layer approach it's the framework or the called function, 
examples are how the fence iterators in the dma_resv or the 
drm_connector, plane etc.. work.

One big difference between the two approach is who and where things like 
preparation and cleanup are done, e.g. who takes locks for example.

>> 2) The shrinkers suggest to shrink a number of pages, not a single bo,
>> again drm_gem_lru_scan works like this. i915 works like this. I think
>> we should align with those.
> Yeah that's how shrinkers work, so if we demidlayer then it realls needs
> to be a loop in the driver, not a "here's the next bo to nuke" I think.

Hui? Well that's not how I understand shrinkers.

The shrinker gives you the maximum number of objects to scan and not how 
many pages to free. In return you give the actual number of objects to 
scanned and pages freed.

As far as I know the number of objects are in the sense of SLUBs or 
rather different LRU lists.

So for BO shrinking we should probably only free or rather unpin a 
single BO per callback otherwise we mess up the fairness between 
shrinkers in the MM layer.

>> 3) Even if we had a function to obtain the driver to shrink, the driver
>> needs to have its say about the suitability for shrinking, so a
>> callback is needed anyway (eviction_valuable).
>> In addition, if shrinking fails for some reason, what would stop that
>> function to return the same bo, again and again just like the problem
>> we previously had in TTM?
> Yeah I think if consensus moves back to drivers not looking at ttm lru
> directly, then that entire shrinker looping needs to be as some kind of
> midlayer in ttm itself. Otherwise I don't think it works, so agreeing with
> Thomas here.

Well, the idea was to get rid of callbacks like eviction_valuable.

So instead of TTM calling the driver asking if a BO is valuable to evict 
TTM provides the functionality to iterate over the possible candidates 
and the driver then says "oh yeah evict that one".

We probably can't do that fully, e.g. in case of contention on a 
resource we still need a callback to inform the driver and we also need 
a copy callback. But that's basically it.

>> So basically all the restartable LRU work was motivated by this use-
>> case in mind, so making that private I must say comes as a pretty major
>> surprise.
>>
>> I could have a look at the
>>
>> for_each_bo_on_lru_locked(xxx)
>> 	driver_shrink();
>>
>> approach, but having TTM just blindly return a single bo without
>> context will not work IMO.
> Another thing to keep in mind is that at least experience from really
> resource-starved igpu platforms says that cpu consumption for shrinking
> matters. So really need to not toss list walk state, and also at least
> from I think msm experience and maybe also i915 (i915-gem's a bit ... too
> complex to really understand it anymore) is that parallelism matters too.
> Eventually under memory pressures multiple cpu cores just aboslutely
> hammer the shrinkers, so being stuck on locks is no good.

Yeah that's basically the reason why I looked into this before as well.

Thomas implementation is actually pretty good at that because it only 
has the LRU spinlock as contented object and multiple CPUs can walk the 
LRU at the same time otherwise.

Regards,
Christian.

>
> But maybe let's get this off the ground first.
> -Sima
Thomas Hellström Aug. 20, 2024, 10:37 a.m. UTC | #9
Hi,

On Mon, 2024-08-19 at 17:26 +0200, Christian König wrote:
> Am 19.08.24 um 16:14 schrieb Daniel Vetter:
> > On Mon, Aug 19, 2024 at 01:38:56PM +0200, Thomas Hellström wrote:
> > > Hi, Christian,
> > > 
> > > On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
> > > > Am 06.08.24 um 10:29 schrieb Thomas Hellström:
> > > > > Hi, Christian.
> > > > > 
> > > > > On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
> > > > > > Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > > > > > > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König
> > > > > > > wrote:
> > > > > > > > That is something drivers really shouldn't mess with.
> > > > > > > > 
> > > > > > > Thomas uses this in Xe to implement a shrinker [1]. Seems
> > > > > > > to
> > > > > > > need
> > > > > > > to
> > > > > > > remain available for drivers.
> > > > > > No, that is exactly what I try to prevent with that.
> > > > > > 
> > > > > > This is an internally thing of TTM and drivers should never
> > > > > > use
> > > > > > it
> > > > > > directly.
> > > > > That driver-facing LRU walker is a direct response/solution
> > > > > to this
> > > > > comment that you made in the first shrinker series:
> > > > > 
> > > > > https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
> > > > Ah, yeah that was about how we are should be avoiding middle
> > > > layer
> > > > design.
> > > > 
> > > > But a function which returns the next candidate for eviction
> > > > and a
> > > > function which calls a callback for eviction is exactly the
> > > > opposite.
> > > > 
> > > > > That is also mentioned in the cover letter of the recent
> > > > > shrinker
> > > > > series, and this walker has been around in that shrinker
> > > > > series for
> > > > > more than half a year now so if you think it's not the
> > > > > correct
> > > > > driver
> > > > > facing API IMHO that should be addressed by a review comment
> > > > > in
> > > > > that
> > > > > series rather than in posting a conflicting patch?
> > > > I actually outlined that in the review comments for the patch
> > > > series.
> > > > E.g. a walker function with a callback is basically a middle
> > > > layer.
> > > > 
> > > > What outlined in the link above is that a function which
> > > > returns the
> > > > next eviction candidate is a better approach than a callback.
> > > > 
> > > > > So assuming that we still want the driver to register the
> > > > > shrinker,
> > > > > IMO that helper abstracts away all the nasty locking and
> > > > > pitfalls
> > > > > for a
> > > > > driver-registered shrinker, and is similar in structure to
> > > > > for
> > > > > example
> > > > > the pagewalk helper in mm/pagewalk.c.
> > > > > 
> > > > > An alternative that could be tried as a driver-facing API is
> > > > > to
> > > > > provide
> > > > > a for_each_bo_in_lru_lock() macro where the driver open-codes
> > > > > "process_bo()" inside the for loop but I tried this and found
> > > > > it
> > > > > quite
> > > > > fragile since the driver might exit the loop without
> > > > > performing the
> > > > > necessary cleanup.
> > > > The point is that the shrinker should *never* need to have
> > > > context.
> > > > E.g.
> > > > a walker which allows going over multiple BOs for eviction is
> > > > exactly
> > > > the wrong approach for that.
> > > > 
> > > > The shrinker should evict always only exactly one BO and the
> > > > next
> > > > invocation of a shrinker should not depend on the result of the
> > > > previous
> > > > one.
> > > > 
> > > > Or am I missing something vital here?
> > > A couple of things,
> > > 
> > > 1) I'd like to think of the middle-layer vs helper like the
> > > helper has
> > > its own ops, and can be used optionally from the driver. IIRC,
> > > the
> > > atomic modesetting / pageflip ops are implemented in exactly this
> > > way.
> > > 
> > > Sometimes a certain loop operation can't be easily or at least
> > > robustly
> > > implemented using a for_each_.. approach. Like for example
> > > mm/pagewalk.c. In this shrinking case I think it's probably
> > > possible
> > > using the scoped_guard() in cleanup.h. This way we could get rid
> > > of
> > > this middle layer discussion by turning the interface inside-out:
> > > 
> > > for_each_bo_on_lru_locked(xxx)
> > > 	driver_shrink();
> > > 
> > > But I do think the currently suggested approach is less fragile
> > > and
> > > prone to driver error.
> > > 
> > > FWIW in addition to the above examples, also drm_gem_lru_scan
> > > works
> > > like this.
> > a iteration state structure (like drm_connector_iter) plus then a
> > macro
> > for the common case that uses cleanup.h should get the job done.
> 
> Yeah, completely agree. It basically boils down to which one needs to
> pack a state bag.
> 
> In a mid-layer design it's the driver or the caller of functions,
> e.g. 
> the much hated init callback in the DRM layer was a perfect example
> of that.
> 
> In a non mid-layer approach it's the framework or the called
> function, 
> examples are how the fence iterators in the dma_resv or the 
> drm_connector, plane etc.. work.
> 
> One big difference between the two approach is who and where things
> like 
> preparation and cleanup are done, e.g. who takes locks for example.

So what in your opinion is an acceptable solution here? 
The "get next object to shrink" approach won't work, since it's subject
to the old TTM swap problem now removed:
If shrinking fails we will get the same object upon subsequent calls.
If we bump LRU we could end up with infinite loops.
So IMO we need to be able to loop. I don't really care wether we do
this as an explicit loop or whether we use the LRU walker, but I think
from a maintainability point-of-view it is better to keep LRU walking
in a single place.

If we return an unlocked object, we'd need to refcount and drop the lru
lock, but maybe that's not a bad thing.

But what's the main drawback of exporting the existing helper.

> 
> > > 2) The shrinkers suggest to shrink a number of pages, not a
> > > single bo,
> > > again drm_gem_lru_scan works like this. i915 works like this. I
> > > think
> > > we should align with those.
> > Yeah that's how shrinkers work, so if we demidlayer then it realls
> > needs
> > to be a loop in the driver, not a "here's the next bo to nuke" I
> > think.
> 
> Hui? Well that's not how I understand shrinkers.
> 
> The shrinker gives you the maximum number of objects to scan and not
> how 
> many pages to free. In return you give the actual number of objects
> to 
> scanned and pages freed.
> 
> As far as I know the number of objects are in the sense of SLUBs or 
> rather different LRU lists.
> 
> So for BO shrinking we should probably only free or rather unpin a 
> single BO per callback otherwise we mess up the fairness between 
> shrinkers in the MM layer.

Hm. AFAICT all drm shrinkers use pages as objects, and the docs of
nr_to_scan says it's the number of objects to scan and try to reclaim.
I think this strategy has had a fair amount of testing with the i915
driver.
In any case, I don't think TTM should enforce a different way of
shrinking by the means of a severely restricted helper?

/Thomas
Christian König Aug. 20, 2024, 3:45 p.m. UTC | #10
Am 20.08.24 um 12:37 schrieb Thomas Hellström:
> Hi,
>
> On Mon, 2024-08-19 at 17:26 +0200, Christian König wrote:
>> Am 19.08.24 um 16:14 schrieb Daniel Vetter:
>>> On Mon, Aug 19, 2024 at 01:38:56PM +0200, Thomas Hellström wrote:
>>>> Hi, Christian,
>>>>
>>>> On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
>>>>> Am 06.08.24 um 10:29 schrieb Thomas Hellström:
>>>>>> Hi, Christian.
>>>>>>
>>>>>> On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
>>>>>>> Am 10.07.24 um 20:19 schrieb Matthew Brost:
>>>>>>>> On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian König
>>>>>>>> wrote:
>>>>>>>>> That is something drivers really shouldn't mess with.
>>>>>>>>>
>>>>>>>> Thomas uses this in Xe to implement a shrinker [1]. Seems
>>>>>>>> to
>>>>>>>> need
>>>>>>>> to
>>>>>>>> remain available for drivers.
>>>>>>> No, that is exactly what I try to prevent with that.
>>>>>>>
>>>>>>> This is an internally thing of TTM and drivers should never
>>>>>>> use
>>>>>>> it
>>>>>>> directly.
>>>>>> That driver-facing LRU walker is a direct response/solution
>>>>>> to this
>>>>>> comment that you made in the first shrinker series:
>>>>>>
>>>>>> https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
>>>>> Ah, yeah that was about how we are should be avoiding middle
>>>>> layer
>>>>> design.
>>>>>
>>>>> But a function which returns the next candidate for eviction
>>>>> and a
>>>>> function which calls a callback for eviction is exactly the
>>>>> opposite.
>>>>>
>>>>>> That is also mentioned in the cover letter of the recent
>>>>>> shrinker
>>>>>> series, and this walker has been around in that shrinker
>>>>>> series for
>>>>>> more than half a year now so if you think it's not the
>>>>>> correct
>>>>>> driver
>>>>>> facing API IMHO that should be addressed by a review comment
>>>>>> in
>>>>>> that
>>>>>> series rather than in posting a conflicting patch?
>>>>> I actually outlined that in the review comments for the patch
>>>>> series.
>>>>> E.g. a walker function with a callback is basically a middle
>>>>> layer.
>>>>>
>>>>> What outlined in the link above is that a function which
>>>>> returns the
>>>>> next eviction candidate is a better approach than a callback.
>>>>>
>>>>>> So assuming that we still want the driver to register the
>>>>>> shrinker,
>>>>>> IMO that helper abstracts away all the nasty locking and
>>>>>> pitfalls
>>>>>> for a
>>>>>> driver-registered shrinker, and is similar in structure to
>>>>>> for
>>>>>> example
>>>>>> the pagewalk helper in mm/pagewalk.c.
>>>>>>
>>>>>> An alternative that could be tried as a driver-facing API is
>>>>>> to
>>>>>> provide
>>>>>> a for_each_bo_in_lru_lock() macro where the driver open-codes
>>>>>> "process_bo()" inside the for loop but I tried this and found
>>>>>> it
>>>>>> quite
>>>>>> fragile since the driver might exit the loop without
>>>>>> performing the
>>>>>> necessary cleanup.
>>>>> The point is that the shrinker should *never* need to have
>>>>> context.
>>>>> E.g.
>>>>> a walker which allows going over multiple BOs for eviction is
>>>>> exactly
>>>>> the wrong approach for that.
>>>>>
>>>>> The shrinker should evict always only exactly one BO and the
>>>>> next
>>>>> invocation of a shrinker should not depend on the result of the
>>>>> previous
>>>>> one.
>>>>>
>>>>> Or am I missing something vital here?
>>>> A couple of things,
>>>>
>>>> 1) I'd like to think of the middle-layer vs helper like the
>>>> helper has
>>>> its own ops, and can be used optionally from the driver. IIRC,
>>>> the
>>>> atomic modesetting / pageflip ops are implemented in exactly this
>>>> way.
>>>>
>>>> Sometimes a certain loop operation can't be easily or at least
>>>> robustly
>>>> implemented using a for_each_.. approach. Like for example
>>>> mm/pagewalk.c. In this shrinking case I think it's probably
>>>> possible
>>>> using the scoped_guard() in cleanup.h. This way we could get rid
>>>> of
>>>> this middle layer discussion by turning the interface inside-out:
>>>>
>>>> for_each_bo_on_lru_locked(xxx)
>>>> 	driver_shrink();
>>>>
>>>> But I do think the currently suggested approach is less fragile
>>>> and
>>>> prone to driver error.
>>>>
>>>> FWIW in addition to the above examples, also drm_gem_lru_scan
>>>> works
>>>> like this.
>>> a iteration state structure (like drm_connector_iter) plus then a
>>> macro
>>> for the common case that uses cleanup.h should get the job done.
>> Yeah, completely agree. It basically boils down to which one needs to
>> pack a state bag.
>>
>> In a mid-layer design it's the driver or the caller of functions,
>> e.g.
>> the much hated init callback in the DRM layer was a perfect example
>> of that.
>>
>> In a non mid-layer approach it's the framework or the called
>> function,
>> examples are how the fence iterators in the dma_resv or the
>> drm_connector, plane etc.. work.
>>
>> One big difference between the two approach is who and where things
>> like
>> preparation and cleanup are done, e.g. who takes locks for example.
> So what in your opinion is an acceptable solution here?
> The "get next object to shrink" approach won't work, since it's subject
> to the old TTM swap problem now removed:
> If shrinking fails we will get the same object upon subsequent calls.

But and that is expected behavior. If shrinking fails just going to the 
next object is invalid as far as I can see.

That's why I was so puzzled why you tried to apply the walker to the TTM 
shrinker.

Or why exactly should shrinking fail?

> If we bump LRU we could end up with infinite loops.
> So IMO we need to be able to loop. I don't really care wether we do
> this as an explicit loop or whether we use the LRU walker, but I think
> from a maintainability point-of-view it is better to keep LRU walking
> in a single place.
>
> If we return an unlocked object, we'd need to refcount and drop the lru
> lock, but maybe that's not a bad thing.
>
> But what's the main drawback of exporting the existing helper.

Well that we re-creates exactly the mid-layer mess I worked so hard to 
remove from TTM.

>>>> 2) The shrinkers suggest to shrink a number of pages, not a
>>>> single bo,
>>>> again drm_gem_lru_scan works like this. i915 works like this. I
>>>> think
>>>> we should align with those.
>>> Yeah that's how shrinkers work, so if we demidlayer then it realls
>>> needs
>>> to be a loop in the driver, not a "here's the next bo to nuke" I
>>> think.
>> Hui? Well that's not how I understand shrinkers.
>>
>> The shrinker gives you the maximum number of objects to scan and not
>> how
>> many pages to free. In return you give the actual number of objects
>> to
>> scanned and pages freed.
>>
>> As far as I know the number of objects are in the sense of SLUBs or
>> rather different LRU lists.
>>
>> So for BO shrinking we should probably only free or rather unpin a
>> single BO per callback otherwise we mess up the fairness between
>> shrinkers in the MM layer.
> Hm. AFAICT all drm shrinkers use pages as objects, and the docs of
> nr_to_scan says it's the number of objects to scan and try to reclaim.
> I think this strategy has had a fair amount of testing with the i915
> driver.
> In any case, I don't think TTM should enforce a different way of
> shrinking by the means of a severely restricted helper?

Well, as far as I can see that is exactly what TTM should do.

I mean the main advantage to make a common component is to enforce 
correct behavior.

Regards,
Christian.

>
> /Thomas
>
Thomas Hellström Aug. 20, 2024, 4 p.m. UTC | #11
On Tue, 2024-08-20 at 17:45 +0200, Christian König wrote:
> Am 20.08.24 um 12:37 schrieb Thomas Hellström:
> > Hi,
> > 
> > On Mon, 2024-08-19 at 17:26 +0200, Christian König wrote:
> > > Am 19.08.24 um 16:14 schrieb Daniel Vetter:
> > > > On Mon, Aug 19, 2024 at 01:38:56PM +0200, Thomas Hellström
> > > > wrote:
> > > > > Hi, Christian,
> > > > > 
> > > > > On Mon, 2024-08-19 at 13:03 +0200, Christian König wrote:
> > > > > > Am 06.08.24 um 10:29 schrieb Thomas Hellström:
> > > > > > > Hi, Christian.
> > > > > > > 
> > > > > > > On Thu, 2024-07-11 at 14:01 +0200, Christian König wrote:
> > > > > > > > Am 10.07.24 um 20:19 schrieb Matthew Brost:
> > > > > > > > > On Wed, Jul 10, 2024 at 02:42:58PM +0200, Christian
> > > > > > > > > König
> > > > > > > > > wrote:
> > > > > > > > > > That is something drivers really shouldn't mess
> > > > > > > > > > with.
> > > > > > > > > > 
> > > > > > > > > Thomas uses this in Xe to implement a shrinker [1].
> > > > > > > > > Seems
> > > > > > > > > to
> > > > > > > > > need
> > > > > > > > > to
> > > > > > > > > remain available for drivers.
> > > > > > > > No, that is exactly what I try to prevent with that.
> > > > > > > > 
> > > > > > > > This is an internally thing of TTM and drivers should
> > > > > > > > never
> > > > > > > > use
> > > > > > > > it
> > > > > > > > directly.
> > > > > > > That driver-facing LRU walker is a direct
> > > > > > > response/solution
> > > > > > > to this
> > > > > > > comment that you made in the first shrinker series:
> > > > > > > 
> > > > > > > https://lore.kernel.org/linux-mm/b7491378-defd-4f1c-31e2-29e4c77e2d67@amd.com/T/#ma918844aa8a6efe8768fdcda0c6590d5c93850c9
> > > > > > Ah, yeah that was about how we are should be avoiding
> > > > > > middle
> > > > > > layer
> > > > > > design.
> > > > > > 
> > > > > > But a function which returns the next candidate for
> > > > > > eviction
> > > > > > and a
> > > > > > function which calls a callback for eviction is exactly the
> > > > > > opposite.
> > > > > > 
> > > > > > > That is also mentioned in the cover letter of the recent
> > > > > > > shrinker
> > > > > > > series, and this walker has been around in that shrinker
> > > > > > > series for
> > > > > > > more than half a year now so if you think it's not the
> > > > > > > correct
> > > > > > > driver
> > > > > > > facing API IMHO that should be addressed by a review
> > > > > > > comment
> > > > > > > in
> > > > > > > that
> > > > > > > series rather than in posting a conflicting patch?
> > > > > > I actually outlined that in the review comments for the
> > > > > > patch
> > > > > > series.
> > > > > > E.g. a walker function with a callback is basically a
> > > > > > middle
> > > > > > layer.
> > > > > > 
> > > > > > What outlined in the link above is that a function which
> > > > > > returns the
> > > > > > next eviction candidate is a better approach than a
> > > > > > callback.
> > > > > > 
> > > > > > > So assuming that we still want the driver to register the
> > > > > > > shrinker,
> > > > > > > IMO that helper abstracts away all the nasty locking and
> > > > > > > pitfalls
> > > > > > > for a
> > > > > > > driver-registered shrinker, and is similar in structure
> > > > > > > to
> > > > > > > for
> > > > > > > example
> > > > > > > the pagewalk helper in mm/pagewalk.c.
> > > > > > > 
> > > > > > > An alternative that could be tried as a driver-facing API
> > > > > > > is
> > > > > > > to
> > > > > > > provide
> > > > > > > a for_each_bo_in_lru_lock() macro where the driver open-
> > > > > > > codes
> > > > > > > "process_bo()" inside the for loop but I tried this and
> > > > > > > found
> > > > > > > it
> > > > > > > quite
> > > > > > > fragile since the driver might exit the loop without
> > > > > > > performing the
> > > > > > > necessary cleanup.
> > > > > > The point is that the shrinker should *never* need to have
> > > > > > context.
> > > > > > E.g.
> > > > > > a walker which allows going over multiple BOs for eviction
> > > > > > is
> > > > > > exactly
> > > > > > the wrong approach for that.
> > > > > > 
> > > > > > The shrinker should evict always only exactly one BO and
> > > > > > the
> > > > > > next
> > > > > > invocation of a shrinker should not depend on the result of
> > > > > > the
> > > > > > previous
> > > > > > one.
> > > > > > 
> > > > > > Or am I missing something vital here?
> > > > > A couple of things,
> > > > > 
> > > > > 1) I'd like to think of the middle-layer vs helper like the
> > > > > helper has
> > > > > its own ops, and can be used optionally from the driver.
> > > > > IIRC,
> > > > > the
> > > > > atomic modesetting / pageflip ops are implemented in exactly
> > > > > this
> > > > > way.
> > > > > 
> > > > > Sometimes a certain loop operation can't be easily or at
> > > > > least
> > > > > robustly
> > > > > implemented using a for_each_.. approach. Like for example
> > > > > mm/pagewalk.c. In this shrinking case I think it's probably
> > > > > possible
> > > > > using the scoped_guard() in cleanup.h. This way we could get
> > > > > rid
> > > > > of
> > > > > this middle layer discussion by turning the interface inside-
> > > > > out:
> > > > > 
> > > > > for_each_bo_on_lru_locked(xxx)
> > > > > 	driver_shrink();
> > > > > 
> > > > > But I do think the currently suggested approach is less
> > > > > fragile
> > > > > and
> > > > > prone to driver error.
> > > > > 
> > > > > FWIW in addition to the above examples, also drm_gem_lru_scan
> > > > > works
> > > > > like this.
> > > > a iteration state structure (like drm_connector_iter) plus then
> > > > a
> > > > macro
> > > > for the common case that uses cleanup.h should get the job
> > > > done.
> > > Yeah, completely agree. It basically boils down to which one
> > > needs to
> > > pack a state bag.
> > > 
> > > In a mid-layer design it's the driver or the caller of functions,
> > > e.g.
> > > the much hated init callback in the DRM layer was a perfect
> > > example
> > > of that.
> > > 
> > > In a non mid-layer approach it's the framework or the called
> > > function,
> > > examples are how the fence iterators in the dma_resv or the
> > > drm_connector, plane etc.. work.
> > > 
> > > One big difference between the two approach is who and where
> > > things
> > > like
> > > preparation and cleanup are done, e.g. who takes locks for
> > > example.
> > So what in your opinion is an acceptable solution here?
> > The "get next object to shrink" approach won't work, since it's
> > subject
> > to the old TTM swap problem now removed:
> > If shrinking fails we will get the same object upon subsequent
> > calls.
> 
> But and that is expected behavior. If shrinking fails just going to
> the 
> next object is invalid as far as I can see.
> 
> That's why I was so puzzled why you tried to apply the walker to the
> TTM 
> shrinker.
> 
> Or why exactly should shrinking fail?

A common example would be not having runtime pm and the particular bo
needs it to unbind, we want to try the next bo. Example: i915 GGTT
bound bos and Lunar Lake PL_TT bos.

And again, all other drm bo shrinkers do this. We just want to do the
same.

> 
> > If we bump LRU we could end up with infinite loops.
> > So IMO we need to be able to loop. I don't really care wether we do
> > this as an explicit loop or whether we use the LRU walker, but I
> > think
> > from a maintainability point-of-view it is better to keep LRU
> > walking
> > in a single place.
> > 
> > If we return an unlocked object, we'd need to refcount and drop the
> > lru
> > lock, but maybe that's not a bad thing.
> > 
> > But what's the main drawback of exporting the existing helper.
> 
> Well that we re-creates exactly the mid-layer mess I worked so hard
> to 
> remove from TTM.

It doesn't IMO. I agree the first attempt did. This affects only the
LRU iteration itself and I'm even fine to get rid of the callback using
a for_ macro.


> 
> > > > > 2) The shrinkers suggest to shrink a number of pages, not a
> > > > > single bo,
> > > > > again drm_gem_lru_scan works like this. i915 works like this.
> > > > > I
> > > > > think
> > > > > we should align with those.
> > > > Yeah that's how shrinkers work, so if we demidlayer then it
> > > > realls
> > > > needs
> > > > to be a loop in the driver, not a "here's the next bo to nuke"
> > > > I
> > > > think.
> > > Hui? Well that's not how I understand shrinkers.
> > > 
> > > The shrinker gives you the maximum number of objects to scan and
> > > not
> > > how
> > > many pages to free. In return you give the actual number of
> > > objects
> > > to
> > > scanned and pages freed.
> > > 
> > > As far as I know the number of objects are in the sense of SLUBs
> > > or
> > > rather different LRU lists.
> > > 
> > > So for BO shrinking we should probably only free or rather unpin
> > > a
> > > single BO per callback otherwise we mess up the fairness between
> > > shrinkers in the MM layer.
> > Hm. AFAICT all drm shrinkers use pages as objects, and the docs of
> > nr_to_scan says it's the number of objects to scan and try to
> > reclaim.
> > I think this strategy has had a fair amount of testing with the
> > i915
> > driver.
> > In any case, I don't think TTM should enforce a different way of
> > shrinking by the means of a severely restricted helper?
> 
> Well, as far as I can see that is exactly what TTM should do.
> 
> I mean the main advantage to make a common component is to enforce 
> correct behavior.

But if all other drivers don't agree this as correct behavior and
instead want to keep behavior that is proven to work, that's a dead
end.

/Thomas


> 
> Regards,
> Christian.
> 
> > 
> > /Thomas
> >
Thomas Hellström Aug. 21, 2024, 7:02 a.m. UTC | #12
Hi, Christian,

On Tue, 2024-08-20 at 17:45 +0200, Christian König wrote:
> Am 20.08.24 um 12:37 schrieb Thomas Hellström:

...

> 
> > I don't really care wether we do
> > this as an explicit loop or whether we use the LRU walker, but I
> > think
> > from a maintainability point-of-view it is better to keep LRU
> > walking
> > in a single place.
> > 
> > If we return an unlocked object, we'd need to refcount and drop the
> > lru
> > lock, but maybe that's not a bad thing.
> > 
> > But what's the main drawback of exporting the existing helper.
> 
> Well that we re-creates exactly the mid-layer mess I worked so hard
> to 
> remove from TTM.
> 
> > > > 

So could you please answer whether a solution with an iterator and a
loop macro to de-midlayer is an OK solution form your POW, or suggest
an alternative solution that you find acceptable that also allows bo
shrinking with LRU traversal similar to how it's done elsewhere in DRM?

Thanks,
Thomas
Christian König Aug. 21, 2024, 8:14 a.m. UTC | #13
Am 20.08.24 um 18:00 schrieb Thomas Hellström:
>> Or why exactly should shrinking fail?
> A common example would be not having runtime pm and the particular bo
> needs it to unbind, we want to try the next bo. Example: i915 GGTT
> bound bos and Lunar Lake PL_TT bos.

WHAT? So you basically block shrinking BOs because you can't unbind them 
because the device is powered down?

I would say that this is a serious NO-GO. It basically means that 
powered down devices can lock down system memory for undefined amount of 
time.

In other words an application can allocate memory, map it into GGTT and 
then suspend or even get killed and we are not able to recover the 
memory because there is no activity on the GPU any more?

That really sounds like a bug in the driver design to me.

> And again, all other drm bo shrinkers do this. We just want to do the
> same.

Do you have pointers?

>>> If we bump LRU we could end up with infinite loops.
>>> So IMO we need to be able to loop. I don't really care wether we do
>>> this as an explicit loop or whether we use the LRU walker, but I
>>> think
>>> from a maintainability point-of-view it is better to keep LRU
>>> walking
>>> in a single place.
>>>
>>> If we return an unlocked object, we'd need to refcount and drop the
>>> lru
>>> lock, but maybe that's not a bad thing.
>>>
>>> But what's the main drawback of exporting the existing helper.
>> Well that we re-creates exactly the mid-layer mess I worked so hard
>> to
>> remove from TTM.
> It doesn't IMO. I agree the first attempt did. This affects only the
> LRU iteration itself and I'm even fine to get rid of the callback using
> a for_ macro.

Well, I mean using a for_each approach is objectively better than having 
a callback and a state bag.

But the fundamental question is if drivers are allowed to reject 
shrinking. And I think the answer is no, they need to be designed in a 
way where shrinking is always possible.

What can be that we can't get the necessary locks to evict and object 
(because it's about to be used etc...), but that are the per-requisites 
TTM should be checking.

>>> In any case, I don't think TTM should enforce a different way of
>>> shrinking by the means of a severely restricted helper?
>> Well, as far as I can see that is exactly what TTM should do.
>>
>> I mean the main advantage to make a common component is to enforce
>> correct behavior.
> But if all other drivers don't agree this as correct behavior and
> instead want to keep behavior that is proven to work, that's a dead
> end.

Well no, even if all drivers agree to (for example) drop security 
precautions it's still not something acceptable.

And same thing here, if we block shrinking because drivers think they 
want their runtime PM implemented in a certain way then upstream needs 
to block this and push back.

As far as I can see it's mandatory to have shrinkers not depend on 
runtime PM, cause otherwise you run into resources handling which 
depends on the well behavior of userspace and that in turn in something 
we can't allow.

Regards,
Christian.

>
> /Thomas
>
>
>> Regards,
>> Christian.
>>
>>> /Thomas
>>>
Thomas Hellström Aug. 21, 2024, 8:57 a.m. UTC | #14
On Wed, 2024-08-21 at 10:14 +0200, Christian König wrote:
> Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > Or why exactly should shrinking fail?
> > A common example would be not having runtime pm and the particular
> > bo
> > needs it to unbind, we want to try the next bo. Example: i915 GGTT
> > bound bos and Lunar Lake PL_TT bos.
> 
> WHAT? So you basically block shrinking BOs because you can't unbind
> them 
> because the device is powered down?
> 
> I would say that this is a serious NO-GO. It basically means that 
> powered down devices can lock down system memory for undefined amount
> of 
> time.
> 
> In other words an application can allocate memory, map it into GGTT
> and 
> then suspend or even get killed and we are not able to recover the 
> memory because there is no activity on the GPU any more?
> 
> That really sounds like a bug in the driver design to me.

It's bad but it's not as bad as it sounds.

Problem is we can't wake up during direct reclaim IIRC due to runtime
pm lockdep violations, but we can and do fire up a thread to wake the
device and after the wakeup delay have subsequent shrink calls succeed,
or punt to kswapd or the oom handler.
I think that's an orthogonal discussion, though. There are other
reasons shrinking might fail, like the bo being busy in direct reclaim
(shouldn't wait for idle there but ok in kswapd), Other points of
failure is ofc shmem radix tree allocations (not seen one yet, though)
which might succeed with a smaller bo.
(Not saying, though, that there isn't more to be done with the xe
runtime pm implementation).

> 
> > And again, all other drm bo shrinkers do this. We just want to do
> > the
> > same.
> 
> Do you have pointers?

As Sima said, this is complicated but not beyond comprehension: i915
https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317

msm:
https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
which uses
https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
that is very similar in structure to what I implemented for TTM.

Panfrost: (although only purgeable objects AFAICT).
https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426


> 
> > > > If we bump LRU we could end up with infinite loops.
> > > > So IMO we need to be able to loop. I don't really care wether
> > > > we do
> > > > this as an explicit loop or whether we use the LRU walker, but
> > > > I
> > > > think
> > > > from a maintainability point-of-view it is better to keep LRU
> > > > walking
> > > > in a single place.
> > > > 
> > > > If we return an unlocked object, we'd need to refcount and drop
> > > > the
> > > > lru
> > > > lock, but maybe that's not a bad thing.
> > > > 
> > > > But what's the main drawback of exporting the existing helper.
> > > Well that we re-creates exactly the mid-layer mess I worked so
> > > hard
> > > to
> > > remove from TTM.
> > It doesn't IMO. I agree the first attempt did. This affects only
> > the
> > LRU iteration itself and I'm even fine to get rid of the callback
> > using
> > a for_ macro.
> 
> Well, I mean using a for_each approach is objectively better than
> having 
> a callback and a state bag.
> 
> But the fundamental question is if drivers are allowed to reject 
> shrinking. And I think the answer is no, they need to be designed in
> a 
> way where shrinking is always possible.

Rejects can be out of our control, due to anticipated deadlocks, oom
and deferring to kswapd.

> 
> What can be that we can't get the necessary locks to evict and object
> (because it's about to be used etc...), but that are the per-
> requisites 
> TTM should be checking.
> 
> > > > In any case, I don't think TTM should enforce a different way
> > > > of
> > > > shrinking by the means of a severely restricted helper?
> > > Well, as far as I can see that is exactly what TTM should do.
> > > 
> > > I mean the main advantage to make a common component is to
> > > enforce
> > > correct behavior.
> > But if all other drivers don't agree this as correct behavior and
> > instead want to keep behavior that is proven to work, that's a dead
> > end.
> 
> Well no, even if all drivers agree to (for example) drop security 
> precautions it's still not something acceptable.
> 
> And same thing here, if we block shrinking because drivers think they
> want their runtime PM implemented in a certain way then upstream
> needs 
> to block this and push back.
> 
> As far as I can see it's mandatory to have shrinkers not depend on 
> runtime PM, cause otherwise you run into resources handling which 
> depends on the well behavior of userspace and that in turn in
> something 
> we can't allow.

Please see the above explanation for runtime pm, and for the record I
agree that enforcing disallowed or security violations is a completely
valid thing.

/Thomas

> 
> Regards,
> Christian.
> 
> > 
> > /Thomas
> > 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > /Thomas
> > > >
Thomas Hellström Aug. 21, 2024, 9:31 a.m. UTC | #15
On Wed, 2024-08-21 at 10:57 +0200, Thomas Hellström wrote:
> On Wed, 2024-08-21 at 10:14 +0200, Christian König wrote:
> > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > Or why exactly should shrinking fail?
> > > A common example would be not having runtime pm and the
> > > particular
> > > bo
> > > needs it to unbind, we want to try the next bo. Example: i915
> > > GGTT
> > > bound bos and Lunar Lake PL_TT bos.
> > 
> > WHAT? So you basically block shrinking BOs because you can't unbind
> > them 
> > because the device is powered down?
> > 
> > I would say that this is a serious NO-GO. It basically means that 
> > powered down devices can lock down system memory for undefined
> > amount
> > of 
> > time.
> > 
> > In other words an application can allocate memory, map it into GGTT
> > and 
> > then suspend or even get killed and we are not able to recover the 
> > memory because there is no activity on the GPU any more?
> > 
> > That really sounds like a bug in the driver design to me.
> 
> It's bad but it's not as bad as it sounds.
> 
> Problem is we can't wake up during direct reclaim IIRC due to runtime
> pm lockdep violations, but we can and do fire up a thread to wake the
> device and after the wakeup delay have subsequent shrink calls
> succeed,
> or punt to kswapd or the oom handler.
> I think that's an orthogonal discussion, though. There are other
> reasons shrinking might fail, like the bo being busy in direct
> reclaim
> (shouldn't wait for idle there but ok in kswapd), Other points of
> failure is ofc shmem radix tree allocations (not seen one yet,
> though)
> which might succeed with a smaller bo.
> (Not saying, though, that there isn't more to be done with the xe
> runtime pm implementation).
> 
> > 
> > > And again, all other drm bo shrinkers do this. We just want to do
> > > the
> > > same.
> > 
> > Do you have pointers?
> 
> As Sima said, this is complicated but not beyond comprehension: i915
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> 
> msm:
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> which uses
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
> that is very similar in structure to what I implemented for TTM.
> 
> Panfrost: (although only purgeable objects AFAICT).
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
> 
> 
> > 
> > > > > If we bump LRU we could end up with infinite loops.
> > > > > So IMO we need to be able to loop. I don't really care wether
> > > > > we do
> > > > > this as an explicit loop or whether we use the LRU walker,
> > > > > but
> > > > > I
> > > > > think
> > > > > from a maintainability point-of-view it is better to keep LRU
> > > > > walking
> > > > > in a single place.
> > > > > 
> > > > > If we return an unlocked object, we'd need to refcount and
> > > > > drop
> > > > > the
> > > > > lru
> > > > > lock, but maybe that's not a bad thing.
> > > > > 
> > > > > But what's the main drawback of exporting the existing
> > > > > helper.
> > > > Well that we re-creates exactly the mid-layer mess I worked so
> > > > hard
> > > > to
> > > > remove from TTM.
> > > It doesn't IMO. I agree the first attempt did. This affects only
> > > the
> > > LRU iteration itself and I'm even fine to get rid of the callback
> > > using
> > > a for_ macro.
> > 
> > Well, I mean using a for_each approach is objectively better than
> > having 
> > a callback and a state bag.
> > 
> > But the fundamental question is if drivers are allowed to reject 
> > shrinking. And I think the answer is no, they need to be designed
> > in
> > a 
> > way where shrinking is always possible.
> 
> Rejects can be out of our control, due to anticipated deadlocks, oom
> and deferring to kswapd.
> 
> > 
> > What can be that we can't get the necessary locks to evict and
> > object
> > (because it's about to be used etc...), but that are the per-
> > requisites 
> > TTM should be checking.
> > 
> > > > > In any case, I don't think TTM should enforce a different way
> > > > > of
> > > > > shrinking by the means of a severely restricted helper?
> > > > Well, as far as I can see that is exactly what TTM should do.
> > > > 
> > > > I mean the main advantage to make a common component is to
> > > > enforce
> > > > correct behavior.
> > > But if all other drivers don't agree this as correct behavior and
> > > instead want to keep behavior that is proven to work, that's a
> > > dead
> > > end.
> > 
> > Well no, even if all drivers agree to (for example) drop security 
> > precautions it's still not something acceptable.
> > 
> > And same thing here, if we block shrinking because drivers think
> > they
> > want their runtime PM implemented in a certain way then upstream
> > needs 
> > to block this and push back.
> > 
> > As far as I can see it's mandatory to have shrinkers not depend on 
> > runtime PM, cause otherwise you run into resources handling which 
> > depends on the well behavior of userspace and that in turn in
> > something 
> > we can't allow.
> 
> Please see the above explanation for runtime pm, and for the record I
> agree that enforcing disallowed or security violations is a
> completely
> valid thing.

Meant to say enforcing disallowing security violations..

Another thing that came to my mind is ofc swap_space. Depending on how
backup is done if we're out of swap space we can only shrink purgeable
WONTNEED objects (user-space pools typically), and TTM has no knowledge
of those (currently at least).

/Thomas


> 
> /Thomas
> 
> > 
> > Regards,
> > Christian.
> > 
> > > 
> > > /Thomas
> > > 
> > > 
> > > > Regards,
> > > > Christian.
> > > > 
> > > > > /Thomas
> > > > > 
>
Christian König Aug. 21, 2024, 9:48 a.m. UTC | #16
Am 21.08.24 um 10:57 schrieb Thomas Hellström:
> On Wed, 2024-08-21 at 10:14 +0200, Christian König wrote:
>> Am 20.08.24 um 18:00 schrieb Thomas Hellström:
>>>> Or why exactly should shrinking fail?
>>> A common example would be not having runtime pm and the particular
>>> bo
>>> needs it to unbind, we want to try the next bo. Example: i915 GGTT
>>> bound bos and Lunar Lake PL_TT bos.
>> WHAT? So you basically block shrinking BOs because you can't unbind
>> them
>> because the device is powered down?
>>
>> I would say that this is a serious NO-GO. It basically means that
>> powered down devices can lock down system memory for undefined amount
>> of
>> time.
>>
>> In other words an application can allocate memory, map it into GGTT
>> and
>> then suspend or even get killed and we are not able to recover the
>> memory because there is no activity on the GPU any more?
>>
>> That really sounds like a bug in the driver design to me.
> It's bad but it's not as bad as it sounds.
>
> Problem is we can't wake up during direct reclaim IIRC due to runtime
> pm lockdep violations, but we can and do fire up a thread to wake the
> device and after the wakeup delay have subsequent shrink calls succeed,
> or punt to kswapd or the oom handler.

Yeah that is obvious. The runtime PM is an interface designed to be used 
from a very high level IOCTL/system call.

And delegating that from a shrinker to a worker is not valid as far as I 
can see, instead of reducing the memory pressure the shrinker would then 
increase it.

> I think that's an orthogonal discussion, though. There are other
> reasons shrinking might fail, like the bo being busy in direct reclaim
> (shouldn't wait for idle there but ok in kswapd), Other points of
> failure is ofc shmem radix tree allocations (not seen one yet, though)
> which might succeed with a smaller bo.
> (Not saying, though, that there isn't more to be done with the xe
> runtime pm implementation).

I don't think that argumentation is valid.

When a BO is locked then that it is ok to not shrink it, but TTM should 
be able to determine all those prerequisites.

In other words the idea of a function returning a BO to the driver is 
that the driver is obligated to shrink that one.

That other necessary allocation can fail like shmen for example is 
obvious as well, but that's why we discussed to allow shrinking BOs 
partially as well.

And I really don't think this discussion is orthogonal. We are basically 
discussing what drivers should do and not should do. And as far as I can 
see the requirement to expose the LRUs to drivers comes up only because 
the driver wants to do something it shouldn't.

>>> And again, all other drm bo shrinkers do this. We just want to do
>>> the
>>> same.
>> Do you have pointers?
> As Sima said, this is complicated but not beyond comprehension: i915
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317

As far as I can tell what i915 does here is extremely questionable.

     if (sc->nr_scanned < sc->nr_to_scan && current_is_kswapd()) {
....
         with_intel_runtime_pm(&i915->runtime_pm, wakeref) {

with_intel_runtime_pm() then calls pm_runtime_get_sync().

So basically the i915 shrinker assumes that when called from kswapd() 
that it can synchronously wait for runtime PM to power up the device again.

As far as I can tell that means that a device driver makes strong and 
completely undocumented assumptions how kswapd works internally.

> msm:
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> which uses
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
> that is very similar in structure to what I implemented for TTM.
>
> Panfrost: (although only purgeable objects AFAICT).
> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426

 From skimming over the source only MSM actually seems to use this and 
the criteria for rejecting shrinking is everything TTM should know, e.g. 
if a BO is pinned, idle etc...

>>>>> If we bump LRU we could end up with infinite loops.
>>>>> So IMO we need to be able to loop. I don't really care wether
>>>>> we do
>>>>> this as an explicit loop or whether we use the LRU walker, but
>>>>> I
>>>>> think
>>>>> from a maintainability point-of-view it is better to keep LRU
>>>>> walking
>>>>> in a single place.
>>>>>
>>>>> If we return an unlocked object, we'd need to refcount and drop
>>>>> the
>>>>> lru
>>>>> lock, but maybe that's not a bad thing.
>>>>>
>>>>> But what's the main drawback of exporting the existing helper.
>>>> Well that we re-creates exactly the mid-layer mess I worked so
>>>> hard
>>>> to
>>>> remove from TTM.
>>> It doesn't IMO. I agree the first attempt did. This affects only
>>> the
>>> LRU iteration itself and I'm even fine to get rid of the callback
>>> using
>>> a for_ macro.
>> Well, I mean using a for_each approach is objectively better than
>> having
>> a callback and a state bag.
>>
>> But the fundamental question is if drivers are allowed to reject
>> shrinking. And I think the answer is no, they need to be designed in
>> a
>> way where shrinking is always possible.
> Rejects can be out of our control, due to anticipated deadlocks, oom
> and deferring to kswapd.
>
>> What can be that we can't get the necessary locks to evict and object
>> (because it's about to be used etc...), but that are the per-
>> requisites
>> TTM should be checking.
>>
>>>>> In any case, I don't think TTM should enforce a different way
>>>>> of
>>>>> shrinking by the means of a severely restricted helper?
>>>> Well, as far as I can see that is exactly what TTM should do.
>>>>
>>>> I mean the main advantage to make a common component is to
>>>> enforce
>>>> correct behavior.
>>> But if all other drivers don't agree this as correct behavior and
>>> instead want to keep behavior that is proven to work, that's a dead
>>> end.
>> Well no, even if all drivers agree to (for example) drop security
>> precautions it's still not something acceptable.
>>
>> And same thing here, if we block shrinking because drivers think they
>> want their runtime PM implemented in a certain way then upstream
>> needs
>> to block this and push back.
>>
>> As far as I can see it's mandatory to have shrinkers not depend on
>> runtime PM, cause otherwise you run into resources handling which
>> depends on the well behavior of userspace and that in turn in
>> something
>> we can't allow.
> Please see the above explanation for runtime pm, and for the record I
> agree that enforcing disallowed or security violations is a completely
> valid thing.

Putting the TTM issue aside as far as I can tell what i915 is extremely 
questionable and doing the same thing in XE is most likely not something 
we should allow.

Regards,
Christian.

>
> /Thomas
>
>> Regards,
>> Christian.
>>
>>> /Thomas
>>>
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> /Thomas
>>>>>
Thomas Hellström Aug. 21, 2024, noon UTC | #17
Hi,

On Wed, 2024-08-21 at 11:48 +0200, Christian König wrote:
> Am 21.08.24 um 10:57 schrieb Thomas Hellström:
> > On Wed, 2024-08-21 at 10:14 +0200, Christian König wrote:
> > > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > > Or why exactly should shrinking fail?
> > > > A common example would be not having runtime pm and the
> > > > particular
> > > > bo
> > > > needs it to unbind, we want to try the next bo. Example: i915
> > > > GGTT
> > > > bound bos and Lunar Lake PL_TT bos.
> > > WHAT? So you basically block shrinking BOs because you can't
> > > unbind
> > > them
> > > because the device is powered down?
> > > 
> > > I would say that this is a serious NO-GO. It basically means that
> > > powered down devices can lock down system memory for undefined
> > > amount
> > > of
> > > time.
> > > 
> > > In other words an application can allocate memory, map it into
> > > GGTT
> > > and
> > > then suspend or even get killed and we are not able to recover
> > > the
> > > memory because there is no activity on the GPU any more?
> > > 
> > > That really sounds like a bug in the driver design to me.
> > It's bad but it's not as bad as it sounds.
> > 
> > Problem is we can't wake up during direct reclaim IIRC due to
> > runtime
> > pm lockdep violations, but we can and do fire up a thread to wake
> > the
> > device and after the wakeup delay have subsequent shrink calls
> > succeed,
> > or punt to kswapd or the oom handler.
> 
> Yeah that is obvious. The runtime PM is an interface designed to be
> used 
> from a very high level IOCTL/system call.
> 
> And delegating that from a shrinker to a worker is not valid as far
> as I 
> can see, instead of reducing the memory pressure the shrinker would
> then 
> increase it.

Perhaps an ignorant question, but aren't IO devices potentially woken
up for swapout at memory pressure time using runtime PM, thereby
increasing memory pressure in a similar way?

> 
> > I think that's an orthogonal discussion, though. There are other
> > reasons shrinking might fail, like the bo being busy in direct
> > reclaim
> > (shouldn't wait for idle there but ok in kswapd), Other points of
> > failure is ofc shmem radix tree allocations (not seen one yet,
> > though)
> > which might succeed with a smaller bo.
> > (Not saying, though, that there isn't more to be done with the xe
> > runtime pm implementation).
> 
> I don't think that argumentation is valid.
> 
> When a BO is locked then that it is ok to not shrink it, but TTM
> should 
> be able to determine all those prerequisites.
> 
> In other words the idea of a function returning a BO to the driver is
> that the driver is obligated to shrink that one.

But that doesn't take potential driver deadlock avoidance into account,
so it would restrict the driver shrinkers by assuming that all deadlock
avoidance needed is known by TTM.

And since small memory allocations are allowed even at reclaim time to
be able so reclaim or shrink even more memory, IMO the pm resume
problem reduces into a deadlock avoidance problem where we use
pm_tryget similar to bo_trylock. Important thing, though, in both
cases, that the device is woken or similarly the lock is released in
reasonable time for the reclaim to retry and succeed.

> 
> That other necessary allocation can fail like shmen for example is 
> obvious as well, but that's why we discussed to allow shrinking BOs 
> partially as well.

Good point, that eliminates that problem.

> 
> And I really don't think this discussion is orthogonal. We are
> basically 
> discussing what drivers should do and not should do. And as far as I
> can 
> see the requirement to expose the LRUs to drivers comes up only
> because 
> the driver wants to do something it shouldn't.

Currently we have the purgeable vs non-purgeable and driver-level
trylock-type deadlock avoidance not falling into those categories. And
I'd like to categorize runtime pm as a trylock-type deadlock avoidance.

bo trylock and idle could, as you say, be handled in the TTM helper, so
can purgeable and non-purgeable at lack of swap-space, provided that we
provide such a flag in the ttm_bo ofc?

> 
> > > > And again, all other drm bo shrinkers do this. We just want to
> > > > do
> > > > the
> > > > same.
> > > Do you have pointers?
> > As Sima said, this is complicated but not beyond comprehension:
> > i915
> > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> 
> As far as I can tell what i915 does here is extremely questionable.
> 
>      if (sc->nr_scanned < sc->nr_to_scan && current_is_kswapd()) {
> ....
>          with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
> 
> with_intel_runtime_pm() then calls pm_runtime_get_sync().
> 
> So basically the i915 shrinker assumes that when called from kswapd()
> that it can synchronously wait for runtime PM to power up the device
> again.
> 
> As far as I can tell that means that a device driver makes strong and
> completely undocumented assumptions how kswapd works internally.

Admittedly that looks weird

But I'd really expect a reclaim lockdep splat to happen there if the
i915 pm did something not-allowed. IIRC, the design direction the i915
people got from mm people regarding the shrinkers was to avoid any
sleeps in direct reclaim and punt it to kswapd. Need to ask i915 people
how they can get away with that.

> 
> > msm:
> > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> > which uses
> > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
> > that is very similar in structure to what I implemented for TTM.
> > 
> > Panfrost: (although only purgeable objects AFAICT).
> > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/drm_gem.c#L1426
> 
>  From skimming over the source only MSM actually seems to use this
> and 
> the criteria for rejecting shrinking is everything TTM should know,
> e.g. 
> if a BO is pinned, idle etc...
> 
> > > > > > If we bump LRU we could end up with infinite loops.
> > > > > > So IMO we need to be able to loop. I don't really care
> > > > > > wether
> > > > > > we do
> > > > > > this as an explicit loop or whether we use the LRU walker,
> > > > > > but
> > > > > > I
> > > > > > think
> > > > > > from a maintainability point-of-view it is better to keep
> > > > > > LRU
> > > > > > walking
> > > > > > in a single place.
> > > > > > 
> > > > > > If we return an unlocked object, we'd need to refcount and
> > > > > > drop
> > > > > > the
> > > > > > lru
> > > > > > lock, but maybe that's not a bad thing.
> > > > > > 
> > > > > > But what's the main drawback of exporting the existing
> > > > > > helper.
> > > > > Well that we re-creates exactly the mid-layer mess I worked
> > > > > so
> > > > > hard
> > > > > to
> > > > > remove from TTM.
> > > > It doesn't IMO. I agree the first attempt did. This affects
> > > > only
> > > > the
> > > > LRU iteration itself and I'm even fine to get rid of the
> > > > callback
> > > > using
> > > > a for_ macro.
> > > Well, I mean using a for_each approach is objectively better than
> > > having
> > > a callback and a state bag.
> > > 
> > > But the fundamental question is if drivers are allowed to reject
> > > shrinking. And I think the answer is no, they need to be designed
> > > in
> > > a
> > > way where shrinking is always possible.
> > Rejects can be out of our control, due to anticipated deadlocks,
> > oom
> > and deferring to kswapd.
> > 
> > > What can be that we can't get the necessary locks to evict and
> > > object
> > > (because it's about to be used etc...), but that are the per-
> > > requisites
> > > TTM should be checking.
> > > 
> > > > > > In any case, I don't think TTM should enforce a different
> > > > > > way
> > > > > > of
> > > > > > shrinking by the means of a severely restricted helper?
> > > > > Well, as far as I can see that is exactly what TTM should do.
> > > > > 
> > > > > I mean the main advantage to make a common component is to
> > > > > enforce
> > > > > correct behavior.
> > > > But if all other drivers don't agree this as correct behavior
> > > > and
> > > > instead want to keep behavior that is proven to work, that's a
> > > > dead
> > > > end.
> > > Well no, even if all drivers agree to (for example) drop security
> > > precautions it's still not something acceptable.
> > > 
> > > And same thing here, if we block shrinking because drivers think
> > > they
> > > want their runtime PM implemented in a certain way then upstream
> > > needs
> > > to block this and push back.
> > > 
> > > As far as I can see it's mandatory to have shrinkers not depend
> > > on
> > > runtime PM, cause otherwise you run into resources handling which
> > > depends on the well behavior of userspace and that in turn in
> > > something
> > > we can't allow.
> > Please see the above explanation for runtime pm, and for the record
> > I
> > agree that enforcing disallowed or security violations is a
> > completely
> > valid thing.
> 
> Putting the TTM issue aside as far as I can tell what i915 is
> extremely 
> questionable and doing the same thing in XE is most likely not
> something 
> we should allow.

Xe is currently not doing the same but relying on a separate thread
rather than kswapd to wake the device.

Thanks,
Thomas


> 
> Regards,
> Christian.
> 
> > 
> > /Thomas
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > /Thomas
> > > > 
> > > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > /Thomas
> > > > > >
Thomas Hellström Aug. 22, 2024, 6:47 a.m. UTC | #18
On Wed, 2024-08-21 at 14:00 +0200, Thomas Hellström wrote:
> Hi,
> 
> On Wed, 2024-08-21 at 11:48 +0200, Christian König wrote:
> > Am 21.08.24 um 10:57 schrieb Thomas Hellström:
> > > On Wed, 2024-08-21 at 10:14 +0200, Christian König wrote:
> > > > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > > > Or why exactly should shrinking fail?
> > > > > A common example would be not having runtime pm and the
> > > > > particular
> > > > > bo
> > > > > needs it to unbind, we want to try the next bo. Example: i915
> > > > > GGTT
> > > > > bound bos and Lunar Lake PL_TT bos.
> > > > WHAT? So you basically block shrinking BOs because you can't
> > > > unbind
> > > > them
> > > > because the device is powered down?
> > > > 
> > > > I would say that this is a serious NO-GO. It basically means
> > > > that
> > > > powered down devices can lock down system memory for undefined
> > > > amount
> > > > of
> > > > time.
> > > > 
> > > > In other words an application can allocate memory, map it into
> > > > GGTT
> > > > and
> > > > then suspend or even get killed and we are not able to recover
> > > > the
> > > > memory because there is no activity on the GPU any more?
> > > > 
> > > > That really sounds like a bug in the driver design to me.
> > > It's bad but it's not as bad as it sounds.
> > > 
> > > Problem is we can't wake up during direct reclaim IIRC due to
> > > runtime
> > > pm lockdep violations, but we can and do fire up a thread to wake
> > > the
> > > device and after the wakeup delay have subsequent shrink calls
> > > succeed,
> > > or punt to kswapd or the oom handler.
> > 
> > Yeah that is obvious. The runtime PM is an interface designed to be
> > used 
> > from a very high level IOCTL/system call.
> > 
> > And delegating that from a shrinker to a worker is not valid as far
> > as I 
> > can see, instead of reducing the memory pressure the shrinker would
> > then 
> > increase it.
> 
> Perhaps an ignorant question, but aren't IO devices potentially woken
> up for swapout at memory pressure time using runtime PM, thereby
> increasing memory pressure in a similar way?
> 
> > 
> > > I think that's an orthogonal discussion, though. There are other
> > > reasons shrinking might fail, like the bo being busy in direct
> > > reclaim
> > > (shouldn't wait for idle there but ok in kswapd), Other points of
> > > failure is ofc shmem radix tree allocations (not seen one yet,
> > > though)
> > > which might succeed with a smaller bo.
> > > (Not saying, though, that there isn't more to be done with the xe
> > > runtime pm implementation).
> > 
> > I don't think that argumentation is valid.
> > 
> > When a BO is locked then that it is ok to not shrink it, but TTM
> > should 
> > be able to determine all those prerequisites.
> > 
> > In other words the idea of a function returning a BO to the driver
> > is
> > that the driver is obligated to shrink that one.
> 
> But that doesn't take potential driver deadlock avoidance into
> account,
> so it would restrict the driver shrinkers by assuming that all
> deadlock
> avoidance needed is known by TTM.
> 
> And since small memory allocations are allowed even at reclaim time
> to
> be able so reclaim or shrink even more memory, IMO the pm resume
> problem reduces into a deadlock avoidance problem where we use
> pm_tryget similar to bo_trylock. Important thing, though, in both
> cases, that the device is woken or similarly the lock is released in
> reasonable time for the reclaim to retry and succeed.
> 
> > 
> > That other necessary allocation can fail like shmen for example is 
> > obvious as well, but that's why we discussed to allow shrinking BOs
> > partially as well.
> 
> Good point, that eliminates that problem.
> 
> > 
> > And I really don't think this discussion is orthogonal. We are
> > basically 
> > discussing what drivers should do and not should do. And as far as
> > I
> > can 
> > see the requirement to expose the LRUs to drivers comes up only
> > because 
> > the driver wants to do something it shouldn't.
> 
> Currently we have the purgeable vs non-purgeable and driver-level
> trylock-type deadlock avoidance not falling into those categories.
> And
> I'd like to categorize runtime pm as a trylock-type deadlock
> avoidance.
> 
> bo trylock and idle could, as you say, be handled in the TTM helper,
> so
> can purgeable and non-purgeable at lack of swap-space, provided that
> we
> provide such a flag in the ttm_bo ofc?
> 
> > 
> > > > > And again, all other drm bo shrinkers do this. We just want
> > > > > to
> > > > > do
> > > > > the
> > > > > same.
> > > > Do you have pointers?
> > > As Sima said, this is complicated but not beyond comprehension:
> > > i915
> > > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> > 
> > As far as I can tell what i915 does here is extremely questionable.
> > 
> >      if (sc->nr_scanned < sc->nr_to_scan && current_is_kswapd()) {
> > ....
> >          with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
> > 
> > with_intel_runtime_pm() then calls pm_runtime_get_sync().
> > 
> > So basically the i915 shrinker assumes that when called from
> > kswapd()
> > that it can synchronously wait for runtime PM to power up the
> > device
> > again.
> > 
> > As far as I can tell that means that a device driver makes strong
> > and
> > completely undocumented assumptions how kswapd works internally.
> 
> Admittedly that looks weird
> 
> But I'd really expect a reclaim lockdep splat to happen there if the
> i915 pm did something not-allowed. IIRC, the design direction the
> i915
> people got from mm people regarding the shrinkers was to avoid any
> sleeps in direct reclaim and punt it to kswapd. Need to ask i915
> people
> how they can get away with that.
> 
> 

So it turns out that Xe integrated pm resume is reclaim-safe, and I'd
expect i915's to be as well. Xe discrete pm resume isn't.

So that means that, at least for integrated, the i915 shrinker should
be ok from that POW, and punting certain bos to kswapd is not AFAICT
abusing any undocumented features of kswapd but rather a way to avoid
resuming the device during direct reclaim, like documented.

/Thomas
Christian König Aug. 22, 2024, 7:55 a.m. UTC | #19
Am 22.08.24 um 08:47 schrieb Thomas Hellström:
>>>> As Sima said, this is complicated but not beyond comprehension:
>>>> i915
>>>> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
>>> As far as I can tell what i915 does here is extremely questionable.
>>>
>>>       if (sc->nr_scanned < sc->nr_to_scan && current_is_kswapd()) {
>>> ....
>>>           with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
>>>
>>> with_intel_runtime_pm() then calls pm_runtime_get_sync().
>>>
>>> So basically the i915 shrinker assumes that when called from
>>> kswapd()
>>> that it can synchronously wait for runtime PM to power up the
>>> device
>>> again.
>>>
>>> As far as I can tell that means that a device driver makes strong
>>> and
>>> completely undocumented assumptions how kswapd works internally.
>> Admittedly that looks weird
>>
>> But I'd really expect a reclaim lockdep splat to happen there if the
>> i915 pm did something not-allowed. IIRC, the design direction the
>> i915
>> people got from mm people regarding the shrinkers was to avoid any
>> sleeps in direct reclaim and punt it to kswapd. Need to ask i915
>> people
>> how they can get away with that.
>>
>>
> So it turns out that Xe integrated pm resume is reclaim-safe, and I'd
> expect i915's to be as well. Xe discrete pm resume isn't.
>
> So that means that, at least for integrated, the i915 shrinker should
> be ok from that POW, and punting certain bos to kswapd is not AFAICT
> abusing any undocumented features of kswapd but rather a way to avoid
> resuming the device during direct reclaim, like documented.

The more I think about this the more I disagree to this driver design. 
In my opinion device drivers should *never* resume runtime PM in a 
shrinker callback in the first place.

When the device is turned off it means that all of it's operations are 
stopped and eventually power to caches etc turned off as well. So I 
don't see any ongoing writeback operations or similar either.

So the question is why do we need to power on the device in a shrinker 
in the first place?

What could be is that the device needs to flush GART TLBs or similar 
when it is turned on, e.g. that you grab a PM reference to make sure 
that during your HW operation the device doesn't suspend.

But that doesn't mean that you should resume the device. In other words 
when the device is powered down you shouldn't power it up again.

And for GART we already have the necessary move callback implemented in 
TTM. This is done by radeon, amdgpu and nouveu in a common way as far as 
I can see.

So why should Xe be special and follow the very questionable approach of 
i915 here?

Regards,
Christian.


>
> /Thomas
>
>
>
Thomas Hellström Aug. 22, 2024, 8:21 a.m. UTC | #20
On Thu, 2024-08-22 at 09:55 +0200, Christian König wrote:
> Am 22.08.24 um 08:47 schrieb Thomas Hellström:
> > > > > As Sima said, this is complicated but not beyond
> > > > > comprehension:
> > > > > i915
> > > > > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> > > > As far as I can tell what i915 does here is extremely
> > > > questionable.
> > > > 
> > > >       if (sc->nr_scanned < sc->nr_to_scan &&
> > > > current_is_kswapd()) {
> > > > ....
> > > >           with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
> > > > 
> > > > with_intel_runtime_pm() then calls pm_runtime_get_sync().
> > > > 
> > > > So basically the i915 shrinker assumes that when called from
> > > > kswapd()
> > > > that it can synchronously wait for runtime PM to power up the
> > > > device
> > > > again.
> > > > 
> > > > As far as I can tell that means that a device driver makes
> > > > strong
> > > > and
> > > > completely undocumented assumptions how kswapd works
> > > > internally.
> > > Admittedly that looks weird
> > > 
> > > But I'd really expect a reclaim lockdep splat to happen there if
> > > the
> > > i915 pm did something not-allowed. IIRC, the design direction the
> > > i915
> > > people got from mm people regarding the shrinkers was to avoid
> > > any
> > > sleeps in direct reclaim and punt it to kswapd. Need to ask i915
> > > people
> > > how they can get away with that.
> > > 
> > > 
> > So it turns out that Xe integrated pm resume is reclaim-safe, and
> > I'd
> > expect i915's to be as well. Xe discrete pm resume isn't.
> > 
> > So that means that, at least for integrated, the i915 shrinker
> > should
> > be ok from that POW, and punting certain bos to kswapd is not
> > AFAICT
> > abusing any undocumented features of kswapd but rather a way to
> > avoid
> > resuming the device during direct reclaim, like documented.
> 
> The more I think about this the more I disagree to this driver
> design. 
> In my opinion device drivers should *never* resume runtime PM in a 
> shrinker callback in the first place.

Runtime PM resume is allowed even from irq context if carefully
implemented by the driver and flagged as such to the core. 

https://docs.kernel.org/power/runtime_pm.html

Resuming runtime PM from reclaim therefore shouldn't be an issue IMO,
and really up to the driver. 

> 
> When the device is turned off it means that all of it's operations
> are 
> stopped and eventually power to caches etc turned off as well. So I 
> don't see any ongoing writeback operations or similar either.
> 
> So the question is why do we need to power on the device in a
> shrinker 
> in the first place?
> 
> What could be is that the device needs to flush GART TLBs or similar 
> when it is turned on, e.g. that you grab a PM reference to make sure 
> that during your HW operation the device doesn't suspend.

Exactly why the i915 needs to flush the GART I'm not sure of but I
suspect the gart TLB might be forgotten while suspended.

> 
> But that doesn't mean that you should resume the device. In other
> words 
> when the device is powered down you shouldn't power it up again.
> 
> And for GART we already have the necessary move callback implemented
> in 
> TTM. This is done by radeon, amdgpu and nouveu in a common way as far
> as 
> I can see.
> 
> So why should Xe be special and follow the very questionable approach
> of 
> i915 here?

For Xe, Lunar Lake (integrated) has the interesting design that each bo
carries compression metadata that needs to be blitted to system pages
during shrinking. The alternative is to resolve all buffer objects at
device runtime suspend...

But runtime PM aside, with a one-bo only approach we still have the
drawbacks that it 

* eliminates possibility for driver deadlock avoidance
* Requires TTM knowledge of "purgeable" bos
* Requires an additional LRU to avoid O(n2) traversal of already
shrunken objects
* Drivers with legitimate shrinker designs that don't fit in the TTM-
enforced model will have frustrated maintainers.

Thanks,
Thomas


> 
> Regards,
> Christian.
> 
> 
> > 
> > /Thomas
> > 
> > 
> >
Thomas Hellström Aug. 22, 2024, 8:36 a.m. UTC | #21
On Thu, 2024-08-22 at 10:21 +0200, Thomas Hellström wrote:
> On Thu, 2024-08-22 at 09:55 +0200, Christian König wrote:
> > Am 22.08.24 um 08:47 schrieb Thomas Hellström:
> > > > > > As Sima said, this is complicated but not beyond
> > > > > > comprehension:
> > > > > > i915
> > > > > > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> > > > > As far as I can tell what i915 does here is extremely
> > > > > questionable.
> > > > > 
> > > > >       if (sc->nr_scanned < sc->nr_to_scan &&
> > > > > current_is_kswapd()) {
> > > > > ....
> > > > >           with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
> > > > > 
> > > > > with_intel_runtime_pm() then calls pm_runtime_get_sync().
> > > > > 
> > > > > So basically the i915 shrinker assumes that when called from
> > > > > kswapd()
> > > > > that it can synchronously wait for runtime PM to power up the
> > > > > device
> > > > > again.
> > > > > 
> > > > > As far as I can tell that means that a device driver makes
> > > > > strong
> > > > > and
> > > > > completely undocumented assumptions how kswapd works
> > > > > internally.
> > > > Admittedly that looks weird
> > > > 
> > > > But I'd really expect a reclaim lockdep splat to happen there
> > > > if
> > > > the
> > > > i915 pm did something not-allowed. IIRC, the design direction
> > > > the
> > > > i915
> > > > people got from mm people regarding the shrinkers was to avoid
> > > > any
> > > > sleeps in direct reclaim and punt it to kswapd. Need to ask
> > > > i915
> > > > people
> > > > how they can get away with that.
> > > > 
> > > > 
> > > So it turns out that Xe integrated pm resume is reclaim-safe, and
> > > I'd
> > > expect i915's to be as well. Xe discrete pm resume isn't.
> > > 
> > > So that means that, at least for integrated, the i915 shrinker
> > > should
> > > be ok from that POW, and punting certain bos to kswapd is not
> > > AFAICT
> > > abusing any undocumented features of kswapd but rather a way to
> > > avoid
> > > resuming the device during direct reclaim, like documented.
> > 
> > The more I think about this the more I disagree to this driver
> > design. 
> > In my opinion device drivers should *never* resume runtime PM in a 
> > shrinker callback in the first place.
> 
> Runtime PM resume is allowed even from irq context if carefully
> implemented by the driver and flagged as such to the core. 
> 
> https://docs.kernel.org/power/runtime_pm.html
> 
> Resuming runtime PM from reclaim therefore shouldn't be an issue IMO,
> and really up to the driver. 
> 
> > 
> > When the device is turned off it means that all of it's operations
> > are 
> > stopped and eventually power to caches etc turned off as well. So I
> > don't see any ongoing writeback operations or similar either.
> > 
> > So the question is why do we need to power on the device in a
> > shrinker 
> > in the first place?
> > 
> > What could be is that the device needs to flush GART TLBs or
> > similar 
> > when it is turned on, e.g. that you grab a PM reference to make
> > sure 
> > that during your HW operation the device doesn't suspend.
> 
> Exactly why the i915 needs to flush the GART I'm not sure of but I
> suspect the gart TLB might be forgotten while suspended.

To be clear, the intel GGTTs are programmed using device mmio.
And while i915 is a data point, there's no immediate plans to convert
it to using the TTM page pool AFAIK, so for the sake of discussion
let's focus on the Xe usage.

/Thomas
Daniel Vetter Aug. 22, 2024, 9:23 a.m. UTC | #22
On Wed, Aug 21, 2024 at 10:14:34AM +0200, Christian König wrote:
> Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > Or why exactly should shrinking fail?
> > A common example would be not having runtime pm and the particular bo
> > needs it to unbind, we want to try the next bo. Example: i915 GGTT
> > bound bos and Lunar Lake PL_TT bos.
> 
> WHAT? So you basically block shrinking BOs because you can't unbind them
> because the device is powered down?

Yes. amdgpu does the same btw :-)

It's a fairly fundamental issue of rpm on discrete gpus, or anything that
looks a lot like a discrete gpu. The deadlock scenario is roughly:

- In runtime suspend you need to copy any bo out of vram into system ram
  before you power the gpu. This requires bo and ttm locks.

- You can't just avoid this by holding an rpm reference as long as any bo
  is still in vram, because that defacto means you'll never autosuspend at
  runtime. Plus most real hw is complex enough that you have some driver
  objects that you need to throw out or recreate, so in practice no way to
  avoid all this.

- runtime resume tends to be easier and mostly doable without taking bo
  and ttm locks, because by design you know no one else can possibly have
  any need to get at the gpu hw - it was all powered off after all. It's
  still messy, but doable.

- Unfortunately this doesn't help, because your runtime resume might need
  to wait for a in-progress suspend operation to complete. Which means you
  still deadlock even if your resume path has entirely reasonable locking.

On integrated you can mostly avoid this all because there's no need to
swap out bo to system memory, they're there already. Exceptions like the
busted coherency stuff on LNL aside.

But on discrete it's just suck.

TTM discrete gpu drivers avoided all that by simply not having a shrinker
where you need to runtime pm get, instead all runtime pm gets are outmost,
without holding any ttm or bo locks.

> I would say that this is a serious NO-GO. It basically means that powered
> down devices can lock down system memory for undefined amount of time.
> 
> In other words an application can allocate memory, map it into GGTT and then
> suspend or even get killed and we are not able to recover the memory because
> there is no activity on the GPU any more?
> 
> That really sounds like a bug in the driver design to me.

It's a bug in the runtime pm core imo. I think interim what Thomas laid
out is the best solution, since in practice when the gpu is off you really
shouldn't need to wake it up. Except when you're unlucky and racing a
runtime suspend against a shrinker activity (like runtime suspend throws a
bo into system memory, and the shrinker then immediately wants to swap it
out).

I've been pondering this mess for a few months, and I think I have a
solution. But it's a lot of work in core pm code unfortunately:

I think we need to split the runtime_suspend callback into two halfes:

->runtime_suspend_prepare

This would be run by the rpm core code from a worker without holding any
locks at all. Also, any runtime_pm_get call will not wait on this prepare
callback to finish, so it's up to the driver to make sure all the locking
is there. Synchronous suspend calls obviously have to wait for this to
finish, but that should only happen during system suspend or driver
unload, where we don't have these deadlock issues.

Drivers can use this callback for any non-destructive prep work
(non-destructive aside from the copy engine time wasted if it fails) like
swapping bo from vram to system memory. Drivers must not actually shut
down the hardware because a runtime_pm_get call must succeed without
waiting for this callback to finish.

If any runtime_pm_get call happens while the suspend attempt will be
aborted without further action.

->runtime_suspend

This does the actual hw power-off. The power core must guarantee that the
->runtime_suspend_prepare has successfully completed at least once without
the rpm refcount being elevated from 0 to 1 again.

This way drivers can assume that all bo have been swapped out from vram
already, and there's no need to acquire bo or ttm locks in the suspend
path that could block the resume path.

Which would then allow unconditional runtime_pm_get in the shrinker paths.

Unfortunately this will be all really tricky to implement and I think
needs to be done in the rumtime pm core.

Cheers, Sima
Christian König Aug. 22, 2024, 9:29 a.m. UTC | #23
Am 22.08.24 um 10:21 schrieb Thomas Hellström:
> On Thu, 2024-08-22 at 09:55 +0200, Christian König wrote:
>> Am 22.08.24 um 08:47 schrieb Thomas Hellström:
>>>>>> As Sima said, this is complicated but not beyond
>>>>>> comprehension:
>>>>>> i915
>>>>>> https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
>>>>> As far as I can tell what i915 does here is extremely
>>>>> questionable.
>>>>>
>>>>>        if (sc->nr_scanned < sc->nr_to_scan &&
>>>>> current_is_kswapd()) {
>>>>> ....
>>>>>            with_intel_runtime_pm(&i915->runtime_pm, wakeref) {
>>>>>
>>>>> with_intel_runtime_pm() then calls pm_runtime_get_sync().
>>>>>
>>>>> So basically the i915 shrinker assumes that when called from
>>>>> kswapd()
>>>>> that it can synchronously wait for runtime PM to power up the
>>>>> device
>>>>> again.
>>>>>
>>>>> As far as I can tell that means that a device driver makes
>>>>> strong
>>>>> and
>>>>> completely undocumented assumptions how kswapd works
>>>>> internally.
>>>> Admittedly that looks weird
>>>>
>>>> But I'd really expect a reclaim lockdep splat to happen there if
>>>> the
>>>> i915 pm did something not-allowed. IIRC, the design direction the
>>>> i915
>>>> people got from mm people regarding the shrinkers was to avoid
>>>> any
>>>> sleeps in direct reclaim and punt it to kswapd. Need to ask i915
>>>> people
>>>> how they can get away with that.
>>>>
>>>>
>>> So it turns out that Xe integrated pm resume is reclaim-safe, and
>>> I'd
>>> expect i915's to be as well. Xe discrete pm resume isn't.
>>>
>>> So that means that, at least for integrated, the i915 shrinker
>>> should
>>> be ok from that POW, and punting certain bos to kswapd is not
>>> AFAICT
>>> abusing any undocumented features of kswapd but rather a way to
>>> avoid
>>> resuming the device during direct reclaim, like documented.
>> The more I think about this the more I disagree to this driver
>> design.
>> In my opinion device drivers should *never* resume runtime PM in a
>> shrinker callback in the first place.
> Runtime PM resume is allowed even from irq context if carefully
> implemented by the driver and flagged as such to the core.
>
> https://docs.kernel.org/power/runtime_pm.html
>
> Resuming runtime PM from reclaim therefore shouldn't be an issue IMO,
> and really up to the driver.

Mhm when it's up to the driver on which level to use runtime PM then 
that at least explains why the framework doesn't have lockdep annotations.

Ok, that is at least convincing the what i915 does here should work somehow.

>> When the device is turned off it means that all of it's operations
>> are
>> stopped and eventually power to caches etc turned off as well. So I
>> don't see any ongoing writeback operations or similar either.
>>
>> So the question is why do we need to power on the device in a
>> shrinker
>> in the first place?
>>
>> What could be is that the device needs to flush GART TLBs or similar
>> when it is turned on, e.g. that you grab a PM reference to make sure
>> that during your HW operation the device doesn't suspend.
> Exactly why the i915 needs to flush the GART I'm not sure of but I
> suspect the gart TLB might be forgotten while suspended.

Well that is unproblematic. Amdgpu and I think nouveau does something 
similar.

But you don't need to resume the hardware for this, just grabbing the 
reference to make sure that it doesn't suspend is sufficient.

The assumption I make here is that you don't need to do anything when 
the hardware is power down anyway. That seems to be true for at least 
the hardware designs I'm aware of.

>> But that doesn't mean that you should resume the device. In other
>> words
>> when the device is powered down you shouldn't power it up again.
>>
>> And for GART we already have the necessary move callback implemented
>> in
>> TTM. This is done by radeon, amdgpu and nouveu in a common way as far
>> as
>> I can see.
>>
>> So why should Xe be special and follow the very questionable approach
>> of
>> i915 here?
> For Xe, Lunar Lake (integrated) has the interesting design that each bo
> carries compression metadata that needs to be blitted to system pages
> during shrinking. The alternative is to resolve all buffer objects at
> device runtime suspend...

That's the same for amdgpu as well, but when the device is powered down 
those compression data needs to be evacuated anyway.



> But runtime PM aside, with a one-bo only approach we still have the
> drawbacks that it
>
> * eliminates possibility for driver deadlock avoidance
> * Requires TTM knowledge of "purgeable" bos
> * Requires an additional LRU to avoid O(n2) traversal of already
> shrunken objects
> * Drivers with legitimate shrinker designs that don't fit in the TTM-
> enforced model will have frustrated maintainers.

I still find that only halve-convincing. The real question is if it's a 
good idea to give drivers the power to decide what to shrink and what 
not to shrink.

And at least with the arguments and experience at hand I would vote for 
not doing that. We have added the eviction_valuable callback for amdgpu 
and ended up in quite a mess with that.

Background is that some eviction decision done by the driver where not 
as optimal as we hoped it to be.

On the other hand keeping track of all the swapped out objects should be 
TTMs job anyway, e.g. having a TTM_PL_SWAPPED domain.

So in my mind the ideal solution still looks like this:

driver_specific_shrinker_scan(...)
{
     driver_specific_preparations(...);
     bo = ttm_reserve_next_bo_to_shrink(...);
     ttm_bo_validate(bo, TTM_PL_SWAPPED);
     ttm_bo_unreserver(bo);
     driver_specific_cleanups(...);
}

When there is a potential deadlock because the shrinker might be called 
from driver code which holds locks the driver needs to it's specific 
preparation or cleanup then those would apply to all BOs and not just 
the one returned from TTM.

The only use case I can see were the driver would need to filter out the 
BOs to shrink would be if TTM doesn't know about all the information to 
make a decision what to shrink and exactly that is what I try to avoid.

Regards,
Christian.

>
> Thanks,
> Thomas
>
>
>> Regards,
>> Christian.
>>
>>
>>> /Thomas
>>>
>>>
>>>
Thomas Hellström Aug. 22, 2024, 1:16 p.m. UTC | #24
On Thu, 2024-08-22 at 11:29 +0200, Christian König wrote:
> Am 22.08.24 um 10:21 schrieb Thomas Hellström:
> > On Thu, 2024-08-22 at 09:55 +0200, Christian König wrote:
> > > Am 22.08.24 um 08:47 schrieb Thomas Hellström:
> > > > > > > As Sima said, this is complicated but not beyond
> > > > > > > comprehension:
> > > > > > > i915
> > > > > > > https://elixir.bootlin.com/linux/v6.11-rc4/source/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c#L317
> > > > > > As far as I can tell what i915 does here is extremely
> > > > > > questionable.
> > > > > > 
> > > > > >        if (sc->nr_scanned < sc->nr_to_scan &&
> > > > > > current_is_kswapd()) {
> > > > > > ....
> > > > > >            with_intel_runtime_pm(&i915->runtime_pm,
> > > > > > wakeref) {
> > > > > > 
> > > > > > with_intel_runtime_pm() then calls pm_runtime_get_sync().
> > > > > > 
> > > > > > So basically the i915 shrinker assumes that when called
> > > > > > from
> > > > > > kswapd()
> > > > > > that it can synchronously wait for runtime PM to power up
> > > > > > the
> > > > > > device
> > > > > > again.
> > > > > > 
> > > > > > As far as I can tell that means that a device driver makes
> > > > > > strong
> > > > > > and
> > > > > > completely undocumented assumptions how kswapd works
> > > > > > internally.
> > > > > Admittedly that looks weird
> > > > > 
> > > > > But I'd really expect a reclaim lockdep splat to happen there
> > > > > if
> > > > > the
> > > > > i915 pm did something not-allowed. IIRC, the design direction
> > > > > the
> > > > > i915
> > > > > people got from mm people regarding the shrinkers was to
> > > > > avoid
> > > > > any
> > > > > sleeps in direct reclaim and punt it to kswapd. Need to ask
> > > > > i915
> > > > > people
> > > > > how they can get away with that.
> > > > > 
> > > > > 
> > > > So it turns out that Xe integrated pm resume is reclaim-safe,
> > > > and
> > > > I'd
> > > > expect i915's to be as well. Xe discrete pm resume isn't.
> > > > 
> > > > So that means that, at least for integrated, the i915 shrinker
> > > > should
> > > > be ok from that POW, and punting certain bos to kswapd is not
> > > > AFAICT
> > > > abusing any undocumented features of kswapd but rather a way to
> > > > avoid
> > > > resuming the device during direct reclaim, like documented.
> > > The more I think about this the more I disagree to this driver
> > > design.
> > > In my opinion device drivers should *never* resume runtime PM in
> > > a
> > > shrinker callback in the first place.
> > Runtime PM resume is allowed even from irq context if carefully
> > implemented by the driver and flagged as such to the core.
> > 
> > https://docs.kernel.org/power/runtime_pm.html
> > 
> > Resuming runtime PM from reclaim therefore shouldn't be an issue
> > IMO,
> > and really up to the driver.
> 
> Mhm when it's up to the driver on which level to use runtime PM then 
> that at least explains why the framework doesn't have lockdep
> annotations.
> 
> Ok, that is at least convincing the what i915 does here should work
> somehow.
> 
> > > When the device is turned off it means that all of it's
> > > operations
> > > are
> > > stopped and eventually power to caches etc turned off as well. So
> > > I
> > > don't see any ongoing writeback operations or similar either.
> > > 
> > > So the question is why do we need to power on the device in a
> > > shrinker
> > > in the first place?
> > > 
> > > What could be is that the device needs to flush GART TLBs or
> > > similar
> > > when it is turned on, e.g. that you grab a PM reference to make
> > > sure
> > > that during your HW operation the device doesn't suspend.
> > Exactly why the i915 needs to flush the GART I'm not sure of but I
> > suspect the gart TLB might be forgotten while suspended.
> 
> Well that is unproblematic. Amdgpu and I think nouveau does something
> similar.
> 
> But you don't need to resume the hardware for this, just grabbing the
> reference to make sure that it doesn't suspend is sufficient.
> 
> The assumption I make here is that you don't need to do anything when
> the hardware is power down anyway. That seems to be true for at least
> the hardware designs I'm aware of.

I'm not sure I understand you correctly but do you suggest that each bo
with a GGTT mapping should hold a pm reference and we refuse to suspend
while there are GGTT mappings?

> 
> > > But that doesn't mean that you should resume the device. In other
> > > words
> > > when the device is powered down you shouldn't power it up again.
> > > 
> > > And for GART we already have the necessary move callback
> > > implemented
> > > in
> > > TTM. This is done by radeon, amdgpu and nouveu in a common way as
> > > far
> > > as
> > > I can see.
> > > 
> > > So why should Xe be special and follow the very questionable
> > > approach
> > > of
> > > i915 here?
> > For Xe, Lunar Lake (integrated) has the interesting design that
> > each bo
> > carries compression metadata that needs to be blitted to system
> > pages
> > during shrinking. The alternative is to resolve all buffer objects
> > at
> > device runtime suspend...
> 
> That's the same for amdgpu as well, but when the device is powered
> down 
> those compression data needs to be evacuated anyway.

That's true for Intel discrete when entering D3Cold, which is typically
not done if that means a huge amount of data to evict.

Integrated doesn't suspend to D3Cold but using D3Hot and the
compression metadata is persistent, but is only accessible when the
device is woken. 

> 
> 
> 
> > But runtime PM aside, with a one-bo only approach we still have the
> > drawbacks that it
> > 
> > * eliminates possibility for driver deadlock avoidance
> > * Requires TTM knowledge of "purgeable" bos
> > * Requires an additional LRU to avoid O(n2) traversal of already
> > shrunken objects
> > * Drivers with legitimate shrinker designs that don't fit in the
> > TTM-
> > enforced model will have frustrated maintainers.
> 
> I still find that only halve-convincing. The real question is if it's
> a 
> good idea to give drivers the power to decide what to shrink and what
> not to shrink.
> 
> And at least with the arguments and experience at hand I would vote
> for 
> not doing that. We have added the eviction_valuable callback for
> amdgpu 
> and ended up in quite a mess with that.
> 
> Background is that some eviction decision done by the driver where
> not 
> as optimal as we hoped it to be.
> 
> On the other hand keeping track of all the swapped out objects should
> be 
> TTMs job anyway, e.g. having a TTM_PL_SWAPPED domain.
> 
> So in my mind the ideal solution still looks like this:
> 
> driver_specific_shrinker_scan(...)
> {
>      driver_specific_preparations(...);
>      bo = ttm_reserve_next_bo_to_shrink(...);
>      ttm_bo_validate(bo, TTM_PL_SWAPPED);
>      ttm_bo_unreserver(bo);
>      driver_specific_cleanups(...);
> }
> 
> When there is a potential deadlock because the shrinker might be
> called 
> from driver code which holds locks the driver needs to it's specific 
> preparation or cleanup then those would apply to all BOs and not just
> the one returned from TTM.
> 
> The only use case I can see were the driver would need to filter out
> the 
> BOs to shrink would be if TTM doesn't know about all the information
> to 
> make a decision what to shrink and exactly that is what I try to
> avoid.

Yeah, but unfortunately the use cases we have require per-bo decisions,
so suggested strucure becomes:


driver_specific_shrinker_scan(...)
{
     ttm_for_each_suggested_bo_to_shrink(&bo) {
	if (driver_preparations_and_ok_to_shrink(bo)) {
		ttm_shrink_this(&bo);
		driver_cleanups();
		if (vmscan_thinks_weve_done_enough())
			break;
     }
}

Thanks,
Thomas

> 
> Regards,
> Christian.
> 
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > 
> > > > /Thomas
> > > > 
> > > > 
> > > >
Christian König Aug. 22, 2024, 1:19 p.m. UTC | #25
Am 22.08.24 um 11:23 schrieb Daniel Vetter:
> On Wed, Aug 21, 2024 at 10:14:34AM +0200, Christian König wrote:
>> Am 20.08.24 um 18:00 schrieb Thomas Hellström:
>>>> Or why exactly should shrinking fail?
>>> A common example would be not having runtime pm and the particular bo
>>> needs it to unbind, we want to try the next bo. Example: i915 GGTT
>>> bound bos and Lunar Lake PL_TT bos.
>> WHAT? So you basically block shrinking BOs because you can't unbind them
>> because the device is powered down?
> Yes. amdgpu does the same btw :-)

Well, amdgpu wouldn't block shrinking as far as I know.

When the GPU is powered down all fences are signaled and things like 
GART unbinding is just postponed until the GPU wakes up again.

> It's a fairly fundamental issue of rpm on discrete gpus, or anything that
> looks a lot like a discrete gpu. The deadlock scenario is roughly:
>
> - In runtime suspend you need to copy any bo out of vram into system ram
>    before you power the gpu. This requires bo and ttm locks.
>
> - You can't just avoid this by holding an rpm reference as long as any bo
>    is still in vram, because that defacto means you'll never autosuspend at
>    runtime. Plus most real hw is complex enough that you have some driver
>    objects that you need to throw out or recreate, so in practice no way to
>    avoid all this.
>
> - runtime resume tends to be easier and mostly doable without taking bo
>    and ttm locks, because by design you know no one else can possibly have
>    any need to get at the gpu hw - it was all powered off after all. It's
>    still messy, but doable.
>
> - Unfortunately this doesn't help, because your runtime resume might need
>    to wait for a in-progress suspend operation to complete. Which means you
>    still deadlock even if your resume path has entirely reasonable locking.

Uff, yeah that totally confirms my gut feeling that this looked really 
questionable.

> On integrated you can mostly avoid this all because there's no need to
> swap out bo to system memory, they're there already. Exceptions like the
> busted coherency stuff on LNL aside.
>
> But on discrete it's just suck.

Mhm, I never worked with pure integrated devices but at least radeon, 
amdgpu and nouveau seems to have a solution which would work as far as I 
can tell.

Basically on suspend you make sure that everything necessary for 
shrinking is done, e.g. waiting for fences, evacuating VRAM etc...

Hardware specific updates like GART while the device is suspended are 
postponed until resume.

> TTM discrete gpu drivers avoided all that by simply not having a shrinker
> where you need to runtime pm get, instead all runtime pm gets are outmost,
> without holding any ttm or bo locks.

Yes, exactly that.

>> I would say that this is a serious NO-GO. It basically means that powered
>> down devices can lock down system memory for undefined amount of time.
>>
>> In other words an application can allocate memory, map it into GGTT and then
>> suspend or even get killed and we are not able to recover the memory because
>> there is no activity on the GPU any more?
>>
>> That really sounds like a bug in the driver design to me.
> It's a bug in the runtime pm core imo. I think interim what Thomas laid
> out is the best solution, since in practice when the gpu is off you really
> shouldn't need to wake it up. Except when you're unlucky and racing a
> runtime suspend against a shrinker activity (like runtime suspend throws a
> bo into system memory, and the shrinker then immediately wants to swap it
> out).

Mhm, why exactly is that problematic?

Wouldn't pm_runtime_get_if_in_use() just return 0 in this situation and 
we postpone any hw activity?

> I've been pondering this mess for a few months, and I think I have a
> solution. But it's a lot of work in core pm code unfortunately:
>
> I think we need to split the runtime_suspend callback into two halfes:
>
> ->runtime_suspend_prepare
>
> This would be run by the rpm core code from a worker without holding any
> locks at all. Also, any runtime_pm_get call will not wait on this prepare
> callback to finish, so it's up to the driver to make sure all the locking
> is there. Synchronous suspend calls obviously have to wait for this to
> finish, but that should only happen during system suspend or driver
> unload, where we don't have these deadlock issues.
>
> Drivers can use this callback for any non-destructive prep work
> (non-destructive aside from the copy engine time wasted if it fails) like
> swapping bo from vram to system memory. Drivers must not actually shut
> down the hardware because a runtime_pm_get call must succeed without
> waiting for this callback to finish.
>
> If any runtime_pm_get call happens while the suspend attempt will be
> aborted without further action.
>
> ->runtime_suspend
>
> This does the actual hw power-off. The power core must guarantee that the
> ->runtime_suspend_prepare has successfully completed at least once without
> the rpm refcount being elevated from 0 to 1 again.
>
> This way drivers can assume that all bo have been swapped out from vram
> already, and there's no need to acquire bo or ttm locks in the suspend
> path that could block the resume path.
>
> Which would then allow unconditional runtime_pm_get in the shrinker paths.
>
> Unfortunately this will be all really tricky to implement and I think
> needs to be done in the rumtime pm core.

Completely agree that this is complicated, but I still don't see the 
need for it.

Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker 
and postpone all hw activity until resume.

At least for amdgpu that shouldn't be a problem at all.

Regards,
Christian.

>
> Cheers, Sima
Daniel Vetter Aug. 27, 2024, 4:52 p.m. UTC | #26
[machine died, new one working now, I can read complicated mails again an
answer.]

On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> Am 22.08.24 um 11:23 schrieb Daniel Vetter:
> > On Wed, Aug 21, 2024 at 10:14:34AM +0200, Christian König wrote:
> > > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > > Or why exactly should shrinking fail?
> > > > A common example would be not having runtime pm and the particular bo
> > > > needs it to unbind, we want to try the next bo. Example: i915 GGTT
> > > > bound bos and Lunar Lake PL_TT bos.
> > > WHAT? So you basically block shrinking BOs because you can't unbind them
> > > because the device is powered down?
> > Yes. amdgpu does the same btw :-)
> 
> Well, amdgpu wouldn't block shrinking as far as I know.
> 
> When the GPU is powered down all fences are signaled and things like GART
> unbinding is just postponed until the GPU wakes up again.
> 
> > It's a fairly fundamental issue of rpm on discrete gpus, or anything that
> > looks a lot like a discrete gpu. The deadlock scenario is roughly:
> > 
> > - In runtime suspend you need to copy any bo out of vram into system ram
> >    before you power the gpu. This requires bo and ttm locks.
> > 
> > - You can't just avoid this by holding an rpm reference as long as any bo
> >    is still in vram, because that defacto means you'll never autosuspend at
> >    runtime. Plus most real hw is complex enough that you have some driver
> >    objects that you need to throw out or recreate, so in practice no way to
> >    avoid all this.
> > 
> > - runtime resume tends to be easier and mostly doable without taking bo
> >    and ttm locks, because by design you know no one else can possibly have
> >    any need to get at the gpu hw - it was all powered off after all. It's
> >    still messy, but doable.
> > 
> > - Unfortunately this doesn't help, because your runtime resume might need
> >    to wait for a in-progress suspend operation to complete. Which means you
> >    still deadlock even if your resume path has entirely reasonable locking.
> 
> Uff, yeah that totally confirms my gut feeling that this looked really
> questionable.
> 
> > On integrated you can mostly avoid this all because there's no need to
> > swap out bo to system memory, they're there already. Exceptions like the
> > busted coherency stuff on LNL aside.
> > 
> > But on discrete it's just suck.
> 
> Mhm, I never worked with pure integrated devices but at least radeon, amdgpu
> and nouveau seems to have a solution which would work as far as I can tell.

Yeah integrated is easy (if you don't do silly stuff like intel on their
latest).

> Basically on suspend you make sure that everything necessary for shrinking
> is done, e.g. waiting for fences, evacuating VRAM etc...

This is the hard part, or the "evacuating VRAM" part at least. My proposal
is that essentially the runtime_suspend_prepare hook does that. Imo the
clean design is that we wait for rpm autosuspend, and only then start to
evacuate VRAM. Otherwise you need to hold a rpm reference if there's
anything in VRAM, which kinda defeats the point of runtime pm ...

Or you have your own runtime pm refcount that tracks whether anyone uses
the gpu, and if that drops to 0 for long enough you evacuate VRAM, and
then drop the rpm reference. Which is just implementing the idea I've
typed up, except hand-rolled in driver code and with a rpm refcount that's
entirely driver internal.

And at least last time I checked, amdgpu does the VRAM evacuation in the
runtime pm suspend callback, which is why all your runtime_pm_get calls
are outside of the big bo/ttm locks.

> Hardware specific updates like GART while the device is suspended are
> postponed until resume.

Yeah GART shouldn't be the issue, except when you're racing.

> > TTM discrete gpu drivers avoided all that by simply not having a shrinker
> > where you need to runtime pm get, instead all runtime pm gets are outmost,
> > without holding any ttm or bo locks.
> 
> Yes, exactly that.
> 
> > > I would say that this is a serious NO-GO. It basically means that powered
> > > down devices can lock down system memory for undefined amount of time.
> > > 
> > > In other words an application can allocate memory, map it into GGTT and then
> > > suspend or even get killed and we are not able to recover the memory because
> > > there is no activity on the GPU any more?
> > > 
> > > That really sounds like a bug in the driver design to me.
> > It's a bug in the runtime pm core imo. I think interim what Thomas laid
> > out is the best solution, since in practice when the gpu is off you really
> > shouldn't need to wake it up. Except when you're unlucky and racing a
> > runtime suspend against a shrinker activity (like runtime suspend throws a
> > bo into system memory, and the shrinker then immediately wants to swap it
> > out).
> 
> Mhm, why exactly is that problematic?
> 
> Wouldn't pm_runtime_get_if_in_use() just return 0 in this situation and we
> postpone any hw activity?

So when you're runtime suspend, you need to evacuate VRAM. Which means
potentially a lot needs to be moved into system memory. Which means it's
likely the shrinker gets active. Also, it's the point where
pm_runtime_get_if_in_use() will consistently fail, so right when you need
the shrinker to be reliable it will fail the most.

> > I've been pondering this mess for a few months, and I think I have a
> > solution. But it's a lot of work in core pm code unfortunately:
> > 
> > I think we need to split the runtime_suspend callback into two halfes:
> > 
> > ->runtime_suspend_prepare
> > 
> > This would be run by the rpm core code from a worker without holding any
> > locks at all. Also, any runtime_pm_get call will not wait on this prepare
> > callback to finish, so it's up to the driver to make sure all the locking
> > is there. Synchronous suspend calls obviously have to wait for this to
> > finish, but that should only happen during system suspend or driver
> > unload, where we don't have these deadlock issues.
> > 
> > Drivers can use this callback for any non-destructive prep work
> > (non-destructive aside from the copy engine time wasted if it fails) like
> > swapping bo from vram to system memory. Drivers must not actually shut
> > down the hardware because a runtime_pm_get call must succeed without
> > waiting for this callback to finish.
> > 
> > If any runtime_pm_get call happens while the suspend attempt will be
> > aborted without further action.
> > 
> > ->runtime_suspend
> > 
> > This does the actual hw power-off. The power core must guarantee that the
> > ->runtime_suspend_prepare has successfully completed at least once without
> > the rpm refcount being elevated from 0 to 1 again.
> > 
> > This way drivers can assume that all bo have been swapped out from vram
> > already, and there's no need to acquire bo or ttm locks in the suspend
> > path that could block the resume path.
> > 
> > Which would then allow unconditional runtime_pm_get in the shrinker paths.
> > 
> > Unfortunately this will be all really tricky to implement and I think
> > needs to be done in the rumtime pm core.
> 
> Completely agree that this is complicated, but I still don't see the need
> for it.
> 
> Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
> postpone all hw activity until resume.

Not good enough, at least long term I think. Also postponing hw activity
to resume doesn't solve the deadlock issue, if you still need to grab ttm
locks on resume.

> At least for amdgpu that shouldn't be a problem at all.

Yeah, because atm amdgpu has the rpm_get calls outside of ttm locks, which
is exactly the inversion you've complained about as being broken driver
design.
-Sima
Daniel Vetter Aug. 27, 2024, 4:58 p.m. UTC | #27
On Thu, Aug 22, 2024 at 11:29:24AM +0200, Christian König wrote:
> Am 22.08.24 um 10:21 schrieb Thomas Hellström:
> > On Thu, 2024-08-22 at 09:55 +0200, Christian König wrote:
> > > In my opinion device drivers should *never* resume runtime PM in a
> > > shrinker callback in the first place.
> > Runtime PM resume is allowed even from irq context if carefully
> > implemented by the driver and flagged as such to the core.
> > 
> > https://docs.kernel.org/power/runtime_pm.html
> > 
> > Resuming runtime PM from reclaim therefore shouldn't be an issue IMO,
> > and really up to the driver.
> 
> Mhm when it's up to the driver on which level to use runtime PM then that at
> least explains why the framework doesn't have lockdep annotations.

Aside on the lockdep question:

Per-device lockdep annotations would be doable I think (since the rules
really are per-device). But lockdep really doesn't cope too well with too
many dynamic lockdep classes, so that's maybe the reason this was never
done?

It's the same with module reload, eventually you run out of lockdep class
keys :-/

What we perhaps could do is register a driver lockdep key (which is
static), that runtime pm core code could optionally use when set?

Hm, thinking about this some more, if we embed this in struct
device_driver when it's statically created, this could work perhaps
automatically?
-Sima
Daniel Vetter Aug. 27, 2024, 5:53 p.m. UTC | #28
On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> > Completely agree that this is complicated, but I still don't see the need
> > for it.
> > 
> > Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
> > postpone all hw activity until resume.
> 
> Not good enough, at least long term I think. Also postponing hw activity
> to resume doesn't solve the deadlock issue, if you still need to grab ttm
> locks on resume.

Pondered this specific aspect some more, and I think you still have a race
here (even if you avoid the deadlock): If the condiditional rpm_get call
fails there's no guarantee that the device will suspend/resume and clean
up the GART mapping. The race gets a bit smaller if you use
pm_runtime_get_if_active(), but even then you might catch it right when
resume almost finished.

That means we'll have ttm bo hanging around with GART allocations/mappings
which aren't actually valid anymore (since they might escape the cleanup
upon resume due to the race). That doesn't feel like a solid design
either.
-Sima
Alex Deucher Aug. 27, 2024, 6:24 p.m. UTC | #29
On Tue, Aug 27, 2024 at 12:58 PM Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
>
> [machine died, new one working now, I can read complicated mails again an
> answer.]
>
> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> > Am 22.08.24 um 11:23 schrieb Daniel Vetter:
> > > On Wed, Aug 21, 2024 at 10:14:34AM +0200, Christian König wrote:
> > > > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > > > Or why exactly should shrinking fail?
> > > > > A common example would be not having runtime pm and the particular bo
> > > > > needs it to unbind, we want to try the next bo. Example: i915 GGTT
> > > > > bound bos and Lunar Lake PL_TT bos.
> > > > WHAT? So you basically block shrinking BOs because you can't unbind them
> > > > because the device is powered down?
> > > Yes. amdgpu does the same btw :-)
> >
> > Well, amdgpu wouldn't block shrinking as far as I know.
> >
> > When the GPU is powered down all fences are signaled and things like GART
> > unbinding is just postponed until the GPU wakes up again.
> >
> > > It's a fairly fundamental issue of rpm on discrete gpus, or anything that
> > > looks a lot like a discrete gpu. The deadlock scenario is roughly:
> > >
> > > - In runtime suspend you need to copy any bo out of vram into system ram
> > >    before you power the gpu. This requires bo and ttm locks.
> > >
> > > - You can't just avoid this by holding an rpm reference as long as any bo
> > >    is still in vram, because that defacto means you'll never autosuspend at
> > >    runtime. Plus most real hw is complex enough that you have some driver
> > >    objects that you need to throw out or recreate, so in practice no way to
> > >    avoid all this.
> > >
> > > - runtime resume tends to be easier and mostly doable without taking bo
> > >    and ttm locks, because by design you know no one else can possibly have
> > >    any need to get at the gpu hw - it was all powered off after all. It's
> > >    still messy, but doable.
> > >
> > > - Unfortunately this doesn't help, because your runtime resume might need
> > >    to wait for a in-progress suspend operation to complete. Which means you
> > >    still deadlock even if your resume path has entirely reasonable locking.
> >
> > Uff, yeah that totally confirms my gut feeling that this looked really
> > questionable.
> >
> > > On integrated you can mostly avoid this all because there's no need to
> > > swap out bo to system memory, they're there already. Exceptions like the
> > > busted coherency stuff on LNL aside.
> > >
> > > But on discrete it's just suck.
> >
> > Mhm, I never worked with pure integrated devices but at least radeon, amdgpu
> > and nouveau seems to have a solution which would work as far as I can tell.
>
> Yeah integrated is easy (if you don't do silly stuff like intel on their
> latest).
>
> > Basically on suspend you make sure that everything necessary for shrinking
> > is done, e.g. waiting for fences, evacuating VRAM etc...
>
> This is the hard part, or the "evacuating VRAM" part at least. My proposal
> is that essentially the runtime_suspend_prepare hook does that. Imo the
> clean design is that we wait for rpm autosuspend, and only then start to
> evacuate VRAM. Otherwise you need to hold a rpm reference if there's
> anything in VRAM, which kinda defeats the point of runtime pm ...
>
> Or you have your own runtime pm refcount that tracks whether anyone uses
> the gpu, and if that drops to 0 for long enough you evacuate VRAM, and
> then drop the rpm reference. Which is just implementing the idea I've
> typed up, except hand-rolled in driver code and with a rpm refcount that's
> entirely driver internal.
>
> And at least last time I checked, amdgpu does the VRAM evacuation in the
> runtime pm suspend callback, which is why all your runtime_pm_get calls
> are outside of the big bo/ttm locks.

It used to do the eviction in the runtime_suspend callback, but we ran
into problems where eviction would sometimes fail if there was too
much memory pressure during suspend.  Moving it to prepare made things
fail earlier.  Ultimately, I think the problem is that runtime pm
disables swap during runtime pm. See:
https://gitlab.freedesktop.org/drm/amd/-/issues/2362#note_2111666

Alex

>
> > Hardware specific updates like GART while the device is suspended are
> > postponed until resume.
>
> Yeah GART shouldn't be the issue, except when you're racing.
>
> > > TTM discrete gpu drivers avoided all that by simply not having a shrinker
> > > where you need to runtime pm get, instead all runtime pm gets are outmost,
> > > without holding any ttm or bo locks.
> >
> > Yes, exactly that.
> >
> > > > I would say that this is a serious NO-GO. It basically means that powered
> > > > down devices can lock down system memory for undefined amount of time.
> > > >
> > > > In other words an application can allocate memory, map it into GGTT and then
> > > > suspend or even get killed and we are not able to recover the memory because
> > > > there is no activity on the GPU any more?
> > > >
> > > > That really sounds like a bug in the driver design to me.
> > > It's a bug in the runtime pm core imo. I think interim what Thomas laid
> > > out is the best solution, since in practice when the gpu is off you really
> > > shouldn't need to wake it up. Except when you're unlucky and racing a
> > > runtime suspend against a shrinker activity (like runtime suspend throws a
> > > bo into system memory, and the shrinker then immediately wants to swap it
> > > out).
> >
> > Mhm, why exactly is that problematic?
> >
> > Wouldn't pm_runtime_get_if_in_use() just return 0 in this situation and we
> > postpone any hw activity?
>
> So when you're runtime suspend, you need to evacuate VRAM. Which means
> potentially a lot needs to be moved into system memory. Which means it's
> likely the shrinker gets active. Also, it's the point where
> pm_runtime_get_if_in_use() will consistently fail, so right when you need
> the shrinker to be reliable it will fail the most.
>
> > > I've been pondering this mess for a few months, and I think I have a
> > > solution. But it's a lot of work in core pm code unfortunately:
> > >
> > > I think we need to split the runtime_suspend callback into two halfes:
> > >
> > > ->runtime_suspend_prepare
> > >
> > > This would be run by the rpm core code from a worker without holding any
> > > locks at all. Also, any runtime_pm_get call will not wait on this prepare
> > > callback to finish, so it's up to the driver to make sure all the locking
> > > is there. Synchronous suspend calls obviously have to wait for this to
> > > finish, but that should only happen during system suspend or driver
> > > unload, where we don't have these deadlock issues.
> > >
> > > Drivers can use this callback for any non-destructive prep work
> > > (non-destructive aside from the copy engine time wasted if it fails) like
> > > swapping bo from vram to system memory. Drivers must not actually shut
> > > down the hardware because a runtime_pm_get call must succeed without
> > > waiting for this callback to finish.
> > >
> > > If any runtime_pm_get call happens while the suspend attempt will be
> > > aborted without further action.
> > >
> > > ->runtime_suspend
> > >
> > > This does the actual hw power-off. The power core must guarantee that the
> > > ->runtime_suspend_prepare has successfully completed at least once without
> > > the rpm refcount being elevated from 0 to 1 again.
> > >
> > > This way drivers can assume that all bo have been swapped out from vram
> > > already, and there's no need to acquire bo or ttm locks in the suspend
> > > path that could block the resume path.
> > >
> > > Which would then allow unconditional runtime_pm_get in the shrinker paths.
> > >
> > > Unfortunately this will be all really tricky to implement and I think
> > > needs to be done in the rumtime pm core.
> >
> > Completely agree that this is complicated, but I still don't see the need
> > for it.
> >
> > Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
> > postpone all hw activity until resume.
>
> Not good enough, at least long term I think. Also postponing hw activity
> to resume doesn't solve the deadlock issue, if you still need to grab ttm
> locks on resume.
>
> > At least for amdgpu that shouldn't be a problem at all.
>
> Yeah, because atm amdgpu has the rpm_get calls outside of ttm locks, which
> is exactly the inversion you've complained about as being broken driver
> design.
> -Sima
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Christian König Aug. 28, 2024, 12:20 p.m. UTC | #30
Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
>>> Completely agree that this is complicated, but I still don't see the need
>>> for it.
>>>
>>> Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
>>> postpone all hw activity until resume.
>> Not good enough, at least long term I think. Also postponing hw activity
>> to resume doesn't solve the deadlock issue, if you still need to grab ttm
>> locks on resume.
> Pondered this specific aspect some more, and I think you still have a race
> here (even if you avoid the deadlock): If the condiditional rpm_get call
> fails there's no guarantee that the device will suspend/resume and clean
> up the GART mapping.

Well I think we have a major disconnect here. When the device is powered 
down there is no GART mapping to clean up any more.

In other words GART is a table in local memory (VRAM) when the device is 
powered down this table is completely destroyed. Any BO which was mapped 
inside this table is now not mapped any more.

So when the shrinker wants to evict a BO which is marked as mapped to 
GART and the device is powered down we just skip the GART unmapping part 
because that has already implicitly happened during power down.

Before mapping any BO into the GART again we power the GPU up through 
the runtime PM calls. And while powering it up again the GART is restored.

> The race gets a bit smaller if you use
> pm_runtime_get_if_active(), but even then you might catch it right when
> resume almost finished.

What race are you talking about?

The worst thing which could happen is that we restore a GART entry which 
isn't needed any more, but that is pretty much irrelevant since we only 
clear them to avoid some hw bugs.

> That means we'll have ttm bo hanging around with GART allocations/mappings
> which aren't actually valid anymore (since they might escape the cleanup
> upon resume due to the race). That doesn't feel like a solid design
> either.

I'm most likely missing something, but I'm really scratching my head 
where you see a problem here.

Regards,
Christian.

> -Sima
Thomas Hellström Aug. 28, 2024, 2:05 p.m. UTC | #31
On Wed, 2024-08-28 at 14:20 +0200, Christian König wrote:
> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> > > > Completely agree that this is complicated, but I still don't
> > > > see the need
> > > > for it.
> > > > 
> > > > Drivers just need to use pm_runtime_get_if_in_use() inside the
> > > > shrinker and
> > > > postpone all hw activity until resume.
> > > Not good enough, at least long term I think. Also postponing hw
> > > activity
> > > to resume doesn't solve the deadlock issue, if you still need to
> > > grab ttm
> > > locks on resume.
> > Pondered this specific aspect some more, and I think you still have
> > a race
> > here (even if you avoid the deadlock): If the condiditional rpm_get
> > call
> > fails there's no guarantee that the device will suspend/resume and
> > clean
> > up the GART mapping.
> 
> Well I think we have a major disconnect here. When the device is
> powered 
> down there is no GART mapping to clean up any more.
> 
> In other words GART is a table in local memory (VRAM) when the device
> is 
> powered down this table is completely destroyed. Any BO which was
> mapped 
> inside this table is now not mapped any more.
> 
> So when the shrinker wants to evict a BO which is marked as mapped to
> GART and the device is powered down we just skip the GART unmapping
> part 
> because that has already implicitly happened during power down.
> 
> Before mapping any BO into the GART again we power the GPU up through
> the runtime PM calls. And while powering it up again the GART is
> restored.

I think you're forgetting the main Xe use-case of Lunar-lake
compression metadata. I'ts retained by the device during D3hot, but
cannot, at that time, be accessed for shrinking.

And copying it all out "Just in case" when transitioning to D3hot just
isn't a viable solution.

/Thomas
Christian König Aug. 28, 2024, 3:25 p.m. UTC | #32
Am 28.08.24 um 16:05 schrieb Thomas Hellström:
> On Wed, 2024-08-28 at 14:20 +0200, Christian König wrote:
>> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
>>> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
>>>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
>>>>> Completely agree that this is complicated, but I still don't
>>>>> see the need
>>>>> for it.
>>>>>
>>>>> Drivers just need to use pm_runtime_get_if_in_use() inside the
>>>>> shrinker and
>>>>> postpone all hw activity until resume.
>>>> Not good enough, at least long term I think. Also postponing hw
>>>> activity
>>>> to resume doesn't solve the deadlock issue, if you still need to
>>>> grab ttm
>>>> locks on resume.
>>> Pondered this specific aspect some more, and I think you still have
>>> a race
>>> here (even if you avoid the deadlock): If the condiditional rpm_get
>>> call
>>> fails there's no guarantee that the device will suspend/resume and
>>> clean
>>> up the GART mapping.
>> Well I think we have a major disconnect here. When the device is
>> powered
>> down there is no GART mapping to clean up any more.
>>
>> In other words GART is a table in local memory (VRAM) when the device
>> is
>> powered down this table is completely destroyed. Any BO which was
>> mapped
>> inside this table is now not mapped any more.
>>
>> So when the shrinker wants to evict a BO which is marked as mapped to
>> GART and the device is powered down we just skip the GART unmapping
>> part
>> because that has already implicitly happened during power down.
>>
>> Before mapping any BO into the GART again we power the GPU up through
>> the runtime PM calls. And while powering it up again the GART is
>> restored.
> I think you're forgetting the main Xe use-case of Lunar-lake
> compression metadata. I'ts retained by the device during D3hot, but
> cannot, at that time, be accessed for shrinking.

Yeah, that is really something we don't have an equivalent for on AMD GPUs.

When the ASIC is powered down VRAM is basically dead as well because it 
won't get refreshed any more.

> And copying it all out "Just in case" when transitioning to D3hot just
> isn't a viable solution.

I would say that this is solvable with a hierarchy of power management 
functionality.

E.g. the runtime PM interface works the same for you as it does for 
amdgpu with evicting TTM BOs etc....

Then separate from runtime PM you have a reference count for the 
accessibility of compressed metadata. And while shrinking you only 
resume this specific part.

Christian.

>
> /Thomas
>
Alex Deucher Aug. 28, 2024, 3:35 p.m. UTC | #33
On Wed, Aug 28, 2024 at 11:26 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 28.08.24 um 16:05 schrieb Thomas Hellström:
> > On Wed, 2024-08-28 at 14:20 +0200, Christian König wrote:
> >> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> >>> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> >>>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> >>>>> Completely agree that this is complicated, but I still don't
> >>>>> see the need
> >>>>> for it.
> >>>>>
> >>>>> Drivers just need to use pm_runtime_get_if_in_use() inside the
> >>>>> shrinker and
> >>>>> postpone all hw activity until resume.
> >>>> Not good enough, at least long term I think. Also postponing hw
> >>>> activity
> >>>> to resume doesn't solve the deadlock issue, if you still need to
> >>>> grab ttm
> >>>> locks on resume.
> >>> Pondered this specific aspect some more, and I think you still have
> >>> a race
> >>> here (even if you avoid the deadlock): If the condiditional rpm_get
> >>> call
> >>> fails there's no guarantee that the device will suspend/resume and
> >>> clean
> >>> up the GART mapping.
> >> Well I think we have a major disconnect here. When the device is
> >> powered
> >> down there is no GART mapping to clean up any more.
> >>
> >> In other words GART is a table in local memory (VRAM) when the device
> >> is
> >> powered down this table is completely destroyed. Any BO which was
> >> mapped
> >> inside this table is now not mapped any more.
> >>
> >> So when the shrinker wants to evict a BO which is marked as mapped to
> >> GART and the device is powered down we just skip the GART unmapping
> >> part
> >> because that has already implicitly happened during power down.
> >>
> >> Before mapping any BO into the GART again we power the GPU up through
> >> the runtime PM calls. And while powering it up again the GART is
> >> restored.
> > I think you're forgetting the main Xe use-case of Lunar-lake
> > compression metadata. I'ts retained by the device during D3hot, but
> > cannot, at that time, be accessed for shrinking.
>
> Yeah, that is really something we don't have an equivalent for on AMD GPUs.
>
> When the ASIC is powered down VRAM is basically dead as well because it
> won't get refreshed any more.

We actually support memory self refresh for VRAM on some platforms,
but the rest of the GPU is powered down, so only the VRAM contents is
retained.

Alex

>
> > And copying it all out "Just in case" when transitioning to D3hot just
> > isn't a viable solution.
>
> I would say that this is solvable with a hierarchy of power management
> functionality.
>
> E.g. the runtime PM interface works the same for you as it does for
> amdgpu with evicting TTM BOs etc....
>
> Then separate from runtime PM you have a reference count for the
> accessibility of compressed metadata. And while shrinking you only
> resume this specific part.
>
> Christian.
>
> >
> > /Thomas
> >
>
Thomas Hellström Aug. 28, 2024, 3:45 p.m. UTC | #34
On Wed, 2024-08-28 at 17:25 +0200, Christian König wrote:
> Am 28.08.24 um 16:05 schrieb Thomas Hellström:
> > On Wed, 2024-08-28 at 14:20 +0200, Christian König wrote:
> > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > > wrote:
> > > > > > Completely agree that this is complicated, but I still
> > > > > > don't
> > > > > > see the need
> > > > > > for it.
> > > > > > 
> > > > > > Drivers just need to use pm_runtime_get_if_in_use() inside
> > > > > > the
> > > > > > shrinker and
> > > > > > postpone all hw activity until resume.
> > > > > Not good enough, at least long term I think. Also postponing
> > > > > hw
> > > > > activity
> > > > > to resume doesn't solve the deadlock issue, if you still need
> > > > > to
> > > > > grab ttm
> > > > > locks on resume.
> > > > Pondered this specific aspect some more, and I think you still
> > > > have
> > > > a race
> > > > here (even if you avoid the deadlock): If the condiditional
> > > > rpm_get
> > > > call
> > > > fails there's no guarantee that the device will suspend/resume
> > > > and
> > > > clean
> > > > up the GART mapping.
> > > Well I think we have a major disconnect here. When the device is
> > > powered
> > > down there is no GART mapping to clean up any more.
> > > 
> > > In other words GART is a table in local memory (VRAM) when the
> > > device
> > > is
> > > powered down this table is completely destroyed. Any BO which was
> > > mapped
> > > inside this table is now not mapped any more.
> > > 
> > > So when the shrinker wants to evict a BO which is marked as
> > > mapped to
> > > GART and the device is powered down we just skip the GART
> > > unmapping
> > > part
> > > because that has already implicitly happened during power down.
> > > 
> > > Before mapping any BO into the GART again we power the GPU up
> > > through
> > > the runtime PM calls. And while powering it up again the GART is
> > > restored.
> > I think you're forgetting the main Xe use-case of Lunar-lake
> > compression metadata. I'ts retained by the device during D3hot, but
> > cannot, at that time, be accessed for shrinking.
> 
> Yeah, that is really something we don't have an equivalent for on AMD
> GPUs.
> 
> When the ASIC is powered down VRAM is basically dead as well because
> it 
> won't get refreshed any more.
> 
> > And copying it all out "Just in case" when transitioning to D3hot
> > just
> > isn't a viable solution.
> 
> I would say that this is solvable with a hierarchy of power
> management 
> functionality.
> 
> E.g. the runtime PM interface works the same for you as it does for 
> amdgpu with evicting TTM BOs etc....
> 
> Then separate from runtime PM you have a reference count for the 
> accessibility of compressed metadata. And while shrinking you only 
> resume this specific part.

Well this looks like exactly what have with current hardware, or rather
*current* hardware that needs to take the bo lock during runtime pm
doesn't need this type of metadata access, which is currently Lunar-
lake only.

So from a runtime pm deadlock point-of-view we're safe. But then we
need to blit out the metadata and we get a fence to wait for, which
we've been told to avoid in direct reclaim but rather is preferred from
kswapd.

And I must admit I still don't get what the problem is with allowing a
driver LRU loop? The fact that some AMD eviction_valuable()
implementations made bad decisions by far doesn't weigh up for  the
loss of flexibility IMO.

/Thomas

> 
> Christian.
> 
> > 
> > /Thomas
> > 
>
Daniel Vetter Sept. 2, 2024, 11 a.m. UTC | #35
On Tue, Aug 27, 2024 at 02:24:27PM -0400, Alex Deucher wrote:
> On Tue, Aug 27, 2024 at 12:58 PM Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> >
> > [machine died, new one working now, I can read complicated mails again an
> > answer.]
> >
> > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> > > Am 22.08.24 um 11:23 schrieb Daniel Vetter:
> > > > On Wed, Aug 21, 2024 at 10:14:34AM +0200, Christian König wrote:
> > > > > Am 20.08.24 um 18:00 schrieb Thomas Hellström:
> > > > > > > Or why exactly should shrinking fail?
> > > > > > A common example would be not having runtime pm and the particular bo
> > > > > > needs it to unbind, we want to try the next bo. Example: i915 GGTT
> > > > > > bound bos and Lunar Lake PL_TT bos.
> > > > > WHAT? So you basically block shrinking BOs because you can't unbind them
> > > > > because the device is powered down?
> > > > Yes. amdgpu does the same btw :-)
> > >
> > > Well, amdgpu wouldn't block shrinking as far as I know.
> > >
> > > When the GPU is powered down all fences are signaled and things like GART
> > > unbinding is just postponed until the GPU wakes up again.
> > >
> > > > It's a fairly fundamental issue of rpm on discrete gpus, or anything that
> > > > looks a lot like a discrete gpu. The deadlock scenario is roughly:
> > > >
> > > > - In runtime suspend you need to copy any bo out of vram into system ram
> > > >    before you power the gpu. This requires bo and ttm locks.
> > > >
> > > > - You can't just avoid this by holding an rpm reference as long as any bo
> > > >    is still in vram, because that defacto means you'll never autosuspend at
> > > >    runtime. Plus most real hw is complex enough that you have some driver
> > > >    objects that you need to throw out or recreate, so in practice no way to
> > > >    avoid all this.
> > > >
> > > > - runtime resume tends to be easier and mostly doable without taking bo
> > > >    and ttm locks, because by design you know no one else can possibly have
> > > >    any need to get at the gpu hw - it was all powered off after all. It's
> > > >    still messy, but doable.
> > > >
> > > > - Unfortunately this doesn't help, because your runtime resume might need
> > > >    to wait for a in-progress suspend operation to complete. Which means you
> > > >    still deadlock even if your resume path has entirely reasonable locking.
> > >
> > > Uff, yeah that totally confirms my gut feeling that this looked really
> > > questionable.
> > >
> > > > On integrated you can mostly avoid this all because there's no need to
> > > > swap out bo to system memory, they're there already. Exceptions like the
> > > > busted coherency stuff on LNL aside.
> > > >
> > > > But on discrete it's just suck.
> > >
> > > Mhm, I never worked with pure integrated devices but at least radeon, amdgpu
> > > and nouveau seems to have a solution which would work as far as I can tell.
> >
> > Yeah integrated is easy (if you don't do silly stuff like intel on their
> > latest).
> >
> > > Basically on suspend you make sure that everything necessary for shrinking
> > > is done, e.g. waiting for fences, evacuating VRAM etc...
> >
> > This is the hard part, or the "evacuating VRAM" part at least. My proposal
> > is that essentially the runtime_suspend_prepare hook does that. Imo the
> > clean design is that we wait for rpm autosuspend, and only then start to
> > evacuate VRAM. Otherwise you need to hold a rpm reference if there's
> > anything in VRAM, which kinda defeats the point of runtime pm ...
> >
> > Or you have your own runtime pm refcount that tracks whether anyone uses
> > the gpu, and if that drops to 0 for long enough you evacuate VRAM, and
> > then drop the rpm reference. Which is just implementing the idea I've
> > typed up, except hand-rolled in driver code and with a rpm refcount that's
> > entirely driver internal.
> >
> > And at least last time I checked, amdgpu does the VRAM evacuation in the
> > runtime pm suspend callback, which is why all your runtime_pm_get calls
> > are outside of the big bo/ttm locks.
> 
> It used to do the eviction in the runtime_suspend callback, but we ran
> into problems where eviction would sometimes fail if there was too
> much memory pressure during suspend.  Moving it to prepare made things
> fail earlier.  Ultimately, I think the problem is that runtime pm
> disables swap during runtime pm. See:
> https://gitlab.freedesktop.org/drm/amd/-/issues/2362#note_2111666

That's system suspend/resume, where you have a prepare callback. At least
as far as I know, for runtime pm this doesn't exist.
-Sima

> 
> Alex
> 
> >
> > > Hardware specific updates like GART while the device is suspended are
> > > postponed until resume.
> >
> > Yeah GART shouldn't be the issue, except when you're racing.
> >
> > > > TTM discrete gpu drivers avoided all that by simply not having a shrinker
> > > > where you need to runtime pm get, instead all runtime pm gets are outmost,
> > > > without holding any ttm or bo locks.
> > >
> > > Yes, exactly that.
> > >
> > > > > I would say that this is a serious NO-GO. It basically means that powered
> > > > > down devices can lock down system memory for undefined amount of time.
> > > > >
> > > > > In other words an application can allocate memory, map it into GGTT and then
> > > > > suspend or even get killed and we are not able to recover the memory because
> > > > > there is no activity on the GPU any more?
> > > > >
> > > > > That really sounds like a bug in the driver design to me.
> > > > It's a bug in the runtime pm core imo. I think interim what Thomas laid
> > > > out is the best solution, since in practice when the gpu is off you really
> > > > shouldn't need to wake it up. Except when you're unlucky and racing a
> > > > runtime suspend against a shrinker activity (like runtime suspend throws a
> > > > bo into system memory, and the shrinker then immediately wants to swap it
> > > > out).
> > >
> > > Mhm, why exactly is that problematic?
> > >
> > > Wouldn't pm_runtime_get_if_in_use() just return 0 in this situation and we
> > > postpone any hw activity?
> >
> > So when you're runtime suspend, you need to evacuate VRAM. Which means
> > potentially a lot needs to be moved into system memory. Which means it's
> > likely the shrinker gets active. Also, it's the point where
> > pm_runtime_get_if_in_use() will consistently fail, so right when you need
> > the shrinker to be reliable it will fail the most.
> >
> > > > I've been pondering this mess for a few months, and I think I have a
> > > > solution. But it's a lot of work in core pm code unfortunately:
> > > >
> > > > I think we need to split the runtime_suspend callback into two halfes:
> > > >
> > > > ->runtime_suspend_prepare
> > > >
> > > > This would be run by the rpm core code from a worker without holding any
> > > > locks at all. Also, any runtime_pm_get call will not wait on this prepare
> > > > callback to finish, so it's up to the driver to make sure all the locking
> > > > is there. Synchronous suspend calls obviously have to wait for this to
> > > > finish, but that should only happen during system suspend or driver
> > > > unload, where we don't have these deadlock issues.
> > > >
> > > > Drivers can use this callback for any non-destructive prep work
> > > > (non-destructive aside from the copy engine time wasted if it fails) like
> > > > swapping bo from vram to system memory. Drivers must not actually shut
> > > > down the hardware because a runtime_pm_get call must succeed without
> > > > waiting for this callback to finish.
> > > >
> > > > If any runtime_pm_get call happens while the suspend attempt will be
> > > > aborted without further action.
> > > >
> > > > ->runtime_suspend
> > > >
> > > > This does the actual hw power-off. The power core must guarantee that the
> > > > ->runtime_suspend_prepare has successfully completed at least once without
> > > > the rpm refcount being elevated from 0 to 1 again.
> > > >
> > > > This way drivers can assume that all bo have been swapped out from vram
> > > > already, and there's no need to acquire bo or ttm locks in the suspend
> > > > path that could block the resume path.
> > > >
> > > > Which would then allow unconditional runtime_pm_get in the shrinker paths.
> > > >
> > > > Unfortunately this will be all really tricky to implement and I think
> > > > needs to be done in the rumtime pm core.
> > >
> > > Completely agree that this is complicated, but I still don't see the need
> > > for it.
> > >
> > > Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
> > > postpone all hw activity until resume.
> >
> > Not good enough, at least long term I think. Also postponing hw activity
> > to resume doesn't solve the deadlock issue, if you still need to grab ttm
> > locks on resume.
> >
> > > At least for amdgpu that shouldn't be a problem at all.
> >
> > Yeah, because atm amdgpu has the rpm_get calls outside of ttm locks, which
> > is exactly the inversion you've complained about as being broken driver
> > design.
> > -Sima
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Daniel Vetter Sept. 2, 2024, 11:07 a.m. UTC | #36
On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König wrote:
> > > > Completely agree that this is complicated, but I still don't see the need
> > > > for it.
> > > > 
> > > > Drivers just need to use pm_runtime_get_if_in_use() inside the shrinker and
> > > > postpone all hw activity until resume.
> > > Not good enough, at least long term I think. Also postponing hw activity
> > > to resume doesn't solve the deadlock issue, if you still need to grab ttm
> > > locks on resume.
> > Pondered this specific aspect some more, and I think you still have a race
> > here (even if you avoid the deadlock): If the condiditional rpm_get call
> > fails there's no guarantee that the device will suspend/resume and clean
> > up the GART mapping.
> 
> Well I think we have a major disconnect here. When the device is powered
> down there is no GART mapping to clean up any more.
> 
> In other words GART is a table in local memory (VRAM) when the device is
> powered down this table is completely destroyed. Any BO which was mapped
> inside this table is now not mapped any more.
> 
> So when the shrinker wants to evict a BO which is marked as mapped to GART
> and the device is powered down we just skip the GART unmapping part because
> that has already implicitly happened during power down.
> 
> Before mapping any BO into the GART again we power the GPU up through the
> runtime PM calls. And while powering it up again the GART is restored.

My point is that you can't tell whether the device will power down or not,
you can only tell whether there's a chance it might be powering down and
so you can't get at the rpm reference without deadlock issues.

> > The race gets a bit smaller if you use
> > pm_runtime_get_if_active(), but even then you might catch it right when
> > resume almost finished.
> 
> What race are you talking about?
> 
> The worst thing which could happen is that we restore a GART entry which
> isn't needed any more, but that is pretty much irrelevant since we only
> clear them to avoid some hw bugs.

The race I'm seeing is where you thought the GART entry is not issue,
tossed an object, but the device didn't suspend, so might still use it.

I guess if we're clearly separating the sw allocation of the TTM_TT with
the physical entries in the GART that should all work, but feels a bit
tricky. The race I've seen is essentially these two getting out of sync.

So maybe it was me who's stuck.

What I wonder is whether it works in practice, since on the restore side
you need to get some locks to figure out which gart mappings exist and
need restoring. And that's the same locks as the shrinker needs to figure
out whether it might need to reap a gart mapping.

Or do you just copy the gart entries over and restore them exactly as-is,
so that there's no shared locks?

> > That means we'll have ttm bo hanging around with GART allocations/mappings
> > which aren't actually valid anymore (since they might escape the cleanup
> > upon resume due to the race). That doesn't feel like a solid design
> > either.
> 
> I'm most likely missing something, but I'm really scratching my head where
> you see a problem here.

I guess one issue is that at least traditionally, igfx drivers have nested
runtime pm within dma_resv lock. And dgpu drivers the other way round.
Which is a bit awkward if you're trying for common code.

Cheers, Sima
Thomas Hellström Sept. 18, 2024, 12:57 p.m. UTC | #37
Sima, Christian

I've updated the shrinker series now with a guarded for_each macro
instead:

https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9

(Note I forgot to remove the export of the previous LRU walker).

 so the midlayer argument is now not an issue anymore. The cleanup.h
guard provides some additional protection against drivers exiting the
LRU loop early.

So remaining is the question whether the driver is allowed to discard a
suggested bo to shrink from TTM.

Arguments for:

1) Not allowing that would require teaching TTM about purgeable
objects.
2) Devices who need the blitter during shrinking would want to punt
runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
3) If those devices end up blitting (LNL) to be able to shrink, they
would want to punt waiting for the fence to signal to kswapd to avoid
waiting in direct reclaim.
4) It looks like we need to resort to folio_trylock in the shmem backup
backend when shrinking is called for gfp_t = GFP_NOFS. A failing
trylock will require a new bo.

Arguments against:
None really. I thought the idea of demidlayering would be to allow the
driver more freedom.

So any feedback appreciated. If that is found acceptable we can proceed
with reviewing this patch and also with the shrinker series.

Thanks,
Thomas


On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
> > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > wrote:
> > > > > Completely agree that this is complicated, but I still don't
> > > > > see the need
> > > > > for it.
> > > > > 
> > > > > Drivers just need to use pm_runtime_get_if_in_use() inside
> > > > > the shrinker and
> > > > > postpone all hw activity until resume.
> > > > Not good enough, at least long term I think. Also postponing hw
> > > > activity
> > > > to resume doesn't solve the deadlock issue, if you still need
> > > > to grab ttm
> > > > locks on resume.
> > > Pondered this specific aspect some more, and I think you still
> > > have a race
> > > here (even if you avoid the deadlock): If the condiditional
> > > rpm_get call
> > > fails there's no guarantee that the device will suspend/resume
> > > and clean
> > > up the GART mapping.
> > 
> > Well I think we have a major disconnect here. When the device is
> > powered
> > down there is no GART mapping to clean up any more.
> > 
> > In other words GART is a table in local memory (VRAM) when the
> > device is
> > powered down this table is completely destroyed. Any BO which was
> > mapped
> > inside this table is now not mapped any more.
> > 
> > So when the shrinker wants to evict a BO which is marked as mapped
> > to GART
> > and the device is powered down we just skip the GART unmapping part
> > because
> > that has already implicitly happened during power down.
> > 
> > Before mapping any BO into the GART again we power the GPU up
> > through the
> > runtime PM calls. And while powering it up again the GART is
> > restored.
> 
> My point is that you can't tell whether the device will power down or
> not,
> you can only tell whether there's a chance it might be powering down
> and
> so you can't get at the rpm reference without deadlock issues.
> 
> > > The race gets a bit smaller if you use
> > > pm_runtime_get_if_active(), but even then you might catch it
> > > right when
> > > resume almost finished.
> > 
> > What race are you talking about?
> > 
> > The worst thing which could happen is that we restore a GART entry
> > which
> > isn't needed any more, but that is pretty much irrelevant since we
> > only
> > clear them to avoid some hw bugs.
> 
> The race I'm seeing is where you thought the GART entry is not issue,
> tossed an object, but the device didn't suspend, so might still use
> it.
> 
> I guess if we're clearly separating the sw allocation of the TTM_TT
> with
> the physical entries in the GART that should all work, but feels a
> bit
> tricky. The race I've seen is essentially these two getting out of
> sync.
> 
> So maybe it was me who's stuck.
> 
> What I wonder is whether it works in practice, since on the restore
> side
> you need to get some locks to figure out which gart mappings exist
> and
> need restoring. And that's the same locks as the shrinker needs to
> figure
> out whether it might need to reap a gart mapping.
> 
> Or do you just copy the gart entries over and restore them exactly
> as-is,
> so that there's no shared locks?
> 
> > > That means we'll have ttm bo hanging around with GART
> > > allocations/mappings
> > > which aren't actually valid anymore (since they might escape the
> > > cleanup
> > > upon resume due to the race). That doesn't feel like a solid
> > > design
> > > either.
> > 
> > I'm most likely missing something, but I'm really scratching my
> > head where
> > you see a problem here.
> 
> I guess one issue is that at least traditionally, igfx drivers have
> nested
> runtime pm within dma_resv lock. And dgpu drivers the other way
> round.
> Which is a bit awkward if you're trying for common code.
> 
> Cheers, Sima
Thomas Hellström Oct. 2, 2024, 11:30 a.m. UTC | #38
Hi, Christian,

Ping? Can i get an ack to proceed with this?

Thanks,
Thomas



On Wed, 2024-09-18 at 14:57 +0200, Thomas Hellström wrote:
> Sima, Christian
> 
> I've updated the shrinker series now with a guarded for_each macro
> instead:
> 
> https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
> 
> (Note I forgot to remove the export of the previous LRU walker).
> 
>  so the midlayer argument is now not an issue anymore. The cleanup.h
> guard provides some additional protection against drivers exiting the
> LRU loop early.
> 
> So remaining is the question whether the driver is allowed to discard
> a
> suggested bo to shrink from TTM.
> 
> Arguments for:
> 
> 1) Not allowing that would require teaching TTM about purgeable
> objects.
> 2) Devices who need the blitter during shrinking would want to punt
> runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
> 3) If those devices end up blitting (LNL) to be able to shrink, they
> would want to punt waiting for the fence to signal to kswapd to avoid
> waiting in direct reclaim.
> 4) It looks like we need to resort to folio_trylock in the shmem
> backup
> backend when shrinking is called for gfp_t = GFP_NOFS. A failing
> trylock will require a new bo.
> 
> Arguments against:
> None really. I thought the idea of demidlayering would be to allow
> the
> driver more freedom.
> 
> So any feedback appreciated. If that is found acceptable we can
> proceed
> with reviewing this patch and also with the shrinker series.
> 
> Thanks,
> Thomas
> 
> 
> On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> > On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
> > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
> > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > > wrote:
> > > > > > Completely agree that this is complicated, but I still
> > > > > > don't
> > > > > > see the need
> > > > > > for it.
> > > > > > 
> > > > > > Drivers just need to use pm_runtime_get_if_in_use() inside
> > > > > > the shrinker and
> > > > > > postpone all hw activity until resume.
> > > > > Not good enough, at least long term I think. Also postponing
> > > > > hw
> > > > > activity
> > > > > to resume doesn't solve the deadlock issue, if you still need
> > > > > to grab ttm
> > > > > locks on resume.
> > > > Pondered this specific aspect some more, and I think you still
> > > > have a race
> > > > here (even if you avoid the deadlock): If the condiditional
> > > > rpm_get call
> > > > fails there's no guarantee that the device will suspend/resume
> > > > and clean
> > > > up the GART mapping.
> > > 
> > > Well I think we have a major disconnect here. When the device is
> > > powered
> > > down there is no GART mapping to clean up any more.
> > > 
> > > In other words GART is a table in local memory (VRAM) when the
> > > device is
> > > powered down this table is completely destroyed. Any BO which was
> > > mapped
> > > inside this table is now not mapped any more.
> > > 
> > > So when the shrinker wants to evict a BO which is marked as
> > > mapped
> > > to GART
> > > and the device is powered down we just skip the GART unmapping
> > > part
> > > because
> > > that has already implicitly happened during power down.
> > > 
> > > Before mapping any BO into the GART again we power the GPU up
> > > through the
> > > runtime PM calls. And while powering it up again the GART is
> > > restored.
> > 
> > My point is that you can't tell whether the device will power down
> > or
> > not,
> > you can only tell whether there's a chance it might be powering
> > down
> > and
> > so you can't get at the rpm reference without deadlock issues.
> > 
> > > > The race gets a bit smaller if you use
> > > > pm_runtime_get_if_active(), but even then you might catch it
> > > > right when
> > > > resume almost finished.
> > > 
> > > What race are you talking about?
> > > 
> > > The worst thing which could happen is that we restore a GART
> > > entry
> > > which
> > > isn't needed any more, but that is pretty much irrelevant since
> > > we
> > > only
> > > clear them to avoid some hw bugs.
> > 
> > The race I'm seeing is where you thought the GART entry is not
> > issue,
> > tossed an object, but the device didn't suspend, so might still use
> > it.
> > 
> > I guess if we're clearly separating the sw allocation of the TTM_TT
> > with
> > the physical entries in the GART that should all work, but feels a
> > bit
> > tricky. The race I've seen is essentially these two getting out of
> > sync.
> > 
> > So maybe it was me who's stuck.
> > 
> > What I wonder is whether it works in practice, since on the restore
> > side
> > you need to get some locks to figure out which gart mappings exist
> > and
> > need restoring. And that's the same locks as the shrinker needs to
> > figure
> > out whether it might need to reap a gart mapping.
> > 
> > Or do you just copy the gart entries over and restore them exactly
> > as-is,
> > so that there's no shared locks?
> > 
> > > > That means we'll have ttm bo hanging around with GART
> > > > allocations/mappings
> > > > which aren't actually valid anymore (since they might escape
> > > > the
> > > > cleanup
> > > > upon resume due to the race). That doesn't feel like a solid
> > > > design
> > > > either.
> > > 
> > > I'm most likely missing something, but I'm really scratching my
> > > head where
> > > you see a problem here.
> > 
> > I guess one issue is that at least traditionally, igfx drivers have
> > nested
> > runtime pm within dma_resv lock. And dgpu drivers the other way
> > round.
> > Which is a bit awkward if you're trying for common code.
> > 
> > Cheers, Sima
>
Christian König Oct. 2, 2024, 11:32 a.m. UTC | #39
Ah, yes sorry totally forgotten about that.

Give me till Friday to swap everything back into my head again.

Christian.

Am 02.10.24 um 13:30 schrieb Thomas Hellström:
> Hi, Christian,
>
> Ping? Can i get an ack to proceed with this?
>
> Thanks,
> Thomas
>
>
>
> On Wed, 2024-09-18 at 14:57 +0200, Thomas Hellström wrote:
>> Sima, Christian
>>
>> I've updated the shrinker series now with a guarded for_each macro
>> instead:
>>
>> https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
>>
>> (Note I forgot to remove the export of the previous LRU walker).
>>
>>   so the midlayer argument is now not an issue anymore. The cleanup.h
>> guard provides some additional protection against drivers exiting the
>> LRU loop early.
>>
>> So remaining is the question whether the driver is allowed to discard
>> a
>> suggested bo to shrink from TTM.
>>
>> Arguments for:
>>
>> 1) Not allowing that would require teaching TTM about purgeable
>> objects.
>> 2) Devices who need the blitter during shrinking would want to punt
>> runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
>> 3) If those devices end up blitting (LNL) to be able to shrink, they
>> would want to punt waiting for the fence to signal to kswapd to avoid
>> waiting in direct reclaim.
>> 4) It looks like we need to resort to folio_trylock in the shmem
>> backup
>> backend when shrinking is called for gfp_t = GFP_NOFS. A failing
>> trylock will require a new bo.
>>
>> Arguments against:
>> None really. I thought the idea of demidlayering would be to allow
>> the
>> driver more freedom.
>>
>> So any feedback appreciated. If that is found acceptable we can
>> proceed
>> with reviewing this patch and also with the shrinker series.
>>
>> Thanks,
>> Thomas
>>
>>
>> On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
>>> On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
>>>> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
>>>>> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
>>>>>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
>>>>>> wrote:
>>>>>>> Completely agree that this is complicated, but I still
>>>>>>> don't
>>>>>>> see the need
>>>>>>> for it.
>>>>>>>
>>>>>>> Drivers just need to use pm_runtime_get_if_in_use() inside
>>>>>>> the shrinker and
>>>>>>> postpone all hw activity until resume.
>>>>>> Not good enough, at least long term I think. Also postponing
>>>>>> hw
>>>>>> activity
>>>>>> to resume doesn't solve the deadlock issue, if you still need
>>>>>> to grab ttm
>>>>>> locks on resume.
>>>>> Pondered this specific aspect some more, and I think you still
>>>>> have a race
>>>>> here (even if you avoid the deadlock): If the condiditional
>>>>> rpm_get call
>>>>> fails there's no guarantee that the device will suspend/resume
>>>>> and clean
>>>>> up the GART mapping.
>>>> Well I think we have a major disconnect here. When the device is
>>>> powered
>>>> down there is no GART mapping to clean up any more.
>>>>
>>>> In other words GART is a table in local memory (VRAM) when the
>>>> device is
>>>> powered down this table is completely destroyed. Any BO which was
>>>> mapped
>>>> inside this table is now not mapped any more.
>>>>
>>>> So when the shrinker wants to evict a BO which is marked as
>>>> mapped
>>>> to GART
>>>> and the device is powered down we just skip the GART unmapping
>>>> part
>>>> because
>>>> that has already implicitly happened during power down.
>>>>
>>>> Before mapping any BO into the GART again we power the GPU up
>>>> through the
>>>> runtime PM calls. And while powering it up again the GART is
>>>> restored.
>>> My point is that you can't tell whether the device will power down
>>> or
>>> not,
>>> you can only tell whether there's a chance it might be powering
>>> down
>>> and
>>> so you can't get at the rpm reference without deadlock issues.
>>>
>>>>> The race gets a bit smaller if you use
>>>>> pm_runtime_get_if_active(), but even then you might catch it
>>>>> right when
>>>>> resume almost finished.
>>>> What race are you talking about?
>>>>
>>>> The worst thing which could happen is that we restore a GART
>>>> entry
>>>> which
>>>> isn't needed any more, but that is pretty much irrelevant since
>>>> we
>>>> only
>>>> clear them to avoid some hw bugs.
>>> The race I'm seeing is where you thought the GART entry is not
>>> issue,
>>> tossed an object, but the device didn't suspend, so might still use
>>> it.
>>>
>>> I guess if we're clearly separating the sw allocation of the TTM_TT
>>> with
>>> the physical entries in the GART that should all work, but feels a
>>> bit
>>> tricky. The race I've seen is essentially these two getting out of
>>> sync.
>>>
>>> So maybe it was me who's stuck.
>>>
>>> What I wonder is whether it works in practice, since on the restore
>>> side
>>> you need to get some locks to figure out which gart mappings exist
>>> and
>>> need restoring. And that's the same locks as the shrinker needs to
>>> figure
>>> out whether it might need to reap a gart mapping.
>>>
>>> Or do you just copy the gart entries over and restore them exactly
>>> as-is,
>>> so that there's no shared locks?
>>>
>>>>> That means we'll have ttm bo hanging around with GART
>>>>> allocations/mappings
>>>>> which aren't actually valid anymore (since they might escape
>>>>> the
>>>>> cleanup
>>>>> upon resume due to the race). That doesn't feel like a solid
>>>>> design
>>>>> either.
>>>> I'm most likely missing something, but I'm really scratching my
>>>> head where
>>>> you see a problem here.
>>> I guess one issue is that at least traditionally, igfx drivers have
>>> nested
>>> runtime pm within dma_resv lock. And dgpu drivers the other way
>>> round.
>>> Which is a bit awkward if you're trying for common code.
>>>
>>> Cheers, Sima
Thomas Hellström Oct. 2, 2024, 11:36 a.m. UTC | #40
Hi

On Wed, 2024-10-02 at 13:32 +0200, Christian König wrote:
> Ah, yes sorry totally forgotten about that.
> 
> Give me till Friday to swap everything back into my head again.
> 
> Christian.

Thanks!

Once we agree on a direction MBrost is ready to do an in-depth review.

/Thomas


> 
> Am 02.10.24 um 13:30 schrieb Thomas Hellström:
> > Hi, Christian,
> > 
> > Ping? Can i get an ack to proceed with this?
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > 
> > On Wed, 2024-09-18 at 14:57 +0200, Thomas Hellström wrote:
> > > Sima, Christian
> > > 
> > > I've updated the shrinker series now with a guarded for_each
> > > macro
> > > instead:
> > > 
> > > https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
> > > 
> > > (Note I forgot to remove the export of the previous LRU walker).
> > > 
> > >   so the midlayer argument is now not an issue anymore. The
> > > cleanup.h
> > > guard provides some additional protection against drivers exiting
> > > the
> > > LRU loop early.
> > > 
> > > So remaining is the question whether the driver is allowed to
> > > discard
> > > a
> > > suggested bo to shrink from TTM.
> > > 
> > > Arguments for:
> > > 
> > > 1) Not allowing that would require teaching TTM about purgeable
> > > objects.
> > > 2) Devices who need the blitter during shrinking would want to
> > > punt
> > > runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
> > > 3) If those devices end up blitting (LNL) to be able to shrink,
> > > they
> > > would want to punt waiting for the fence to signal to kswapd to
> > > avoid
> > > waiting in direct reclaim.
> > > 4) It looks like we need to resort to folio_trylock in the shmem
> > > backup
> > > backend when shrinking is called for gfp_t = GFP_NOFS. A failing
> > > trylock will require a new bo.
> > > 
> > > Arguments against:
> > > None really. I thought the idea of demidlayering would be to
> > > allow
> > > the
> > > driver more freedom.
> > > 
> > > So any feedback appreciated. If that is found acceptable we can
> > > proceed
> > > with reviewing this patch and also with the shrinker series.
> > > 
> > > Thanks,
> > > Thomas
> > > 
> > > 
> > > On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> > > > On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König
> > > > wrote:
> > > > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter
> > > > > > wrote:
> > > > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > > > > wrote:
> > > > > > > > Completely agree that this is complicated, but I still
> > > > > > > > don't
> > > > > > > > see the need
> > > > > > > > for it.
> > > > > > > > 
> > > > > > > > Drivers just need to use pm_runtime_get_if_in_use()
> > > > > > > > inside
> > > > > > > > the shrinker and
> > > > > > > > postpone all hw activity until resume.
> > > > > > > Not good enough, at least long term I think. Also
> > > > > > > postponing
> > > > > > > hw
> > > > > > > activity
> > > > > > > to resume doesn't solve the deadlock issue, if you still
> > > > > > > need
> > > > > > > to grab ttm
> > > > > > > locks on resume.
> > > > > > Pondered this specific aspect some more, and I think you
> > > > > > still
> > > > > > have a race
> > > > > > here (even if you avoid the deadlock): If the condiditional
> > > > > > rpm_get call
> > > > > > fails there's no guarantee that the device will
> > > > > > suspend/resume
> > > > > > and clean
> > > > > > up the GART mapping.
> > > > > Well I think we have a major disconnect here. When the device
> > > > > is
> > > > > powered
> > > > > down there is no GART mapping to clean up any more.
> > > > > 
> > > > > In other words GART is a table in local memory (VRAM) when
> > > > > the
> > > > > device is
> > > > > powered down this table is completely destroyed. Any BO which
> > > > > was
> > > > > mapped
> > > > > inside this table is now not mapped any more.
> > > > > 
> > > > > So when the shrinker wants to evict a BO which is marked as
> > > > > mapped
> > > > > to GART
> > > > > and the device is powered down we just skip the GART
> > > > > unmapping
> > > > > part
> > > > > because
> > > > > that has already implicitly happened during power down.
> > > > > 
> > > > > Before mapping any BO into the GART again we power the GPU up
> > > > > through the
> > > > > runtime PM calls. And while powering it up again the GART is
> > > > > restored.
> > > > My point is that you can't tell whether the device will power
> > > > down
> > > > or
> > > > not,
> > > > you can only tell whether there's a chance it might be powering
> > > > down
> > > > and
> > > > so you can't get at the rpm reference without deadlock issues.
> > > > 
> > > > > > The race gets a bit smaller if you use
> > > > > > pm_runtime_get_if_active(), but even then you might catch
> > > > > > it
> > > > > > right when
> > > > > > resume almost finished.
> > > > > What race are you talking about?
> > > > > 
> > > > > The worst thing which could happen is that we restore a GART
> > > > > entry
> > > > > which
> > > > > isn't needed any more, but that is pretty much irrelevant
> > > > > since
> > > > > we
> > > > > only
> > > > > clear them to avoid some hw bugs.
> > > > The race I'm seeing is where you thought the GART entry is not
> > > > issue,
> > > > tossed an object, but the device didn't suspend, so might still
> > > > use
> > > > it.
> > > > 
> > > > I guess if we're clearly separating the sw allocation of the
> > > > TTM_TT
> > > > with
> > > > the physical entries in the GART that should all work, but
> > > > feels a
> > > > bit
> > > > tricky. The race I've seen is essentially these two getting out
> > > > of
> > > > sync.
> > > > 
> > > > So maybe it was me who's stuck.
> > > > 
> > > > What I wonder is whether it works in practice, since on the
> > > > restore
> > > > side
> > > > you need to get some locks to figure out which gart mappings
> > > > exist
> > > > and
> > > > need restoring. And that's the same locks as the shrinker needs
> > > > to
> > > > figure
> > > > out whether it might need to reap a gart mapping.
> > > > 
> > > > Or do you just copy the gart entries over and restore them
> > > > exactly
> > > > as-is,
> > > > so that there's no shared locks?
> > > > 
> > > > > > That means we'll have ttm bo hanging around with GART
> > > > > > allocations/mappings
> > > > > > which aren't actually valid anymore (since they might
> > > > > > escape
> > > > > > the
> > > > > > cleanup
> > > > > > upon resume due to the race). That doesn't feel like a
> > > > > > solid
> > > > > > design
> > > > > > either.
> > > > > I'm most likely missing something, but I'm really scratching
> > > > > my
> > > > > head where
> > > > > you see a problem here.
> > > > I guess one issue is that at least traditionally, igfx drivers
> > > > have
> > > > nested
> > > > runtime pm within dma_resv lock. And dgpu drivers the other way
> > > > round.
> > > > Which is a bit awkward if you're trying for common code.
> > > > 
> > > > Cheers, Sima
>
Christian König Oct. 7, 2024, 9:08 a.m. UTC | #41
Hi Thomas,

I'm on sick leave, but I will try to answer questions and share some 
thoughts on this to unblock you.

Am 18.09.24 um 14:57 schrieb Thomas Hellström:
> Sima, Christian
>
> I've updated the shrinker series now with a guarded for_each macro
> instead:
>
> https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9

Yeah that looks like a big step in the right direction.

> (Note I forgot to remove the export of the previous LRU walker).
>
>   so the midlayer argument is now not an issue anymore. The cleanup.h
> guard provides some additional protection against drivers exiting the
> LRU loop early.
>
> So remaining is the question whether the driver is allowed to discard a
> suggested bo to shrink from TTM.
>
> Arguments for:
>
> 1) Not allowing that would require teaching TTM about purgeable
> objects.

I think that is actually not correct. TTM already knows about purgeable 
objects.

E.g. when TTM asks the driver what to do with evicted objects it is 
perfectly valid to return an empty placement list indicating that the 
backing store can just be thrown away.

We use this for things like temporary buffers for example.

That this doesn't apply to swapping looks like a design bug in the 
driver/TTM interface to me.

> 2) Devices who need the blitter during shrinking would want to punt
> runtime_pm_get() to kswapd to avoid sleeping direct reclaim.

I think the outcome of the discussion is that runtime PM should never be 
mixed into TTM swapping.

You can power up blocks to enable a HW blitter for swapping, but this 
then can't be driven by the runtime PM framework.

> 3) If those devices end up blitting (LNL) to be able to shrink, they
> would want to punt waiting for the fence to signal to kswapd to avoid
> waiting in direct reclaim.

Mhm, what do you mean with that?


> 4) It looks like we need to resort to folio_trylock in the shmem backup
> backend when shrinking is called for gfp_t = GFP_NOFS. A failing
> trylock will require a new bo.

Why would a folio trylock succeed with one BO and not another?

And why would that trylock something the device specific driver should 
handle?

> Arguments against:
> None really. I thought the idea of demidlayering would be to allow the
> driver more freedom.

Well that is a huge argument against it. Giving drivers more freedom is 
absolutely not something which turned out to be valuable in the past.

Instead we should put device drivers in a very strict corset of 
validated approaches to do things.

Background is that in my experience driver developers are perfectly 
willing to do unclean approaches which only work in like 99% of all 
cases just to get a bit more performance or simpler driver implementation.

Those approaches are not legal and in my opinion as subsystem maintainer 
I think we need to be more strict and push back much harder on stuff 
like that.

Regards,
Christian.

>
> So any feedback appreciated. If that is found acceptable we can proceed
> with reviewing this patch and also with the shrinker series.
>
> Thanks,
> Thomas
>
>
> On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
>> On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
>>> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
>>>> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter wrote:
>>>>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
>>>>> wrote:
>>>>>> Completely agree that this is complicated, but I still don't
>>>>>> see the need
>>>>>> for it.
>>>>>>
>>>>>> Drivers just need to use pm_runtime_get_if_in_use() inside
>>>>>> the shrinker and
>>>>>> postpone all hw activity until resume.
>>>>> Not good enough, at least long term I think. Also postponing hw
>>>>> activity
>>>>> to resume doesn't solve the deadlock issue, if you still need
>>>>> to grab ttm
>>>>> locks on resume.
>>>> Pondered this specific aspect some more, and I think you still
>>>> have a race
>>>> here (even if you avoid the deadlock): If the condiditional
>>>> rpm_get call
>>>> fails there's no guarantee that the device will suspend/resume
>>>> and clean
>>>> up the GART mapping.
>>> Well I think we have a major disconnect here. When the device is
>>> powered
>>> down there is no GART mapping to clean up any more.
>>>
>>> In other words GART is a table in local memory (VRAM) when the
>>> device is
>>> powered down this table is completely destroyed. Any BO which was
>>> mapped
>>> inside this table is now not mapped any more.
>>>
>>> So when the shrinker wants to evict a BO which is marked as mapped
>>> to GART
>>> and the device is powered down we just skip the GART unmapping part
>>> because
>>> that has already implicitly happened during power down.
>>>
>>> Before mapping any BO into the GART again we power the GPU up
>>> through the
>>> runtime PM calls. And while powering it up again the GART is
>>> restored.
>> My point is that you can't tell whether the device will power down or
>> not,
>> you can only tell whether there's a chance it might be powering down
>> and
>> so you can't get at the rpm reference without deadlock issues.
>>
>>>> The race gets a bit smaller if you use
>>>> pm_runtime_get_if_active(), but even then you might catch it
>>>> right when
>>>> resume almost finished.
>>> What race are you talking about?
>>>
>>> The worst thing which could happen is that we restore a GART entry
>>> which
>>> isn't needed any more, but that is pretty much irrelevant since we
>>> only
>>> clear them to avoid some hw bugs.
>> The race I'm seeing is where you thought the GART entry is not issue,
>> tossed an object, but the device didn't suspend, so might still use
>> it.
>>
>> I guess if we're clearly separating the sw allocation of the TTM_TT
>> with
>> the physical entries in the GART that should all work, but feels a
>> bit
>> tricky. The race I've seen is essentially these two getting out of
>> sync.
>>
>> So maybe it was me who's stuck.
>>
>> What I wonder is whether it works in practice, since on the restore
>> side
>> you need to get some locks to figure out which gart mappings exist
>> and
>> need restoring. And that's the same locks as the shrinker needs to
>> figure
>> out whether it might need to reap a gart mapping.
>>
>> Or do you just copy the gart entries over and restore them exactly
>> as-is,
>> so that there's no shared locks?
>>
>>>> That means we'll have ttm bo hanging around with GART
>>>> allocations/mappings
>>>> which aren't actually valid anymore (since they might escape the
>>>> cleanup
>>>> upon resume due to the race). That doesn't feel like a solid
>>>> design
>>>> either.
>>> I'm most likely missing something, but I'm really scratching my
>>> head where
>>> you see a problem here.
>> I guess one issue is that at least traditionally, igfx drivers have
>> nested
>> runtime pm within dma_resv lock. And dgpu drivers the other way
>> round.
>> Which is a bit awkward if you're trying for common code.
>>
>> Cheers, Sima
Thomas Hellström Oct. 9, 2024, 1:39 p.m. UTC | #42
On Mon, 2024-10-07 at 11:08 +0200, Christian König wrote:
> Hi Thomas,
> 
> I'm on sick leave, but I will try to answer questions and share some 
> thoughts on this to unblock you.
> 
> Am 18.09.24 um 14:57 schrieb Thomas Hellström:
> > Sima, Christian
> > 
> > I've updated the shrinker series now with a guarded for_each macro
> > instead:
> > 
> > https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
> 
> Yeah that looks like a big step in the right direction.
> 
> > (Note I forgot to remove the export of the previous LRU walker).
> > 
> >   so the midlayer argument is now not an issue anymore. The
> > cleanup.h
> > guard provides some additional protection against drivers exiting
> > the
> > LRU loop early.
> > 
> > So remaining is the question whether the driver is allowed to
> > discard a
> > suggested bo to shrink from TTM.
> > 
> > Arguments for:
> > 
> > 1) Not allowing that would require teaching TTM about purgeable
> > objects.
> 
> I think that is actually not correct. TTM already knows about
> purgeable 
> objects.
> 
> E.g. when TTM asks the driver what to do with evicted objects it is 
> perfectly valid to return an empty placement list indicating that the
> backing store can just be thrown away.
> 
> We use this for things like temporary buffers for example.
> 
> That this doesn't apply to swapping looks like a design bug in the 
> driver/TTM interface to me.

Yes we can do that with TTM, but for shrinking there's no point in
trying to shrink when there is no swap-space left, other than purgeable
object. The number of shrinkable objects returned in shrinker::count
needs to reflect that, and *then* only those objects should be selected
for shrinking. If we were to announce that to TTM using a callback,
we're actually back to version 1 of this series which was rejected by
you exactly since it was using callbacks a year or so ago????

> 
> > 2) Devices who need the blitter during shrinking would want to punt
> > runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
> 
> I think the outcome of the discussion is that runtime PM should never
> be 
> mixed into TTM swapping.
> 
> You can power up blocks to enable a HW blitter for swapping, but this
> then can't be driven by the runtime PM framework.

Still that power-on might be sleeping, so what's the difference using
runtime-pm or not? Why should the driver implement yet another power
interface, just because TTM refuses to publish a sane LRU walk
interface?

> 
> > 3) If those devices end up blitting (LNL) to be able to shrink,
> > they
> > would want to punt waiting for the fence to signal to kswapd to
> > avoid
> > waiting in direct reclaim.
> 
> Mhm, what do you mean with that?

When we fired the blitter from direct reclaim, we get a fence. If we
wait for it in direct reclaim we will be sleeping waiting for gpu,
which is bad form. We'd like return a failure so the object is retried
when idle, or from kswapd.

> 
> 
> > 4) It looks like we need to resort to folio_trylock in the shmem
> > backup
> > backend when shrinking is called for gfp_t = GFP_NOFS. A failing
> > trylock will require a new bo.
> 
> Why would a folio trylock succeed with one BO and not another?

Good point. We'd fail anyway but would perhaps need to call
SHRINK_STOP..

> 
> And why would that trylock something the device specific driver
> should 
> handle?

It happens in the TTM shrinker helper called from the driver in the
spirit of demidlayering.

> 
> > Arguments against:
> > None really. I thought the idea of demidlayering would be to allow
> > the
> > driver more freedom.
> 
> Well that is a huge argument against it. Giving drivers more freedom
> is 
> absolutely not something which turned out to be valuable in the past.

So then what's the point of demidlayering?

> 
> Instead we should put device drivers in a very strict corset of 
> validated approaches to do things.
> 
> Background is that in my experience driver developers are perfectly 
> willing to do unclean approaches which only work in like 99% of all 
> cases just to get a bit more performance or simpler driver
> implementation.
> 
> Those approaches are not legal and in my opinion as subsystem
> maintainer 
> I think we need to be more strict and push back much harder on stuff 
> like that.

Still, historically that has made developers abandon common components
for driver-specific solutions. 

And the question is still not answered.

Exactly *why* can't the driver fail and continue traversing the LRU,
because all our argumentation revolves around this and you have yet to
provide an objective reason why. 

/Thomas




> 
> Regards,
> Christian.
> 
> > 
> > So any feedback appreciated. If that is found acceptable we can
> > proceed
> > with reviewing this patch and also with the shrinker series.
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> > > On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König wrote:
> > > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter
> > > > > wrote:
> > > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > > > wrote:
> > > > > > > Completely agree that this is complicated, but I still
> > > > > > > don't
> > > > > > > see the need
> > > > > > > for it.
> > > > > > > 
> > > > > > > Drivers just need to use pm_runtime_get_if_in_use()
> > > > > > > inside
> > > > > > > the shrinker and
> > > > > > > postpone all hw activity until resume.
> > > > > > Not good enough, at least long term I think. Also
> > > > > > postponing hw
> > > > > > activity
> > > > > > to resume doesn't solve the deadlock issue, if you still
> > > > > > need
> > > > > > to grab ttm
> > > > > > locks on resume.
> > > > > Pondered this specific aspect some more, and I think you
> > > > > still
> > > > > have a race
> > > > > here (even if you avoid the deadlock): If the condiditional
> > > > > rpm_get call
> > > > > fails there's no guarantee that the device will
> > > > > suspend/resume
> > > > > and clean
> > > > > up the GART mapping.
> > > > Well I think we have a major disconnect here. When the device
> > > > is
> > > > powered
> > > > down there is no GART mapping to clean up any more.
> > > > 
> > > > In other words GART is a table in local memory (VRAM) when the
> > > > device is
> > > > powered down this table is completely destroyed. Any BO which
> > > > was
> > > > mapped
> > > > inside this table is now not mapped any more.
> > > > 
> > > > So when the shrinker wants to evict a BO which is marked as
> > > > mapped
> > > > to GART
> > > > and the device is powered down we just skip the GART unmapping
> > > > part
> > > > because
> > > > that has already implicitly happened during power down.
> > > > 
> > > > Before mapping any BO into the GART again we power the GPU up
> > > > through the
> > > > runtime PM calls. And while powering it up again the GART is
> > > > restored.
> > > My point is that you can't tell whether the device will power
> > > down or
> > > not,
> > > you can only tell whether there's a chance it might be powering
> > > down
> > > and
> > > so you can't get at the rpm reference without deadlock issues.
> > > 
> > > > > The race gets a bit smaller if you use
> > > > > pm_runtime_get_if_active(), but even then you might catch it
> > > > > right when
> > > > > resume almost finished.
> > > > What race are you talking about?
> > > > 
> > > > The worst thing which could happen is that we restore a GART
> > > > entry
> > > > which
> > > > isn't needed any more, but that is pretty much irrelevant since
> > > > we
> > > > only
> > > > clear them to avoid some hw bugs.
> > > The race I'm seeing is where you thought the GART entry is not
> > > issue,
> > > tossed an object, but the device didn't suspend, so might still
> > > use
> > > it.
> > > 
> > > I guess if we're clearly separating the sw allocation of the
> > > TTM_TT
> > > with
> > > the physical entries in the GART that should all work, but feels
> > > a
> > > bit
> > > tricky. The race I've seen is essentially these two getting out
> > > of
> > > sync.
> > > 
> > > So maybe it was me who's stuck.
> > > 
> > > What I wonder is whether it works in practice, since on the
> > > restore
> > > side
> > > you need to get some locks to figure out which gart mappings
> > > exist
> > > and
> > > need restoring. And that's the same locks as the shrinker needs
> > > to
> > > figure
> > > out whether it might need to reap a gart mapping.
> > > 
> > > Or do you just copy the gart entries over and restore them
> > > exactly
> > > as-is,
> > > so that there's no shared locks?
> > > 
> > > > > That means we'll have ttm bo hanging around with GART
> > > > > allocations/mappings
> > > > > which aren't actually valid anymore (since they might escape
> > > > > the
> > > > > cleanup
> > > > > upon resume due to the race). That doesn't feel like a solid
> > > > > design
> > > > > either.
> > > > I'm most likely missing something, but I'm really scratching my
> > > > head where
> > > > you see a problem here.
> > > I guess one issue is that at least traditionally, igfx drivers
> > > have
> > > nested
> > > runtime pm within dma_resv lock. And dgpu drivers the other way
> > > round.
> > > Which is a bit awkward if you're trying for common code.
> > > 
> > > Cheers, Sima
>
Thomas Hellström Oct. 9, 2024, 2:17 p.m. UTC | #43
On Wed, 2024-10-09 at 15:39 +0200, Thomas Hellström wrote:
> On Mon, 2024-10-07 at 11:08 +0200, Christian König wrote:
> > Hi Thomas,
> > 
> > I'm on sick leave, but I will try to answer questions and share
> > some 
> > thoughts on this to unblock you.
> > 
> > Am 18.09.24 um 14:57 schrieb Thomas Hellström:
> > > Sima, Christian
> > > 
> > > I've updated the shrinker series now with a guarded for_each
> > > macro
> > > instead:
> > > 
> > > https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
> > 
> > Yeah that looks like a big step in the right direction.
> > 
> > > (Note I forgot to remove the export of the previous LRU walker).
> > > 
> > >   so the midlayer argument is now not an issue anymore. The
> > > cleanup.h
> > > guard provides some additional protection against drivers exiting
> > > the
> > > LRU loop early.
> > > 
> > > So remaining is the question whether the driver is allowed to
> > > discard a
> > > suggested bo to shrink from TTM.
> > > 
> > > Arguments for:
> > > 
> > > 1) Not allowing that would require teaching TTM about purgeable
> > > objects.
> > 
> > I think that is actually not correct. TTM already knows about
> > purgeable 
> > objects.
> > 
> > E.g. when TTM asks the driver what to do with evicted objects it is
> > perfectly valid to return an empty placement list indicating that
> > the
> > backing store can just be thrown away.
> > 
> > We use this for things like temporary buffers for example.
> > 
> > That this doesn't apply to swapping looks like a design bug in the 
> > driver/TTM interface to me.
> 
> Yes we can do that with TTM, but for shrinking there's no point in
> trying to shrink when there is no swap-space left, other than
> purgeable
> object. The number of shrinkable objects returned in shrinker::count
> needs to reflect that, and *then* only those objects should be
> selected
> for shrinking. If we were to announce that to TTM using a callback,
> we're actually back to version 1 of this series which was rejected by
> you exactly since it was using callbacks a year or so ago????
> 
> > 
> > > 2) Devices who need the blitter during shrinking would want to
> > > punt
> > > runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
> > 
> > I think the outcome of the discussion is that runtime PM should
> > never
> > be 
> > mixed into TTM swapping.
> > 
> > You can power up blocks to enable a HW blitter for swapping, but
> > this
> > then can't be driven by the runtime PM framework.
> 
> Still that power-on might be sleeping, so what's the difference using
> runtime-pm or not? Why should the driver implement yet another power
> interface, just because TTM refuses to publish a sane LRU walk
> interface?
> 
> > 
> > > 3) If those devices end up blitting (LNL) to be able to shrink,
> > > they
> > > would want to punt waiting for the fence to signal to kswapd to
> > > avoid
> > > waiting in direct reclaim.
> > 
> > Mhm, what do you mean with that?
> 
> When we fired the blitter from direct reclaim, we get a fence. If we
> wait for it in direct reclaim we will be sleeping waiting for gpu,
> which is bad form. We'd like return a failure so the object is
> retried
> when idle, or from kswapd.
> 
> > 
> > 
> > > 4) It looks like we need to resort to folio_trylock in the shmem
> > > backup
> > > backend when shrinking is called for gfp_t = GFP_NOFS. A failing
> > > trylock will require a new bo.
> > 
> > Why would a folio trylock succeed with one BO and not another?
> 
> Good point. We'd fail anyway but would perhaps need to call
> SHRINK_STOP..
> 
> > 
> > And why would that trylock something the device specific driver
> > should 
> > handle?
> 
> It happens in the TTM shrinker helper called from the driver in the
> spirit of demidlayering.
> 
> > 
> > > Arguments against:
> > > None really. I thought the idea of demidlayering would be to
> > > allow
> > > the
> > > driver more freedom.
> > 
> > Well that is a huge argument against it. Giving drivers more
> > freedom
> > is 
> > absolutely not something which turned out to be valuable in the
> > past.
> 
> So then what's the point of demidlayering?
> 
> > 
> > Instead we should put device drivers in a very strict corset of 
> > validated approaches to do things.
> > 
> > Background is that in my experience driver developers are perfectly
> > willing to do unclean approaches which only work in like 99% of all
> > cases just to get a bit more performance or simpler driver
> > implementation.
> > 
> > Those approaches are not legal and in my opinion as subsystem
> > maintainer 
> > I think we need to be more strict and push back much harder on
> > stuff 
> > like that.
> 
> Still, historically that has made developers abandon common
> components
> for driver-specific solutions. 
> 
> And the question is still not answered.
> 
> Exactly *why* can't the driver fail and continue traversing the LRU,
> because all our argumentation revolves around this and you have yet
> to
> provide an objective reason why. 

And in the end, while I think there definitely things worthy of
discussion in this series, 

I don't think there is a point in trying to land a LNL shrinker
operating against a crippled TTM LRU walk interface, nor do I think
anyone would want to attempt to port i915 over, and reworking it three
times I'm now pretty familiar with what works and what doesn't.

So question becomes, with proper reviews can I merge the series here as
is, *with* the de-midlayered LRU walk and noting your advise against
it, or not?

Thanks,
Thomas





> 
> /Thomas
> 
> 
> 
> 
> > 
> > Regards,
> > Christian.
> > 
> > > 
> > > So any feedback appreciated. If that is found acceptable we can
> > > proceed
> > > with reviewing this patch and also with the shrinker series.
> > > 
> > > Thanks,
> > > Thomas
> > > 
> > > 
> > > On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> > > > On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König
> > > > wrote:
> > > > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter
> > > > > > wrote:
> > > > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
> > > > > > > wrote:
> > > > > > > > Completely agree that this is complicated, but I still
> > > > > > > > don't
> > > > > > > > see the need
> > > > > > > > for it.
> > > > > > > > 
> > > > > > > > Drivers just need to use pm_runtime_get_if_in_use()
> > > > > > > > inside
> > > > > > > > the shrinker and
> > > > > > > > postpone all hw activity until resume.
> > > > > > > Not good enough, at least long term I think. Also
> > > > > > > postponing hw
> > > > > > > activity
> > > > > > > to resume doesn't solve the deadlock issue, if you still
> > > > > > > need
> > > > > > > to grab ttm
> > > > > > > locks on resume.
> > > > > > Pondered this specific aspect some more, and I think you
> > > > > > still
> > > > > > have a race
> > > > > > here (even if you avoid the deadlock): If the condiditional
> > > > > > rpm_get call
> > > > > > fails there's no guarantee that the device will
> > > > > > suspend/resume
> > > > > > and clean
> > > > > > up the GART mapping.
> > > > > Well I think we have a major disconnect here. When the device
> > > > > is
> > > > > powered
> > > > > down there is no GART mapping to clean up any more.
> > > > > 
> > > > > In other words GART is a table in local memory (VRAM) when
> > > > > the
> > > > > device is
> > > > > powered down this table is completely destroyed. Any BO which
> > > > > was
> > > > > mapped
> > > > > inside this table is now not mapped any more.
> > > > > 
> > > > > So when the shrinker wants to evict a BO which is marked as
> > > > > mapped
> > > > > to GART
> > > > > and the device is powered down we just skip the GART
> > > > > unmapping
> > > > > part
> > > > > because
> > > > > that has already implicitly happened during power down.
> > > > > 
> > > > > Before mapping any BO into the GART again we power the GPU up
> > > > > through the
> > > > > runtime PM calls. And while powering it up again the GART is
> > > > > restored.
> > > > My point is that you can't tell whether the device will power
> > > > down or
> > > > not,
> > > > you can only tell whether there's a chance it might be powering
> > > > down
> > > > and
> > > > so you can't get at the rpm reference without deadlock issues.
> > > > 
> > > > > > The race gets a bit smaller if you use
> > > > > > pm_runtime_get_if_active(), but even then you might catch
> > > > > > it
> > > > > > right when
> > > > > > resume almost finished.
> > > > > What race are you talking about?
> > > > > 
> > > > > The worst thing which could happen is that we restore a GART
> > > > > entry
> > > > > which
> > > > > isn't needed any more, but that is pretty much irrelevant
> > > > > since
> > > > > we
> > > > > only
> > > > > clear them to avoid some hw bugs.
> > > > The race I'm seeing is where you thought the GART entry is not
> > > > issue,
> > > > tossed an object, but the device didn't suspend, so might still
> > > > use
> > > > it.
> > > > 
> > > > I guess if we're clearly separating the sw allocation of the
> > > > TTM_TT
> > > > with
> > > > the physical entries in the GART that should all work, but
> > > > feels
> > > > a
> > > > bit
> > > > tricky. The race I've seen is essentially these two getting out
> > > > of
> > > > sync.
> > > > 
> > > > So maybe it was me who's stuck.
> > > > 
> > > > What I wonder is whether it works in practice, since on the
> > > > restore
> > > > side
> > > > you need to get some locks to figure out which gart mappings
> > > > exist
> > > > and
> > > > need restoring. And that's the same locks as the shrinker needs
> > > > to
> > > > figure
> > > > out whether it might need to reap a gart mapping.
> > > > 
> > > > Or do you just copy the gart entries over and restore them
> > > > exactly
> > > > as-is,
> > > > so that there's no shared locks?
> > > > 
> > > > > > That means we'll have ttm bo hanging around with GART
> > > > > > allocations/mappings
> > > > > > which aren't actually valid anymore (since they might
> > > > > > escape
> > > > > > the
> > > > > > cleanup
> > > > > > upon resume due to the race). That doesn't feel like a
> > > > > > solid
> > > > > > design
> > > > > > either.
> > > > > I'm most likely missing something, but I'm really scratching
> > > > > my
> > > > > head where
> > > > > you see a problem here.
> > > > I guess one issue is that at least traditionally, igfx drivers
> > > > have
> > > > nested
> > > > runtime pm within dma_resv lock. And dgpu drivers the other way
> > > > round.
> > > > Which is a bit awkward if you're trying for common code.
> > > > 
> > > > Cheers, Sima
> > 
>
Christian König Oct. 10, 2024, 8 a.m. UTC | #44
Am 09.10.24 um 16:17 schrieb Thomas Hellström:
> On Wed, 2024-10-09 at 15:39 +0200, Thomas Hellström wrote:
>> On Mon, 2024-10-07 at 11:08 +0200, Christian König wrote:
>>> Hi Thomas,
>>>
>>> I'm on sick leave, but I will try to answer questions and share
>>> some
>>> thoughts on this to unblock you.
>>>
>>> Am 18.09.24 um 14:57 schrieb Thomas Hellström:
>>>> Sima, Christian
>>>>
>>>> I've updated the shrinker series now with a guarded for_each
>>>> macro
>>>> instead:
>>>>
>>>> https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
>>> Yeah that looks like a big step in the right direction.
>>>
>>>> (Note I forgot to remove the export of the previous LRU walker).
>>>>
>>>>    so the midlayer argument is now not an issue anymore. The
>>>> cleanup.h
>>>> guard provides some additional protection against drivers exiting
>>>> the
>>>> LRU loop early.
>>>>
>>>> So remaining is the question whether the driver is allowed to
>>>> discard a
>>>> suggested bo to shrink from TTM.
>>>>
>>>> Arguments for:
>>>>
>>>> 1) Not allowing that would require teaching TTM about purgeable
>>>> objects.
>>> I think that is actually not correct. TTM already knows about
>>> purgeable
>>> objects.
>>>
>>> E.g. when TTM asks the driver what to do with evicted objects it is
>>> perfectly valid to return an empty placement list indicating that
>>> the
>>> backing store can just be thrown away.
>>>
>>> We use this for things like temporary buffers for example.
>>>
>>> That this doesn't apply to swapping looks like a design bug in the
>>> driver/TTM interface to me.
>> Yes we can do that with TTM, but for shrinking there's no point in
>> trying to shrink when there is no swap-space left, other than
>> purgeable
>> object. The number of shrinkable objects returned in shrinker::count
>> needs to reflect that, and *then* only those objects should be
>> selected
>> for shrinking. If we were to announce that to TTM using a callback,
>> we're actually back to version 1 of this series which was rejected by
>> you exactly since it was using callbacks a year or so ago????

Yeah that indeed doesn't sound like a good idea.

On the other hand the decision that only purgeable objects should be 
selected when there is no swap space left sounds like something TTM 
should do and not the driver.

>>>> 2) Devices who need the blitter during shrinking would want to
>>>> punt
>>>> runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
>>> I think the outcome of the discussion is that runtime PM should
>>> never
>>> be
>>> mixed into TTM swapping.
>>>
>>> You can power up blocks to enable a HW blitter for swapping, but
>>> this
>>> then can't be driven by the runtime PM framework.
>> Still that power-on might be sleeping, so what's the difference using
>> runtime-pm or not? Why should the driver implement yet another power
>> interface, just because TTM refuses to publish a sane LRU walk
>> interface?

See the discussion with Sima around the PM functions.

My understanding might be wrong, but I now think that with local memory 
you can't do the i915 approach where the PM functions are so low level 
that they can also be called inside the shrinker.

So you basically have PM functions for your whole device and PM 
functions for only the HW blocks necessary for the shrinker.

>>
>>>> 3) If those devices end up blitting (LNL) to be able to shrink,
>>>> they
>>>> would want to punt waiting for the fence to signal to kswapd to
>>>> avoid
>>>> waiting in direct reclaim.
>>> Mhm, what do you mean with that?
>> When we fired the blitter from direct reclaim, we get a fence. If we
>> wait for it in direct reclaim we will be sleeping waiting for gpu,
>> which is bad form. We'd like return a failure so the object is
>> retried
>> when idle, or from kswapd.

Oh, that is a really good point. So basically you want to avoid 
dependencies on the dma_fence.

>>>> 4) It looks like we need to resort to folio_trylock in the shmem
>>>> backup
>>>> backend when shrinking is called for gfp_t = GFP_NOFS. A failing
>>>> trylock will require a new bo.
>>> Why would a folio trylock succeed with one BO and not another?
>> Good point. We'd fail anyway but would perhaps need to call
>> SHRINK_STOP..
>>
>>> And why would that trylock something the device specific driver
>>> should
>>> handle?
>> It happens in the TTM shrinker helper called from the driver in the
>> spirit of demidlayering.
>>
>>>> Arguments against:
>>>> None really. I thought the idea of demidlayering would be to
>>>> allow
>>>> the
>>>> driver more freedom.
>>> Well that is a huge argument against it. Giving drivers more
>>> freedom
>>> is
>>> absolutely not something which turned out to be valuable in the
>>> past.
>> So then what's the point of demidlayering?

So that drivers can prepare the environment for TTM to work with instead 
of TTM asking for it.

In your case for example that means powering up HW blocks so that BOs 
could be moved.

>>> Instead we should put device drivers in a very strict corset of
>>> validated approaches to do things.
>>>
>>> Background is that in my experience driver developers are perfectly
>>> willing to do unclean approaches which only work in like 99% of all
>>> cases just to get a bit more performance or simpler driver
>>> implementation.
>>>
>>> Those approaches are not legal and in my opinion as subsystem
>>> maintainer
>>> I think we need to be more strict and push back much harder on
>>> stuff
>>> like that.
>> Still, historically that has made developers abandon common
>> components
>> for driver-specific solutions.
>>
>> And the question is still not answered.
>>
>> Exactly *why* can't the driver fail and continue traversing the LRU,
>> because all our argumentation revolves around this and you have yet
>> to
>> provide an objective reason why.
> And in the end, while I think there definitely things worthy of
> discussion in this series,
>
> I don't think there is a point in trying to land a LNL shrinker
> operating against a crippled TTM LRU walk interface, nor do I think
> anyone would want to attempt to port i915 over, and reworking it three
> times I'm now pretty familiar with what works and what doesn't.
>
> So question becomes, with proper reviews can I merge the series here as
> is, *with* the de-midlayered LRU walk and noting your advise against
> it, or not?

More or less yes, I still have a bad feeling about it but we probably 
need to see the whole thing getting used to judge if it really works or not.

I mean it's not UAPI we are talking about, so even if we find in 5years 
from now that it was a bad idea we can still roll it back and try 
something else.

So yeah, feel free to go ahead.

Regards,
Christian.


>
> Thanks,
> Thomas
>
>
>
>
>
>> /Thomas
>>
>>
>>
>>
>>> Regards,
>>> Christian.
>>>
>>>> So any feedback appreciated. If that is found acceptable we can
>>>> proceed
>>>> with reviewing this patch and also with the shrinker series.
>>>>
>>>> Thanks,
>>>> Thomas
>>>>
>>>>
>>>> On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
>>>>> On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König
>>>>> wrote:
>>>>>> Am 27.08.24 um 19:53 schrieb Daniel Vetter:
>>>>>>> On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter
>>>>>>> wrote:
>>>>>>>> On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian König
>>>>>>>> wrote:
>>>>>>>>> Completely agree that this is complicated, but I still
>>>>>>>>> don't
>>>>>>>>> see the need
>>>>>>>>> for it.
>>>>>>>>>
>>>>>>>>> Drivers just need to use pm_runtime_get_if_in_use()
>>>>>>>>> inside
>>>>>>>>> the shrinker and
>>>>>>>>> postpone all hw activity until resume.
>>>>>>>> Not good enough, at least long term I think. Also
>>>>>>>> postponing hw
>>>>>>>> activity
>>>>>>>> to resume doesn't solve the deadlock issue, if you still
>>>>>>>> need
>>>>>>>> to grab ttm
>>>>>>>> locks on resume.
>>>>>>> Pondered this specific aspect some more, and I think you
>>>>>>> still
>>>>>>> have a race
>>>>>>> here (even if you avoid the deadlock): If the condiditional
>>>>>>> rpm_get call
>>>>>>> fails there's no guarantee that the device will
>>>>>>> suspend/resume
>>>>>>> and clean
>>>>>>> up the GART mapping.
>>>>>> Well I think we have a major disconnect here. When the device
>>>>>> is
>>>>>> powered
>>>>>> down there is no GART mapping to clean up any more.
>>>>>>
>>>>>> In other words GART is a table in local memory (VRAM) when
>>>>>> the
>>>>>> device is
>>>>>> powered down this table is completely destroyed. Any BO which
>>>>>> was
>>>>>> mapped
>>>>>> inside this table is now not mapped any more.
>>>>>>
>>>>>> So when the shrinker wants to evict a BO which is marked as
>>>>>> mapped
>>>>>> to GART
>>>>>> and the device is powered down we just skip the GART
>>>>>> unmapping
>>>>>> part
>>>>>> because
>>>>>> that has already implicitly happened during power down.
>>>>>>
>>>>>> Before mapping any BO into the GART again we power the GPU up
>>>>>> through the
>>>>>> runtime PM calls. And while powering it up again the GART is
>>>>>> restored.
>>>>> My point is that you can't tell whether the device will power
>>>>> down or
>>>>> not,
>>>>> you can only tell whether there's a chance it might be powering
>>>>> down
>>>>> and
>>>>> so you can't get at the rpm reference without deadlock issues.
>>>>>
>>>>>>> The race gets a bit smaller if you use
>>>>>>> pm_runtime_get_if_active(), but even then you might catch
>>>>>>> it
>>>>>>> right when
>>>>>>> resume almost finished.
>>>>>> What race are you talking about?
>>>>>>
>>>>>> The worst thing which could happen is that we restore a GART
>>>>>> entry
>>>>>> which
>>>>>> isn't needed any more, but that is pretty much irrelevant
>>>>>> since
>>>>>> we
>>>>>> only
>>>>>> clear them to avoid some hw bugs.
>>>>> The race I'm seeing is where you thought the GART entry is not
>>>>> issue,
>>>>> tossed an object, but the device didn't suspend, so might still
>>>>> use
>>>>> it.
>>>>>
>>>>> I guess if we're clearly separating the sw allocation of the
>>>>> TTM_TT
>>>>> with
>>>>> the physical entries in the GART that should all work, but
>>>>> feels
>>>>> a
>>>>> bit
>>>>> tricky. The race I've seen is essentially these two getting out
>>>>> of
>>>>> sync.
>>>>>
>>>>> So maybe it was me who's stuck.
>>>>>
>>>>> What I wonder is whether it works in practice, since on the
>>>>> restore
>>>>> side
>>>>> you need to get some locks to figure out which gart mappings
>>>>> exist
>>>>> and
>>>>> need restoring. And that's the same locks as the shrinker needs
>>>>> to
>>>>> figure
>>>>> out whether it might need to reap a gart mapping.
>>>>>
>>>>> Or do you just copy the gart entries over and restore them
>>>>> exactly
>>>>> as-is,
>>>>> so that there's no shared locks?
>>>>>
>>>>>>> That means we'll have ttm bo hanging around with GART
>>>>>>> allocations/mappings
>>>>>>> which aren't actually valid anymore (since they might
>>>>>>> escape
>>>>>>> the
>>>>>>> cleanup
>>>>>>> upon resume due to the race). That doesn't feel like a
>>>>>>> solid
>>>>>>> design
>>>>>>> either.
>>>>>> I'm most likely missing something, but I'm really scratching
>>>>>> my
>>>>>> head where
>>>>>> you see a problem here.
>>>>> I guess one issue is that at least traditionally, igfx drivers
>>>>> have
>>>>> nested
>>>>> runtime pm within dma_resv lock. And dgpu drivers the other way
>>>>> round.
>>>>> Which is a bit awkward if you're trying for common code.
>>>>>
>>>>> Cheers, Sima
Thomas Hellström Oct. 11, 2024, 4:52 p.m. UTC | #45
On Thu, 2024-10-10 at 10:00 +0200, Christian König wrote:
> Am 09.10.24 um 16:17 schrieb Thomas Hellström:
> > On Wed, 2024-10-09 at 15:39 +0200, Thomas Hellström wrote:
> > > On Mon, 2024-10-07 at 11:08 +0200, Christian König wrote:
> > > > Hi Thomas,
> > > > 
> > > > I'm on sick leave, but I will try to answer questions and share
> > > > some
> > > > thoughts on this to unblock you.
> > > > 
> > > > Am 18.09.24 um 14:57 schrieb Thomas Hellström:
> > > > > Sima, Christian
> > > > > 
> > > > > I've updated the shrinker series now with a guarded for_each
> > > > > macro
> > > > > instead:
> > > > > 
> > > > > https://patchwork.freedesktop.org/patch/614514/?series=131815&rev=9
> > > > Yeah that looks like a big step in the right direction.
> > > > 
> > > > > (Note I forgot to remove the export of the previous LRU
> > > > > walker).
> > > > > 
> > > > >    so the midlayer argument is now not an issue anymore. The
> > > > > cleanup.h
> > > > > guard provides some additional protection against drivers
> > > > > exiting
> > > > > the
> > > > > LRU loop early.
> > > > > 
> > > > > So remaining is the question whether the driver is allowed to
> > > > > discard a
> > > > > suggested bo to shrink from TTM.
> > > > > 
> > > > > Arguments for:
> > > > > 
> > > > > 1) Not allowing that would require teaching TTM about
> > > > > purgeable
> > > > > objects.
> > > > I think that is actually not correct. TTM already knows about
> > > > purgeable
> > > > objects.
> > > > 
> > > > E.g. when TTM asks the driver what to do with evicted objects
> > > > it is
> > > > perfectly valid to return an empty placement list indicating
> > > > that
> > > > the
> > > > backing store can just be thrown away.
> > > > 
> > > > We use this for things like temporary buffers for example.
> > > > 
> > > > That this doesn't apply to swapping looks like a design bug in
> > > > the
> > > > driver/TTM interface to me.
> > > Yes we can do that with TTM, but for shrinking there's no point
> > > in
> > > trying to shrink when there is no swap-space left, other than
> > > purgeable
> > > object. The number of shrinkable objects returned in
> > > shrinker::count
> > > needs to reflect that, and *then* only those objects should be
> > > selected
> > > for shrinking. If we were to announce that to TTM using a
> > > callback,
> > > we're actually back to version 1 of this series which was
> > > rejected by
> > > you exactly since it was using callbacks a year or so ago????
> 
> Yeah that indeed doesn't sound like a good idea.
> 
> On the other hand the decision that only purgeable objects should be 
> selected when there is no swap space left sounds like something TTM 
> should do and not the driver.
> 
> > > > > 2) Devices who need the blitter during shrinking would want
> > > > > to
> > > > > punt
> > > > > runtime_pm_get() to kswapd to avoid sleeping direct reclaim.
> > > > I think the outcome of the discussion is that runtime PM should
> > > > never
> > > > be
> > > > mixed into TTM swapping.
> > > > 
> > > > You can power up blocks to enable a HW blitter for swapping,
> > > > but
> > > > this
> > > > then can't be driven by the runtime PM framework.
> > > Still that power-on might be sleeping, so what's the difference
> > > using
> > > runtime-pm or not? Why should the driver implement yet another
> > > power
> > > interface, just because TTM refuses to publish a sane LRU walk
> > > interface?
> 
> See the discussion with Sima around the PM functions.
> 
> My understanding might be wrong, but I now think that with local
> memory 
> you can't do the i915 approach where the PM functions are so low
> level 
> that they can also be called inside the shrinker.
> 
> So you basically have PM functions for your whole device and PM 
> functions for only the HW blocks necessary for the shrinker.
> 
> > > 
> > > > > 3) If those devices end up blitting (LNL) to be able to
> > > > > shrink,
> > > > > they
> > > > > would want to punt waiting for the fence to signal to kswapd
> > > > > to
> > > > > avoid
> > > > > waiting in direct reclaim.
> > > > Mhm, what do you mean with that?
> > > When we fired the blitter from direct reclaim, we get a fence. If
> > > we
> > > wait for it in direct reclaim we will be sleeping waiting for
> > > gpu,
> > > which is bad form. We'd like return a failure so the object is
> > > retried
> > > when idle, or from kswapd.
> 
> Oh, that is a really good point. So basically you want to avoid 
> dependencies on the dma_fence.
> 
> > > > > 4) It looks like we need to resort to folio_trylock in the
> > > > > shmem
> > > > > backup
> > > > > backend when shrinking is called for gfp_t = GFP_NOFS. A
> > > > > failing
> > > > > trylock will require a new bo.
> > > > Why would a folio trylock succeed with one BO and not another?
> > > Good point. We'd fail anyway but would perhaps need to call
> > > SHRINK_STOP..
> > > 
> > > > And why would that trylock something the device specific driver
> > > > should
> > > > handle?
> > > It happens in the TTM shrinker helper called from the driver in
> > > the
> > > spirit of demidlayering.
> > > 
> > > > > Arguments against:
> > > > > None really. I thought the idea of demidlayering would be to
> > > > > allow
> > > > > the
> > > > > driver more freedom.
> > > > Well that is a huge argument against it. Giving drivers more
> > > > freedom
> > > > is
> > > > absolutely not something which turned out to be valuable in the
> > > > past.
> > > So then what's the point of demidlayering?
> 
> So that drivers can prepare the environment for TTM to work with
> instead 
> of TTM asking for it.
> 
> In your case for example that means powering up HW blocks so that BOs
> could be moved.
> 
> > > > Instead we should put device drivers in a very strict corset of
> > > > validated approaches to do things.
> > > > 
> > > > Background is that in my experience driver developers are
> > > > perfectly
> > > > willing to do unclean approaches which only work in like 99% of
> > > > all
> > > > cases just to get a bit more performance or simpler driver
> > > > implementation.
> > > > 
> > > > Those approaches are not legal and in my opinion as subsystem
> > > > maintainer
> > > > I think we need to be more strict and push back much harder on
> > > > stuff
> > > > like that.
> > > Still, historically that has made developers abandon common
> > > components
> > > for driver-specific solutions.
> > > 
> > > And the question is still not answered.
> > > 
> > > Exactly *why* can't the driver fail and continue traversing the
> > > LRU,
> > > because all our argumentation revolves around this and you have
> > > yet
> > > to
> > > provide an objective reason why.
> > And in the end, while I think there definitely things worthy of
> > discussion in this series,
> > 
> > I don't think there is a point in trying to land a LNL shrinker
> > operating against a crippled TTM LRU walk interface, nor do I think
> > anyone would want to attempt to port i915 over, and reworking it
> > three
> > times I'm now pretty familiar with what works and what doesn't.
> > 
> > So question becomes, with proper reviews can I merge the series
> > here as
> > is, *with* the de-midlayered LRU walk and noting your advise
> > against
> > it, or not?
> 
> More or less yes, I still have a bad feeling about it but we probably
> need to see the whole thing getting used to judge if it really works
> or not.
> 
> I mean it's not UAPI we are talking about, so even if we find in
> 5years 
> from now that it was a bad idea we can still roll it back and try 
> something else.
> 
> So yeah, feel free to go ahead.

Thanks, I'll respin a version moving the checks you pointed out,
(get_nr_swap_pages() + all other checks TTM can really do) etc, into
TTM helpers to simplify such a change in the future if it becomes
needed.

/Thomas


> 
> Regards,
> Christian.
> 
> 
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > 
> > 
> > 
> > > /Thomas
> > > 
> > > 
> > > 
> > > 
> > > > Regards,
> > > > Christian.
> > > > 
> > > > > So any feedback appreciated. If that is found acceptable we
> > > > > can
> > > > > proceed
> > > > > with reviewing this patch and also with the shrinker series.
> > > > > 
> > > > > Thanks,
> > > > > Thomas
> > > > > 
> > > > > 
> > > > > On Mon, 2024-09-02 at 13:07 +0200, Daniel Vetter wrote:
> > > > > > On Wed, Aug 28, 2024 at 02:20:34PM +0200, Christian König
> > > > > > wrote:
> > > > > > > Am 27.08.24 um 19:53 schrieb Daniel Vetter:
> > > > > > > > On Tue, Aug 27, 2024 at 06:52:13PM +0200, Daniel Vetter
> > > > > > > > wrote:
> > > > > > > > > On Thu, Aug 22, 2024 at 03:19:29PM +0200, Christian
> > > > > > > > > König
> > > > > > > > > wrote:
> > > > > > > > > > Completely agree that this is complicated, but I
> > > > > > > > > > still
> > > > > > > > > > don't
> > > > > > > > > > see the need
> > > > > > > > > > for it.
> > > > > > > > > > 
> > > > > > > > > > Drivers just need to use pm_runtime_get_if_in_use()
> > > > > > > > > > inside
> > > > > > > > > > the shrinker and
> > > > > > > > > > postpone all hw activity until resume.
> > > > > > > > > Not good enough, at least long term I think. Also
> > > > > > > > > postponing hw
> > > > > > > > > activity
> > > > > > > > > to resume doesn't solve the deadlock issue, if you
> > > > > > > > > still
> > > > > > > > > need
> > > > > > > > > to grab ttm
> > > > > > > > > locks on resume.
> > > > > > > > Pondered this specific aspect some more, and I think
> > > > > > > > you
> > > > > > > > still
> > > > > > > > have a race
> > > > > > > > here (even if you avoid the deadlock): If the
> > > > > > > > condiditional
> > > > > > > > rpm_get call
> > > > > > > > fails there's no guarantee that the device will
> > > > > > > > suspend/resume
> > > > > > > > and clean
> > > > > > > > up the GART mapping.
> > > > > > > Well I think we have a major disconnect here. When the
> > > > > > > device
> > > > > > > is
> > > > > > > powered
> > > > > > > down there is no GART mapping to clean up any more.
> > > > > > > 
> > > > > > > In other words GART is a table in local memory (VRAM)
> > > > > > > when
> > > > > > > the
> > > > > > > device is
> > > > > > > powered down this table is completely destroyed. Any BO
> > > > > > > which
> > > > > > > was
> > > > > > > mapped
> > > > > > > inside this table is now not mapped any more.
> > > > > > > 
> > > > > > > So when the shrinker wants to evict a BO which is marked
> > > > > > > as
> > > > > > > mapped
> > > > > > > to GART
> > > > > > > and the device is powered down we just skip the GART
> > > > > > > unmapping
> > > > > > > part
> > > > > > > because
> > > > > > > that has already implicitly happened during power down.
> > > > > > > 
> > > > > > > Before mapping any BO into the GART again we power the
> > > > > > > GPU up
> > > > > > > through the
> > > > > > > runtime PM calls. And while powering it up again the GART
> > > > > > > is
> > > > > > > restored.
> > > > > > My point is that you can't tell whether the device will
> > > > > > power
> > > > > > down or
> > > > > > not,
> > > > > > you can only tell whether there's a chance it might be
> > > > > > powering
> > > > > > down
> > > > > > and
> > > > > > so you can't get at the rpm reference without deadlock
> > > > > > issues.
> > > > > > 
> > > > > > > > The race gets a bit smaller if you use
> > > > > > > > pm_runtime_get_if_active(), but even then you might
> > > > > > > > catch
> > > > > > > > it
> > > > > > > > right when
> > > > > > > > resume almost finished.
> > > > > > > What race are you talking about?
> > > > > > > 
> > > > > > > The worst thing which could happen is that we restore a
> > > > > > > GART
> > > > > > > entry
> > > > > > > which
> > > > > > > isn't needed any more, but that is pretty much irrelevant
> > > > > > > since
> > > > > > > we
> > > > > > > only
> > > > > > > clear them to avoid some hw bugs.
> > > > > > The race I'm seeing is where you thought the GART entry is
> > > > > > not
> > > > > > issue,
> > > > > > tossed an object, but the device didn't suspend, so might
> > > > > > still
> > > > > > use
> > > > > > it.
> > > > > > 
> > > > > > I guess if we're clearly separating the sw allocation of
> > > > > > the
> > > > > > TTM_TT
> > > > > > with
> > > > > > the physical entries in the GART that should all work, but
> > > > > > feels
> > > > > > a
> > > > > > bit
> > > > > > tricky. The race I've seen is essentially these two getting
> > > > > > out
> > > > > > of
> > > > > > sync.
> > > > > > 
> > > > > > So maybe it was me who's stuck.
> > > > > > 
> > > > > > What I wonder is whether it works in practice, since on the
> > > > > > restore
> > > > > > side
> > > > > > you need to get some locks to figure out which gart
> > > > > > mappings
> > > > > > exist
> > > > > > and
> > > > > > need restoring. And that's the same locks as the shrinker
> > > > > > needs
> > > > > > to
> > > > > > figure
> > > > > > out whether it might need to reap a gart mapping.
> > > > > > 
> > > > > > Or do you just copy the gart entries over and restore them
> > > > > > exactly
> > > > > > as-is,
> > > > > > so that there's no shared locks?
> > > > > > 
> > > > > > > > That means we'll have ttm bo hanging around with GART
> > > > > > > > allocations/mappings
> > > > > > > > which aren't actually valid anymore (since they might
> > > > > > > > escape
> > > > > > > > the
> > > > > > > > cleanup
> > > > > > > > upon resume due to the race). That doesn't feel like a
> > > > > > > > solid
> > > > > > > > design
> > > > > > > > either.
> > > > > > > I'm most likely missing something, but I'm really
> > > > > > > scratching
> > > > > > > my
> > > > > > > head where
> > > > > > > you see a problem here.
> > > > > > I guess one issue is that at least traditionally, igfx
> > > > > > drivers
> > > > > > have
> > > > > > nested
> > > > > > runtime pm within dma_resv lock. And dgpu drivers the other
> > > > > > way
> > > > > > round.
> > > > > > Which is a bit awkward if you're trying for common code.
> > > > > > 
> > > > > > Cheers, Sima
diff mbox series

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 0131ec802066..41bee8696e69 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -45,6 +45,7 @@ 
 #include <linux/dma-resv.h>
 
 #include "ttm_module.h"
+#include "ttm_bo_util.h"
 
 static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
 					struct ttm_placement *placement)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 3c07f4712d5c..03e28e3d0d03 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -37,6 +37,8 @@ 
 
 #include <drm/drm_cache.h>
 
+#include "ttm_bo_util.h"
+
 struct ttm_transfer_obj {
 	struct ttm_buffer_object base;
 	struct ttm_buffer_object *bo;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.h b/drivers/gpu/drm/ttm/ttm_bo_util.h
new file mode 100644
index 000000000000..c19b50809208
--- /dev/null
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.h
@@ -0,0 +1,67 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/**************************************************************************
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef _TTM_BO_UTIL_H_
+#define _TTM_BO_UTIL_H_
+
+struct ww_acquire_ctx;
+
+struct ttm_buffer_object;
+struct ttm_operation_ctx;
+struct ttm_lru_walk;
+
+/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
+struct ttm_lru_walk_ops {
+	/**
+	 * process_bo - Process this bo.
+	 * @walk: struct ttm_lru_walk describing the walk.
+	 * @bo: A locked and referenced buffer object.
+	 *
+	 * Return: Negative error code on error, User-defined positive value
+	 * (typically, but not always, size of the processed bo) on success.
+	 * On success, the returned values are summed by the walk and the
+	 * walk exits when its target is met.
+	 * 0 also indicates success, -EBUSY means this bo was skipped.
+	 */
+	s64 (*process_bo)(struct ttm_lru_walk *walk,
+			  struct ttm_buffer_object *bo);
+};
+
+/**
+ * struct ttm_lru_walk - Structure describing a LRU walk.
+ */
+struct ttm_lru_walk {
+	/** @ops: Pointer to the ops structure. */
+	const struct ttm_lru_walk_ops *ops;
+	/** @ctx: Pointer to the struct ttm_operation_ctx. */
+	struct ttm_operation_ctx *ctx;
+	/** @ticket: The struct ww_acquire_ctx if any. */
+	struct ww_acquire_ctx *ticket;
+	/** @tryock_only: Only use trylock for locking. */
+	bool trylock_only;
+};
+
+s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
+			   struct ttm_resource_manager *man, s64 target);
+
+#endif
diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
index d1a732d56259..5f7c967222a2 100644
--- a/include/drm/ttm/ttm_bo.h
+++ b/include/drm/ttm/ttm_bo.h
@@ -194,41 +194,6 @@  struct ttm_operation_ctx {
 	uint64_t bytes_moved;
 };
 
-struct ttm_lru_walk;
-
-/** struct ttm_lru_walk_ops - Operations for a LRU walk. */
-struct ttm_lru_walk_ops {
-	/**
-	 * process_bo - Process this bo.
-	 * @walk: struct ttm_lru_walk describing the walk.
-	 * @bo: A locked and referenced buffer object.
-	 *
-	 * Return: Negative error code on error, User-defined positive value
-	 * (typically, but not always, size of the processed bo) on success.
-	 * On success, the returned values are summed by the walk and the
-	 * walk exits when its target is met.
-	 * 0 also indicates success, -EBUSY means this bo was skipped.
-	 */
-	s64 (*process_bo)(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo);
-};
-
-/**
- * struct ttm_lru_walk - Structure describing a LRU walk.
- */
-struct ttm_lru_walk {
-	/** @ops: Pointer to the ops structure. */
-	const struct ttm_lru_walk_ops *ops;
-	/** @ctx: Pointer to the struct ttm_operation_ctx. */
-	struct ttm_operation_ctx *ctx;
-	/** @ticket: The struct ww_acquire_ctx if any. */
-	struct ww_acquire_ctx *ticket;
-	/** @tryock_only: Only use trylock for locking. */
-	bool trylock_only;
-};
-
-s64 ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct ttm_device *bdev,
-			   struct ttm_resource_manager *man, s64 target);
-
 /**
  * ttm_bo_get - reference a struct ttm_buffer_object
  *