diff mbox series

[v5,04/12] drm/ttm, drm/amdgpu, drm/xe: Consider hitch moves within bulk sublist moves

Message ID 20240618071820.130917-5-thomas.hellstrom@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series TTM shrinker helpers and xe buffer object shrinker | expand

Commit Message

Thomas Hellström June 18, 2024, 7:18 a.m. UTC
To address the problem with hitches moving when bulk move
sublists are lru-bumped, register the list cursors with the
ttm_lru_bulk_move structure when traversing its list, and
when lru-bumping the list, move the cursor hitch to the tail.
This also means it's mandatory for drivers to call
ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
initializing and finalizing the bulk move structure, so add
those calls to the amdgpu- and xe driver.

Compared to v1 this is slightly more code but less fragile
and hopefully easier to understand.

Changes in previous series:
- Completely rework the functionality
- Avoid a NULL pointer dereference assigning manager->mem_type
- Remove some leftover code causing build problems
v2:
- For hitch bulk tail moves, store the mem_type in the cursor
  instead of with the manager.
v3:
- Remove leftover mem_type member from change in v2.

Cc: Christian König <christian.koenig@amd.com>
Cc: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <dri-devel@lists.freedesktop.org>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
 drivers/gpu/drm/ttm/ttm_resource.c     | 89 ++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_vm.c             |  4 ++
 include/drm/ttm/ttm_resource.h         | 56 ++++++++++------
 4 files changed, 132 insertions(+), 21 deletions(-)

Comments

Matthew Brost June 19, 2024, 3:37 a.m. UTC | #1
On Tue, Jun 18, 2024 at 09:18:12AM +0200, Thomas Hellström wrote:

Ugh, replying to correct version again...

> To address the problem with hitches moving when bulk move
> sublists are lru-bumped, register the list cursors with the
> ttm_lru_bulk_move structure when traversing its list, and
> when lru-bumping the list, move the cursor hitch to the tail.

- So the hitch moves to the tail (last) which points to the next item in
  the LRU list
- Then bulk is moved which is from first -> last to the end of the LRU
  list
- Now the hitch remains in the correct position in the list (i.e. it
  didn't move with the bulk)

Did I get that right?

> This also means it's mandatory for drivers to call
> ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
> initializing and finalizing the bulk move structure, so add
> those calls to the amdgpu- and xe driver.
> 
> Compared to v1 this is slightly more code but less fragile
> and hopefully easier to understand.
> 
> Changes in previous series:
> - Completely rework the functionality
> - Avoid a NULL pointer dereference assigning manager->mem_type
> - Remove some leftover code causing build problems
> v2:
> - For hitch bulk tail moves, store the mem_type in the cursor
>   instead of with the manager.
> v3:
> - Remove leftover mem_type member from change in v2.
> 
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
>  drivers/gpu/drm/ttm/ttm_resource.c     | 89 ++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_vm.c             |  4 ++
>  include/drm/ttm/ttm_resource.h         | 56 ++++++++++------
>  4 files changed, 132 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3abfa66d72a2..97743993d711 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2420,6 +2420,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (r)
>  		return r;
>  
> +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> +
>  	vm->is_compute_context = false;
>  
>  	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
> @@ -2484,6 +2486,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  error_free_delayed:
>  	dma_fence_put(vm->last_tlb_flush);
>  	dma_fence_put(vm->last_unlocked);
> +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
>  	amdgpu_vm_fini_entities(vm);
>  
>  	return r;
> @@ -2640,6 +2643,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>  		}
>  	}
>  
> +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
> index 9c8b6499edfb..a03090683e79 100644
> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> @@ -33,6 +33,49 @@
>  
>  #include <drm/drm_util.h>
>  
> +/* Detach the cursor from the bulk move list*/
> +static void
> +ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
> +{

A lockdep annotation wouldn't hurt here.

> +	cursor->bulk = NULL;
> +	list_del_init(&cursor->bulk_link);
> +}
> +
> +/* Move the cursor to the end of the bulk move list it's in */
> +static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move *bulk,
> +					       struct ttm_resource_cursor *cursor)
> +{
> +	struct ttm_lru_bulk_move_pos *pos;
> +

A lockdep annotation wouldn't hurt here too.

> +	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
> +		list_del_init(&cursor->bulk_link);
> +		return;
> +	}
> +
> +	pos = &bulk->pos[cursor->mem_type][cursor->priority];
> +	if (pos)

'if (pos->last)'?

As 'if (pos)' is going to always be true given you are using the address
of operator (&) on an instantiated struct ttm_lru_bulk_move_pos.

> +		list_move(&cursor->hitch.link, &pos->last->lru.link);

This should be list_move_tail, right? So last->next == hitch.

As the code is last->prev == hitch which means the hitch would be
included in the bulk move, right?

> +	ttm_resource_cursor_clear_bulk(cursor);
> +}
> +
> +/* Move all cursors attached to a bulk move to its end */
> +static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> +	struct ttm_resource_cursor *cursor, *next;
> +
> +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
> +		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
> +}
> +
> +/* Remove a cursor from an empty bulk move list */
> +static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> +	struct ttm_resource_cursor *cursor, *next;
> +
> +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
> +		ttm_resource_cursor_clear_bulk(cursor);
> +}
> +
>  /**
>   * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
>   * @cursor: The struct ttm_resource_cursor to finalize.
> @@ -45,6 +88,7 @@ void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor)
>  {
>  	lockdep_assert_held(&cursor->man->bdev->lru_lock);
>  	list_del_init(&cursor->hitch.link);
> +	ttm_resource_cursor_clear_bulk(cursor);
>  }
>  
>  /**
> @@ -73,9 +117,27 @@ void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor)
>  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
>  {
>  	memset(bulk, 0, sizeof(*bulk));
> +	INIT_LIST_HEAD(&bulk->cursor_list);
>  }
>  EXPORT_SYMBOL(ttm_lru_bulk_move_init);
>  
> +/**
> + * ttm_lru_bulk_move_fini - finalize a bulk move structure
> + * @bdev: The struct ttm_device
> + * @bulk: the structure to finalize
> + *
> + * Sanity checks that bulk moves don't have any
> + * resources left and hence no cursors attached.
> + */
> +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> +			    struct ttm_lru_bulk_move *bulk)
> +{
> +	spin_lock(&bdev->lru_lock);
> +	ttm_bulk_move_drop_cursors(bulk);
> +	spin_unlock(&bdev->lru_lock);
> +}
> +EXPORT_SYMBOL(ttm_lru_bulk_move_fini);
> +
>  /**
>   * ttm_lru_bulk_move_tail - bulk move range of resources to the LRU tail.
>   *
> @@ -88,6 +150,7 @@ void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk)
>  {
>  	unsigned i, j;
>  
> +	ttm_bulk_move_adjust_cursors(bulk);
>  	for (i = 0; i < TTM_NUM_MEM_TYPES; ++i) {
>  		for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>  			struct ttm_lru_bulk_move_pos *pos = &bulk->pos[i][j];
> @@ -515,6 +578,29 @@ void ttm_resource_manager_debug(struct ttm_resource_manager *man,
>  }
>  EXPORT_SYMBOL(ttm_resource_manager_debug);
>  
> +static void
> +ttm_resource_cursor_check_bulk(struct ttm_resource_cursor *cursor,
> +			       struct ttm_lru_item *next_lru)
> +{
> +	struct ttm_resource *next = ttm_lru_item_to_res(next_lru);
> +	struct ttm_lru_bulk_move *bulk = NULL;
> +	struct ttm_buffer_object *bo = next->bo;
> +
> +	lockdep_assert_held(&cursor->man->bdev->lru_lock);
> +	if (bo && bo->resource == next)
> +		bulk = bo->bulk_move;


Can you explain what the above if statement is doing, struggling a bit
here. Is this a weird case where the LRU item (struct ttm_resource) is
fully (1st condition) or partially (2nd condition) detached from a BO?

> +
> +	if (cursor->bulk != bulk) {
> +		if (bulk) {
> +			list_move_tail(&cursor->bulk_link, &bulk->cursor_list);
> +			cursor->mem_type = next->mem_type;
> +		} else {
> +			list_del_init(&cursor->bulk_link);
> +		}
> +		cursor->bulk = bulk;
> +	}
> +}
> +
>  /**
>   * ttm_resource_manager_first() - Start iterating over the resources
>   * of a resource manager
> @@ -535,6 +621,7 @@ ttm_resource_manager_first(struct ttm_resource_manager *man,
>  	cursor->priority = 0;
>  	cursor->man = man;
>  	ttm_lru_item_init(&cursor->hitch, TTM_LRU_HITCH);
> +	INIT_LIST_HEAD(&cursor->bulk_link);
>  	list_add(&cursor->hitch.link, &man->lru[cursor->priority]);
>  
>  	return ttm_resource_manager_next(cursor);
> @@ -559,6 +646,7 @@ ttm_resource_manager_next(struct ttm_resource_cursor *cursor)
>  		lru = &cursor->hitch;
>  		list_for_each_entry_continue(lru, &man->lru[cursor->priority], link) {
>  			if (ttm_lru_item_is_res(lru)) {
> +				ttm_resource_cursor_check_bulk(cursor, lru);
>  				list_move(&cursor->hitch.link, &lru->link);

Sorry noticing this here from a different patch. Shouldn't this be
list_move_tail so if the LRU can't be evicted we don't pick it again?

Matt

>  				return ttm_lru_item_to_res(lru);
>  			}
> @@ -568,6 +656,7 @@ ttm_resource_manager_next(struct ttm_resource_cursor *cursor)
>  			break;
>  
>  		list_move(&cursor->hitch.link, &man->lru[cursor->priority]);
> +		ttm_resource_cursor_clear_bulk(cursor);
>  	}
>  
>  	ttm_resource_cursor_fini_locked(cursor);
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 61d4d95a5377..226da3c74f9c 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1339,6 +1339,8 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>  
>  	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
>  
> +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> +
>  	INIT_LIST_HEAD(&vm->preempt.exec_queues);
>  	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
>  
> @@ -1462,6 +1464,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>  	mutex_destroy(&vm->snap_mutex);
>  	for_each_tile(tile, xe, id)
>  		xe_range_fence_tree_fini(&vm->rftree[id]);
> +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
>  	kfree(vm);
>  	if (flags & XE_VM_FLAG_LR_MODE)
>  		xe_pm_runtime_put(xe);
> @@ -1605,6 +1608,7 @@ static void vm_destroy_work_func(struct work_struct *w)
>  		XE_WARN_ON(vm->pt_root[id]);
>  
>  	trace_xe_vm_free(vm);
> +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
>  	kfree(vm);
>  }
>  
> diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
> index 8fac781f641e..571abb4861a6 100644
> --- a/include/drm/ttm/ttm_resource.h
> +++ b/include/drm/ttm/ttm_resource.h
> @@ -269,26 +269,6 @@ ttm_lru_item_to_res(struct ttm_lru_item *item)
>  	return container_of(item, struct ttm_resource, lru);
>  }
>  
> -/**
> - * struct ttm_resource_cursor
> - *
> - * @man: The resource manager currently being iterated over.
> - * @hitch: A hitch list node inserted before the next resource
> - * to iterate over.
> - * @priority: the current priority
> - *
> - * Cursor to iterate over the resources in a manager.
> - */
> -struct ttm_resource_cursor {
> -	struct ttm_resource_manager *man;
> -	struct ttm_lru_item hitch;
> -	unsigned int priority;
> -};
> -
> -void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor);
> -
> -void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> -
>  /**
>   * struct ttm_lru_bulk_move_pos
>   *
> @@ -304,8 +284,9 @@ struct ttm_lru_bulk_move_pos {
>  
>  /**
>   * struct ttm_lru_bulk_move
> - *
>   * @pos: first/last lru entry for resources in the each domain/priority
> + * @cursor_list: The list of cursors currently traversing any of
> + * the sublists of @pos. Protected by the ttm device's lru_lock.
>   *
>   * Container for the current bulk move state. Should be used with
>   * ttm_lru_bulk_move_init() and ttm_bo_set_bulk_move().
> @@ -315,8 +296,39 @@ struct ttm_lru_bulk_move_pos {
>   */
>  struct ttm_lru_bulk_move {
>  	struct ttm_lru_bulk_move_pos pos[TTM_NUM_MEM_TYPES][TTM_MAX_BO_PRIORITY];
> +	struct list_head cursor_list;
>  };
>  
> +/**
> + * struct ttm_resource_cursor
> + * @man: The resource manager currently being iterated over
> + * @hitch: A hitch list node inserted before the next resource
> + * to iterate over.
> + * @bulk_link: A list link for the list of cursors traversing the
> + * bulk sublist of @bulk. Protected by the ttm device's lru_lock.
> + * @bulk: Pointer to struct ttm_lru_bulk_move whose subrange @hitch is
> + * inserted to. NULL if none. Never dereference this pointer since
> + * the struct ttm_lru_bulk_move object pointed to might have been
> + * freed. The pointer is only for comparison.
> + * @mem_type: The memory type of the LRU list being traversed.
> + * This field is valid iff @bulk != NULL.
> + * @priority: the current priority
> + *
> + * Cursor to iterate over the resources in a manager.
> + */
> +struct ttm_resource_cursor {
> +	struct ttm_resource_manager *man;
> +	struct ttm_lru_item hitch;
> +	struct list_head bulk_link;
> +	struct ttm_lru_bulk_move *bulk;
> +	unsigned int mem_type;
> +	unsigned int priority;
> +};
> +
> +void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor);
> +
> +void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> +
>  /**
>   * struct ttm_kmap_iter_iomap - Specialization for a struct io_mapping +
>   * struct sg_table backed struct ttm_resource.
> @@ -405,6 +417,8 @@ ttm_resource_manager_cleanup(struct ttm_resource_manager *man)
>  
>  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk);
>  void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk);
> +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> +			    struct ttm_lru_bulk_move *bulk);
>  
>  void ttm_resource_add_bulk_move(struct ttm_resource *res,
>  				struct ttm_buffer_object *bo);
> -- 
> 2.44.0
>
Thomas Hellström June 19, 2024, 8:24 a.m. UTC | #2
On Wed, 2024-06-19 at 03:37 +0000, Matthew Brost wrote:
> On Tue, Jun 18, 2024 at 09:18:12AM +0200, Thomas Hellström wrote:
> 
> Ugh, replying to correct version again...
> 
> > To address the problem with hitches moving when bulk move
> > sublists are lru-bumped, register the list cursors with the
> > ttm_lru_bulk_move structure when traversing its list, and
> > when lru-bumping the list, move the cursor hitch to the tail.
> 
> - So the hitch moves to the tail (last) which points to the next item
> in
>   the LRU list
> - Then bulk is moved which is from first -> last to the end of the
> LRU
>   list
> - Now the hitch remains in the correct position in the list (i.e. it
>   didn't move with the bulk)
> 
> Did I get that right?

Yes, correct.

> 
> > This also means it's mandatory for drivers to call
> > ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
> > initializing and finalizing the bulk move structure, so add
> > those calls to the amdgpu- and xe driver.
> > 
> > Compared to v1 this is slightly more code but less fragile
> > and hopefully easier to understand.
> > 
> > Changes in previous series:
> > - Completely rework the functionality
> > - Avoid a NULL pointer dereference assigning manager->mem_type
> > - Remove some leftover code causing build problems
> > v2:
> > - For hitch bulk tail moves, store the mem_type in the cursor
> >   instead of with the manager.
> > v3:
> > - Remove leftover mem_type member from change in v2.
> > 
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
> >  drivers/gpu/drm/ttm/ttm_resource.c     | 89
> > ++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_vm.c             |  4 ++
> >  include/drm/ttm/ttm_resource.h         | 56 ++++++++++------
> >  4 files changed, 132 insertions(+), 21 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 3abfa66d72a2..97743993d711 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -2420,6 +2420,8 @@ int amdgpu_vm_init(struct amdgpu_device
> > *adev, struct amdgpu_vm *vm,
> >  	if (r)
> >  		return r;
> >  
> > +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> > +
> >  	vm->is_compute_context = false;
> >  
> >  	vm->use_cpu_for_update = !!(adev-
> > >vm_manager.vm_update_mode &
> > @@ -2484,6 +2486,7 @@ int amdgpu_vm_init(struct amdgpu_device
> > *adev, struct amdgpu_vm *vm,
> >  error_free_delayed:
> >  	dma_fence_put(vm->last_tlb_flush);
> >  	dma_fence_put(vm->last_unlocked);
> > +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm-
> > >lru_bulk_move);
> >  	amdgpu_vm_fini_entities(vm);
> >  
> >  	return r;
> > @@ -2640,6 +2643,7 @@ void amdgpu_vm_fini(struct amdgpu_device
> > *adev, struct amdgpu_vm *vm)
> >  		}
> >  	}
> >  
> > +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm-
> > >lru_bulk_move);
> >  }
> >  
> >  /**
> > diff --git a/drivers/gpu/drm/ttm/ttm_resource.c
> > b/drivers/gpu/drm/ttm/ttm_resource.c
> > index 9c8b6499edfb..a03090683e79 100644
> > --- a/drivers/gpu/drm/ttm/ttm_resource.c
> > +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> > @@ -33,6 +33,49 @@
> >  
> >  #include <drm/drm_util.h>
> >  
> > +/* Detach the cursor from the bulk move list*/
> > +static void
> > +ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
> > +{
> 
> A lockdep annotation wouldn't hurt here.

Will add.

> 
> > +	cursor->bulk = NULL;
> > +	list_del_init(&cursor->bulk_link);
> > +}
> > +
> > +/* Move the cursor to the end of the bulk move list it's in */
> > +static void ttm_resource_cursor_move_bulk_tail(struct
> > ttm_lru_bulk_move *bulk,
> > +					       struct
> > ttm_resource_cursor *cursor)
> > +{
> > +	struct ttm_lru_bulk_move_pos *pos;
> > +
> 
> A lockdep annotation wouldn't hurt here too.

+1!

> 
> > +	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
> > +		list_del_init(&cursor->bulk_link);
> > +		return;
> > +	}
> > +
> > +	pos = &bulk->pos[cursor->mem_type][cursor->priority];
> > +	if (pos)
> 
> 'if (pos->last)'?
> 
> As 'if (pos)' is going to always be true given you are using the
> address
> of operator (&) on an instantiated struct ttm_lru_bulk_move_pos.

Good catch! I'll fix that up.

> 
> > +		list_move(&cursor->hitch.link, &pos->last-
> > >lru.link);
> 
> This should be list_move_tail, right? So last->next == hitch.
> 
> As the code is last->prev == hitch which means the hitch would be
> included in the bulk move, right?

It's the other way around right? list_move(a, b) will insert a as b-
>next, which is what we want.

> 
> > +	ttm_resource_cursor_clear_bulk(cursor);
> > +}
> > +
> > +/* Move all cursors attached to a bulk move to its end */
> > +static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move
> > *bulk)
> > +{
> > +	struct ttm_resource_cursor *cursor, *next;
> > +
> > +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list,
> > bulk_link)
> > +		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
> > +}
> > +
> > +/* Remove a cursor from an empty bulk move list */
> > +static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move
> > *bulk)
> > +{
> > +	struct ttm_resource_cursor *cursor, *next;
> > +
> > +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list,
> > bulk_link)
> > +		ttm_resource_cursor_clear_bulk(cursor);
> > +}
> > +
> >  /**
> >   * ttm_resource_cursor_fini_locked() - Finalize the LRU list
> > cursor usage
> >   * @cursor: The struct ttm_resource_cursor to finalize.
> > @@ -45,6 +88,7 @@ void ttm_resource_cursor_fini_locked(struct
> > ttm_resource_cursor *cursor)
> >  {
> >  	lockdep_assert_held(&cursor->man->bdev->lru_lock);
> >  	list_del_init(&cursor->hitch.link);
> > +	ttm_resource_cursor_clear_bulk(cursor);
> >  }
> >  
> >  /**
> > @@ -73,9 +117,27 @@ void ttm_resource_cursor_fini(struct
> > ttm_resource_cursor *cursor)
> >  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
> >  {
> >  	memset(bulk, 0, sizeof(*bulk));
> > +	INIT_LIST_HEAD(&bulk->cursor_list);
> >  }
> >  EXPORT_SYMBOL(ttm_lru_bulk_move_init);
> >  
> > +/**
> > + * ttm_lru_bulk_move_fini - finalize a bulk move structure
> > + * @bdev: The struct ttm_device
> > + * @bulk: the structure to finalize
> > + *
> > + * Sanity checks that bulk moves don't have any
> > + * resources left and hence no cursors attached.
> > + */
> > +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> > +			    struct ttm_lru_bulk_move *bulk)
> > +{
> > +	spin_lock(&bdev->lru_lock);
> > +	ttm_bulk_move_drop_cursors(bulk);
> > +	spin_unlock(&bdev->lru_lock);
> > +}
> > +EXPORT_SYMBOL(ttm_lru_bulk_move_fini);
> > +
> >  /**
> >   * ttm_lru_bulk_move_tail - bulk move range of resources to the
> > LRU tail.
> >   *
> > @@ -88,6 +150,7 @@ void ttm_lru_bulk_move_tail(struct
> > ttm_lru_bulk_move *bulk)
> >  {
> >  	unsigned i, j;
> >  
> > +	ttm_bulk_move_adjust_cursors(bulk);
> >  	for (i = 0; i < TTM_NUM_MEM_TYPES; ++i) {
> >  		for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
> >  			struct ttm_lru_bulk_move_pos *pos = &bulk-
> > >pos[i][j];
> > @@ -515,6 +578,29 @@ void ttm_resource_manager_debug(struct
> > ttm_resource_manager *man,
> >  }
> >  EXPORT_SYMBOL(ttm_resource_manager_debug);
> >  
> > +static void
> > +ttm_resource_cursor_check_bulk(struct ttm_resource_cursor *cursor,
> > +			       struct ttm_lru_item *next_lru)
> > +{
> > +	struct ttm_resource *next = ttm_lru_item_to_res(next_lru);
> > +	struct ttm_lru_bulk_move *bulk = NULL;
> > +	struct ttm_buffer_object *bo = next->bo;
> > +
> > +	lockdep_assert_held(&cursor->man->bdev->lru_lock);
> > +	if (bo && bo->resource == next)
> > +		bulk = bo->bulk_move;
> 
> 
> Can you explain what the above if statement is doing, struggling a
> bit
> here. Is this a weird case where the LRU item (struct ttm_resource)
> is
> fully (1st condition) or partially (2nd condition) detached from a
> BO?

Yeah, this is a weird corner case where the resource is handed over to
a ghost object, and the lock protection is not clearly specified. From
my reading of the code, at least bo->resource is not protected by the
LRU lock when clearing, but bo->bulk_move is, so given that, perhaps
the test is indeed unnecessary.

> 
> > +
> > +	if (cursor->bulk != bulk) {
> > +		if (bulk) {
> > +			list_move_tail(&cursor->bulk_link, &bulk-
> > >cursor_list);
> > +			cursor->mem_type = next->mem_type;
> > +		} else {
> > +			list_del_init(&cursor->bulk_link);
> > +		}
> > +		cursor->bulk = bulk;
> > +	}
> > +}
> > +
> >  /**
> >   * ttm_resource_manager_first() - Start iterating over the
> > resources
> >   * of a resource manager
> > @@ -535,6 +621,7 @@ ttm_resource_manager_first(struct
> > ttm_resource_manager *man,
> >  	cursor->priority = 0;
> >  	cursor->man = man;
> >  	ttm_lru_item_init(&cursor->hitch, TTM_LRU_HITCH);
> > +	INIT_LIST_HEAD(&cursor->bulk_link);
> >  	list_add(&cursor->hitch.link, &man->lru[cursor-
> > >priority]);
> >  
> >  	return ttm_resource_manager_next(cursor);
> > @@ -559,6 +646,7 @@ ttm_resource_manager_next(struct
> > ttm_resource_cursor *cursor)
> >  		lru = &cursor->hitch;
> >  		list_for_each_entry_continue(lru, &man-
> > >lru[cursor->priority], link) {
> >  			if (ttm_lru_item_is_res(lru)) {
> > +				ttm_resource_cursor_check_bulk(cur
> > sor, lru);
> >  				list_move(&cursor->hitch.link,
> > &lru->link);
> 
> Sorry noticing this here from a different patch. Shouldn't this be
> list_move_tail so if the LRU can't be evicted we don't pick it again?

Same as above.

> 
> Matt
> 
> >  				return ttm_lru_item_to_res(lru);
> >  			}
> > @@ -568,6 +656,7 @@ ttm_resource_manager_next(struct
> > ttm_resource_cursor *cursor)
> >  			break;
> >  
> >  		list_move(&cursor->hitch.link, &man->lru[cursor-
> > >priority]);
> > +		ttm_resource_cursor_clear_bulk(cursor);
> >  	}
> >  
> >  	ttm_resource_cursor_fini_locked(cursor);
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > b/drivers/gpu/drm/xe/xe_vm.c
> > index 61d4d95a5377..226da3c74f9c 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -1339,6 +1339,8 @@ struct xe_vm *xe_vm_create(struct xe_device
> > *xe, u32 flags)
> >  
> >  	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
> >  
> > +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> > +
> >  	INIT_LIST_HEAD(&vm->preempt.exec_queues);
> >  	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up
> > to uAPI */
> >  
> > @@ -1462,6 +1464,7 @@ struct xe_vm *xe_vm_create(struct xe_device
> > *xe, u32 flags)
> >  	mutex_destroy(&vm->snap_mutex);
> >  	for_each_tile(tile, xe, id)
> >  		xe_range_fence_tree_fini(&vm->rftree[id]);
> > +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
> >  	kfree(vm);
> >  	if (flags & XE_VM_FLAG_LR_MODE)
> >  		xe_pm_runtime_put(xe);
> > @@ -1605,6 +1608,7 @@ static void vm_destroy_work_func(struct
> > work_struct *w)
> >  		XE_WARN_ON(vm->pt_root[id]);
> >  
> >  	trace_xe_vm_free(vm);
> > +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
> >  	kfree(vm);
> >  }
> >  
> > diff --git a/include/drm/ttm/ttm_resource.h
> > b/include/drm/ttm/ttm_resource.h
> > index 8fac781f641e..571abb4861a6 100644
> > --- a/include/drm/ttm/ttm_resource.h
> > +++ b/include/drm/ttm/ttm_resource.h
> > @@ -269,26 +269,6 @@ ttm_lru_item_to_res(struct ttm_lru_item *item)
> >  	return container_of(item, struct ttm_resource, lru);
> >  }
> >  
> > -/**
> > - * struct ttm_resource_cursor
> > - *
> > - * @man: The resource manager currently being iterated over.
> > - * @hitch: A hitch list node inserted before the next resource
> > - * to iterate over.
> > - * @priority: the current priority
> > - *
> > - * Cursor to iterate over the resources in a manager.
> > - */
> > -struct ttm_resource_cursor {
> > -	struct ttm_resource_manager *man;
> > -	struct ttm_lru_item hitch;
> > -	unsigned int priority;
> > -};
> > -
> > -void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor
> > *cursor);
> > -
> > -void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> > -
> >  /**
> >   * struct ttm_lru_bulk_move_pos
> >   *
> > @@ -304,8 +284,9 @@ struct ttm_lru_bulk_move_pos {
> >  
> >  /**
> >   * struct ttm_lru_bulk_move
> > - *
> >   * @pos: first/last lru entry for resources in the each
> > domain/priority
> > + * @cursor_list: The list of cursors currently traversing any of
> > + * the sublists of @pos. Protected by the ttm device's lru_lock.
> >   *
> >   * Container for the current bulk move state. Should be used with
> >   * ttm_lru_bulk_move_init() and ttm_bo_set_bulk_move().
> > @@ -315,8 +296,39 @@ struct ttm_lru_bulk_move_pos {
> >   */
> >  struct ttm_lru_bulk_move {
> >  	struct ttm_lru_bulk_move_pos
> > pos[TTM_NUM_MEM_TYPES][TTM_MAX_BO_PRIORITY];
> > +	struct list_head cursor_list;
> >  };
> >  
> > +/**
> > + * struct ttm_resource_cursor
> > + * @man: The resource manager currently being iterated over
> > + * @hitch: A hitch list node inserted before the next resource
> > + * to iterate over.
> > + * @bulk_link: A list link for the list of cursors traversing the
> > + * bulk sublist of @bulk. Protected by the ttm device's lru_lock.
> > + * @bulk: Pointer to struct ttm_lru_bulk_move whose subrange
> > @hitch is
> > + * inserted to. NULL if none. Never dereference this pointer since
> > + * the struct ttm_lru_bulk_move object pointed to might have been
> > + * freed. The pointer is only for comparison.
> > + * @mem_type: The memory type of the LRU list being traversed.
> > + * This field is valid iff @bulk != NULL.
> > + * @priority: the current priority
> > + *
> > + * Cursor to iterate over the resources in a manager.
> > + */
> > +struct ttm_resource_cursor {
> > +	struct ttm_resource_manager *man;
> > +	struct ttm_lru_item hitch;
> > +	struct list_head bulk_link;
> > +	struct ttm_lru_bulk_move *bulk;
> > +	unsigned int mem_type;
> > +	unsigned int priority;
> > +};
> > +
> > +void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor
> > *cursor);
> > +
> > +void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> > +
> >  /**
> >   * struct ttm_kmap_iter_iomap - Specialization for a struct
> > io_mapping +
> >   * struct sg_table backed struct ttm_resource.
> > @@ -405,6 +417,8 @@ ttm_resource_manager_cleanup(struct
> > ttm_resource_manager *man)
> >  
> >  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk);
> >  void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk);
> > +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> > +			    struct ttm_lru_bulk_move *bulk);
> >  
> >  void ttm_resource_add_bulk_move(struct ttm_resource *res,
> >  				struct ttm_buffer_object *bo);
> > -- 
> > 2.44.0
> >
Matthew Brost June 19, 2024, 2:44 p.m. UTC | #3
On Wed, Jun 19, 2024 at 10:24:25AM +0200, Thomas Hellström wrote:
> On Wed, 2024-06-19 at 03:37 +0000, Matthew Brost wrote:
> > On Tue, Jun 18, 2024 at 09:18:12AM +0200, Thomas Hellström wrote:
> > 
> > Ugh, replying to correct version again...
> > 
> > > To address the problem with hitches moving when bulk move
> > > sublists are lru-bumped, register the list cursors with the
> > > ttm_lru_bulk_move structure when traversing its list, and
> > > when lru-bumping the list, move the cursor hitch to the tail.
> > 
> > - So the hitch moves to the tail (last) which points to the next item
> > in
> >   the LRU list
> > - Then bulk is moved which is from first -> last to the end of the
> > LRU
> >   list
> > - Now the hitch remains in the correct position in the list (i.e. it
> >   didn't move with the bulk)
> > 
> > Did I get that right?
> 
> Yes, correct.
> 
> > 
> > > This also means it's mandatory for drivers to call
> > > ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
> > > initializing and finalizing the bulk move structure, so add
> > > those calls to the amdgpu- and xe driver.
> > > 
> > > Compared to v1 this is slightly more code but less fragile
> > > and hopefully easier to understand.
> > > 
> > > Changes in previous series:
> > > - Completely rework the functionality
> > > - Avoid a NULL pointer dereference assigning manager->mem_type
> > > - Remove some leftover code causing build problems
> > > v2:
> > > - For hitch bulk tail moves, store the mem_type in the cursor
> > >   instead of with the manager.
> > > v3:
> > > - Remove leftover mem_type member from change in v2.
> > > 
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> > > Cc: Matthew Brost <matthew.brost@intel.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
> > >  drivers/gpu/drm/ttm/ttm_resource.c     | 89
> > > ++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/xe_vm.c             |  4 ++
> > >  include/drm/ttm/ttm_resource.h         | 56 ++++++++++------
> > >  4 files changed, 132 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > index 3abfa66d72a2..97743993d711 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > @@ -2420,6 +2420,8 @@ int amdgpu_vm_init(struct amdgpu_device
> > > *adev, struct amdgpu_vm *vm,
> > >  	if (r)
> > >  		return r;
> > >  
> > > +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> > > +
> > >  	vm->is_compute_context = false;
> > >  
> > >  	vm->use_cpu_for_update = !!(adev-
> > > >vm_manager.vm_update_mode &
> > > @@ -2484,6 +2486,7 @@ int amdgpu_vm_init(struct amdgpu_device
> > > *adev, struct amdgpu_vm *vm,
> > >  error_free_delayed:
> > >  	dma_fence_put(vm->last_tlb_flush);
> > >  	dma_fence_put(vm->last_unlocked);
> > > +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm-
> > > >lru_bulk_move);
> > >  	amdgpu_vm_fini_entities(vm);
> > >  
> > >  	return r;
> > > @@ -2640,6 +2643,7 @@ void amdgpu_vm_fini(struct amdgpu_device
> > > *adev, struct amdgpu_vm *vm)
> > >  		}
> > >  	}
> > >  
> > > +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm-
> > > >lru_bulk_move);
> > >  }
> > >  
> > >  /**
> > > diff --git a/drivers/gpu/drm/ttm/ttm_resource.c
> > > b/drivers/gpu/drm/ttm/ttm_resource.c
> > > index 9c8b6499edfb..a03090683e79 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_resource.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> > > @@ -33,6 +33,49 @@
> > >  
> > >  #include <drm/drm_util.h>
> > >  
> > > +/* Detach the cursor from the bulk move list*/
> > > +static void
> > > +ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
> > > +{
> > 
> > A lockdep annotation wouldn't hurt here.
> 
> Will add.
> 
> > 
> > > +	cursor->bulk = NULL;
> > > +	list_del_init(&cursor->bulk_link);
> > > +}
> > > +
> > > +/* Move the cursor to the end of the bulk move list it's in */
> > > +static void ttm_resource_cursor_move_bulk_tail(struct
> > > ttm_lru_bulk_move *bulk,
> > > +					       struct
> > > ttm_resource_cursor *cursor)
> > > +{
> > > +	struct ttm_lru_bulk_move_pos *pos;
> > > +
> > 
> > A lockdep annotation wouldn't hurt here too.
> 
> +1!
> 
> > 
> > > +	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
> > > +		list_del_init(&cursor->bulk_link);
> > > +		return;
> > > +	}
> > > +
> > > +	pos = &bulk->pos[cursor->mem_type][cursor->priority];
> > > +	if (pos)
> > 
> > 'if (pos->last)'?
> > 
> > As 'if (pos)' is going to always be true given you are using the
> > address
> > of operator (&) on an instantiated struct ttm_lru_bulk_move_pos.
> 
> Good catch! I'll fix that up.
> 
> > 
> > > +		list_move(&cursor->hitch.link, &pos->last-
> > > >lru.link);
> > 
> > This should be list_move_tail, right? So last->next == hitch.
> > 
> > As the code is last->prev == hitch which means the hitch would be
> > included in the bulk move, right?
> 
> It's the other way around right? list_move(a, b) will insert a as b-
> >next, which is what we want.

I realized a bit after typing this, yes this is correct as is.

> 
> > 
> > > +	ttm_resource_cursor_clear_bulk(cursor);
> > > +}
> > > +
> > > +/* Move all cursors attached to a bulk move to its end */
> > > +static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move
> > > *bulk)
> > > +{
> > > +	struct ttm_resource_cursor *cursor, *next;
> > > +
> > > +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list,
> > > bulk_link)
> > > +		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
> > > +}
> > > +
> > > +/* Remove a cursor from an empty bulk move list */
> > > +static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move
> > > *bulk)
> > > +{
> > > +	struct ttm_resource_cursor *cursor, *next;
> > > +
> > > +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list,
> > > bulk_link)
> > > +		ttm_resource_cursor_clear_bulk(cursor);
> > > +}
> > > +
> > >  /**
> > >   * ttm_resource_cursor_fini_locked() - Finalize the LRU list
> > > cursor usage
> > >   * @cursor: The struct ttm_resource_cursor to finalize.
> > > @@ -45,6 +88,7 @@ void ttm_resource_cursor_fini_locked(struct
> > > ttm_resource_cursor *cursor)
> > >  {
> > >  	lockdep_assert_held(&cursor->man->bdev->lru_lock);
> > >  	list_del_init(&cursor->hitch.link);
> > > +	ttm_resource_cursor_clear_bulk(cursor);
> > >  }
> > >  
> > >  /**
> > > @@ -73,9 +117,27 @@ void ttm_resource_cursor_fini(struct
> > > ttm_resource_cursor *cursor)
> > >  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
> > >  {
> > >  	memset(bulk, 0, sizeof(*bulk));
> > > +	INIT_LIST_HEAD(&bulk->cursor_list);
> > >  }
> > >  EXPORT_SYMBOL(ttm_lru_bulk_move_init);
> > >  
> > > +/**
> > > + * ttm_lru_bulk_move_fini - finalize a bulk move structure
> > > + * @bdev: The struct ttm_device
> > > + * @bulk: the structure to finalize
> > > + *
> > > + * Sanity checks that bulk moves don't have any
> > > + * resources left and hence no cursors attached.
> > > + */
> > > +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> > > +			    struct ttm_lru_bulk_move *bulk)
> > > +{
> > > +	spin_lock(&bdev->lru_lock);
> > > +	ttm_bulk_move_drop_cursors(bulk);
> > > +	spin_unlock(&bdev->lru_lock);
> > > +}
> > > +EXPORT_SYMBOL(ttm_lru_bulk_move_fini);
> > > +
> > >  /**
> > >   * ttm_lru_bulk_move_tail - bulk move range of resources to the
> > > LRU tail.
> > >   *
> > > @@ -88,6 +150,7 @@ void ttm_lru_bulk_move_tail(struct
> > > ttm_lru_bulk_move *bulk)
> > >  {
> > >  	unsigned i, j;
> > >  
> > > +	ttm_bulk_move_adjust_cursors(bulk);
> > >  	for (i = 0; i < TTM_NUM_MEM_TYPES; ++i) {
> > >  		for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
> > >  			struct ttm_lru_bulk_move_pos *pos = &bulk-
> > > >pos[i][j];
> > > @@ -515,6 +578,29 @@ void ttm_resource_manager_debug(struct
> > > ttm_resource_manager *man,
> > >  }
> > >  EXPORT_SYMBOL(ttm_resource_manager_debug);
> > >  
> > > +static void
> > > +ttm_resource_cursor_check_bulk(struct ttm_resource_cursor *cursor,
> > > +			       struct ttm_lru_item *next_lru)
> > > +{
> > > +	struct ttm_resource *next = ttm_lru_item_to_res(next_lru);
> > > +	struct ttm_lru_bulk_move *bulk = NULL;
> > > +	struct ttm_buffer_object *bo = next->bo;
> > > +
> > > +	lockdep_assert_held(&cursor->man->bdev->lru_lock);
> > > +	if (bo && bo->resource == next)
> > > +		bulk = bo->bulk_move;
> > 
> > 
> > Can you explain what the above if statement is doing, struggling a
> > bit
> > here. Is this a weird case where the LRU item (struct ttm_resource)
> > is
> > fully (1st condition) or partially (2nd condition) detached from a
> > BO?
> 
> Yeah, this is a weird corner case where the resource is handed over to
> a ghost object, and the lock protection is not clearly specified. From
> my reading of the code, at least bo->resource is not protected by the
> LRU lock when clearing, but bo->bulk_move is, so given that, perhaps
> the test is indeed unnecessary.
>

Looking at that code I think the above statement is correct.
 
> > 
> > > +
> > > +	if (cursor->bulk != bulk) {
> > > +		if (bulk) {
> > > +			list_move_tail(&cursor->bulk_link, &bulk-
> > > >cursor_list);
> > > +			cursor->mem_type = next->mem_type;
> > > +		} else {
> > > +			list_del_init(&cursor->bulk_link);
> > > +		}
> > > +		cursor->bulk = bulk;
> > > +	}
> > > +}
> > > +
> > >  /**
> > >   * ttm_resource_manager_first() - Start iterating over the
> > > resources
> > >   * of a resource manager
> > > @@ -535,6 +621,7 @@ ttm_resource_manager_first(struct
> > > ttm_resource_manager *man,
> > >  	cursor->priority = 0;
> > >  	cursor->man = man;
> > >  	ttm_lru_item_init(&cursor->hitch, TTM_LRU_HITCH);
> > > +	INIT_LIST_HEAD(&cursor->bulk_link);
> > >  	list_add(&cursor->hitch.link, &man->lru[cursor-
> > > >priority]);
> > >  
> > >  	return ttm_resource_manager_next(cursor);
> > > @@ -559,6 +646,7 @@ ttm_resource_manager_next(struct
> > > ttm_resource_cursor *cursor)
> > >  		lru = &cursor->hitch;
> > >  		list_for_each_entry_continue(lru, &man-
> > > >lru[cursor->priority], link) {
> > >  			if (ttm_lru_item_is_res(lru)) {
> > > +				ttm_resource_cursor_check_bulk(cur
> > > sor, lru);
> > >  				list_move(&cursor->hitch.link,
> > > &lru->link);
> > 
> > Sorry noticing this here from a different patch. Shouldn't this be
> > list_move_tail so if the LRU can't be evicted we don't pick it again?
> 
> Same as above.
> 

Yep, my mistake.

Matt

> > 
> > Matt
> > 
> > >  				return ttm_lru_item_to_res(lru);
> > >  			}
> > > @@ -568,6 +656,7 @@ ttm_resource_manager_next(struct
> > > ttm_resource_cursor *cursor)
> > >  			break;
> > >  
> > >  		list_move(&cursor->hitch.link, &man->lru[cursor-
> > > >priority]);
> > > +		ttm_resource_cursor_clear_bulk(cursor);
> > >  	}
> > >  
> > >  	ttm_resource_cursor_fini_locked(cursor);
> > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > b/drivers/gpu/drm/xe/xe_vm.c
> > > index 61d4d95a5377..226da3c74f9c 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > @@ -1339,6 +1339,8 @@ struct xe_vm *xe_vm_create(struct xe_device
> > > *xe, u32 flags)
> > >  
> > >  	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
> > >  
> > > +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> > > +
> > >  	INIT_LIST_HEAD(&vm->preempt.exec_queues);
> > >  	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up
> > > to uAPI */
> > >  
> > > @@ -1462,6 +1464,7 @@ struct xe_vm *xe_vm_create(struct xe_device
> > > *xe, u32 flags)
> > >  	mutex_destroy(&vm->snap_mutex);
> > >  	for_each_tile(tile, xe, id)
> > >  		xe_range_fence_tree_fini(&vm->rftree[id]);
> > > +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
> > >  	kfree(vm);
> > >  	if (flags & XE_VM_FLAG_LR_MODE)
> > >  		xe_pm_runtime_put(xe);
> > > @@ -1605,6 +1608,7 @@ static void vm_destroy_work_func(struct
> > > work_struct *w)
> > >  		XE_WARN_ON(vm->pt_root[id]);
> > >  
> > >  	trace_xe_vm_free(vm);
> > > +	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
> > >  	kfree(vm);
> > >  }
> > >  
> > > diff --git a/include/drm/ttm/ttm_resource.h
> > > b/include/drm/ttm/ttm_resource.h
> > > index 8fac781f641e..571abb4861a6 100644
> > > --- a/include/drm/ttm/ttm_resource.h
> > > +++ b/include/drm/ttm/ttm_resource.h
> > > @@ -269,26 +269,6 @@ ttm_lru_item_to_res(struct ttm_lru_item *item)
> > >  	return container_of(item, struct ttm_resource, lru);
> > >  }
> > >  
> > > -/**
> > > - * struct ttm_resource_cursor
> > > - *
> > > - * @man: The resource manager currently being iterated over.
> > > - * @hitch: A hitch list node inserted before the next resource
> > > - * to iterate over.
> > > - * @priority: the current priority
> > > - *
> > > - * Cursor to iterate over the resources in a manager.
> > > - */
> > > -struct ttm_resource_cursor {
> > > -	struct ttm_resource_manager *man;
> > > -	struct ttm_lru_item hitch;
> > > -	unsigned int priority;
> > > -};
> > > -
> > > -void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor
> > > *cursor);
> > > -
> > > -void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> > > -
> > >  /**
> > >   * struct ttm_lru_bulk_move_pos
> > >   *
> > > @@ -304,8 +284,9 @@ struct ttm_lru_bulk_move_pos {
> > >  
> > >  /**
> > >   * struct ttm_lru_bulk_move
> > > - *
> > >   * @pos: first/last lru entry for resources in the each
> > > domain/priority
> > > + * @cursor_list: The list of cursors currently traversing any of
> > > + * the sublists of @pos. Protected by the ttm device's lru_lock.
> > >   *
> > >   * Container for the current bulk move state. Should be used with
> > >   * ttm_lru_bulk_move_init() and ttm_bo_set_bulk_move().
> > > @@ -315,8 +296,39 @@ struct ttm_lru_bulk_move_pos {
> > >   */
> > >  struct ttm_lru_bulk_move {
> > >  	struct ttm_lru_bulk_move_pos
> > > pos[TTM_NUM_MEM_TYPES][TTM_MAX_BO_PRIORITY];
> > > +	struct list_head cursor_list;
> > >  };
> > >  
> > > +/**
> > > + * struct ttm_resource_cursor
> > > + * @man: The resource manager currently being iterated over
> > > + * @hitch: A hitch list node inserted before the next resource
> > > + * to iterate over.
> > > + * @bulk_link: A list link for the list of cursors traversing the
> > > + * bulk sublist of @bulk. Protected by the ttm device's lru_lock.
> > > + * @bulk: Pointer to struct ttm_lru_bulk_move whose subrange
> > > @hitch is
> > > + * inserted to. NULL if none. Never dereference this pointer since
> > > + * the struct ttm_lru_bulk_move object pointed to might have been
> > > + * freed. The pointer is only for comparison.
> > > + * @mem_type: The memory type of the LRU list being traversed.
> > > + * This field is valid iff @bulk != NULL.
> > > + * @priority: the current priority
> > > + *
> > > + * Cursor to iterate over the resources in a manager.
> > > + */
> > > +struct ttm_resource_cursor {
> > > +	struct ttm_resource_manager *man;
> > > +	struct ttm_lru_item hitch;
> > > +	struct list_head bulk_link;
> > > +	struct ttm_lru_bulk_move *bulk;
> > > +	unsigned int mem_type;
> > > +	unsigned int priority;
> > > +};
> > > +
> > > +void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor
> > > *cursor);
> > > +
> > > +void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
> > > +
> > >  /**
> > >   * struct ttm_kmap_iter_iomap - Specialization for a struct
> > > io_mapping +
> > >   * struct sg_table backed struct ttm_resource.
> > > @@ -405,6 +417,8 @@ ttm_resource_manager_cleanup(struct
> > > ttm_resource_manager *man)
> > >  
> > >  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk);
> > >  void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk);
> > > +void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
> > > +			    struct ttm_lru_bulk_move *bulk);
> > >  
> > >  void ttm_resource_add_bulk_move(struct ttm_resource *res,
> > >  				struct ttm_buffer_object *bo);
> > > -- 
> > > 2.44.0
> > > 
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..97743993d711 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2420,6 +2420,8 @@  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		return r;
 
+	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
+
 	vm->is_compute_context = false;
 
 	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
@@ -2484,6 +2486,7 @@  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 error_free_delayed:
 	dma_fence_put(vm->last_tlb_flush);
 	dma_fence_put(vm->last_unlocked);
+	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
 	amdgpu_vm_fini_entities(vm);
 
 	return r;
@@ -2640,6 +2643,7 @@  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 		}
 	}
 
+	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
 }
 
 /**
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index 9c8b6499edfb..a03090683e79 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -33,6 +33,49 @@ 
 
 #include <drm/drm_util.h>
 
+/* Detach the cursor from the bulk move list*/
+static void
+ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
+{
+	cursor->bulk = NULL;
+	list_del_init(&cursor->bulk_link);
+}
+
+/* Move the cursor to the end of the bulk move list it's in */
+static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move *bulk,
+					       struct ttm_resource_cursor *cursor)
+{
+	struct ttm_lru_bulk_move_pos *pos;
+
+	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
+		list_del_init(&cursor->bulk_link);
+		return;
+	}
+
+	pos = &bulk->pos[cursor->mem_type][cursor->priority];
+	if (pos)
+		list_move(&cursor->hitch.link, &pos->last->lru.link);
+	ttm_resource_cursor_clear_bulk(cursor);
+}
+
+/* Move all cursors attached to a bulk move to its end */
+static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
+{
+	struct ttm_resource_cursor *cursor, *next;
+
+	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
+		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
+}
+
+/* Remove a cursor from an empty bulk move list */
+static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
+{
+	struct ttm_resource_cursor *cursor, *next;
+
+	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
+		ttm_resource_cursor_clear_bulk(cursor);
+}
+
 /**
  * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
  * @cursor: The struct ttm_resource_cursor to finalize.
@@ -45,6 +88,7 @@  void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor)
 {
 	lockdep_assert_held(&cursor->man->bdev->lru_lock);
 	list_del_init(&cursor->hitch.link);
+	ttm_resource_cursor_clear_bulk(cursor);
 }
 
 /**
@@ -73,9 +117,27 @@  void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor)
 void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
 {
 	memset(bulk, 0, sizeof(*bulk));
+	INIT_LIST_HEAD(&bulk->cursor_list);
 }
 EXPORT_SYMBOL(ttm_lru_bulk_move_init);
 
+/**
+ * ttm_lru_bulk_move_fini - finalize a bulk move structure
+ * @bdev: The struct ttm_device
+ * @bulk: the structure to finalize
+ *
+ * Sanity checks that bulk moves don't have any
+ * resources left and hence no cursors attached.
+ */
+void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
+			    struct ttm_lru_bulk_move *bulk)
+{
+	spin_lock(&bdev->lru_lock);
+	ttm_bulk_move_drop_cursors(bulk);
+	spin_unlock(&bdev->lru_lock);
+}
+EXPORT_SYMBOL(ttm_lru_bulk_move_fini);
+
 /**
  * ttm_lru_bulk_move_tail - bulk move range of resources to the LRU tail.
  *
@@ -88,6 +150,7 @@  void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk)
 {
 	unsigned i, j;
 
+	ttm_bulk_move_adjust_cursors(bulk);
 	for (i = 0; i < TTM_NUM_MEM_TYPES; ++i) {
 		for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
 			struct ttm_lru_bulk_move_pos *pos = &bulk->pos[i][j];
@@ -515,6 +578,29 @@  void ttm_resource_manager_debug(struct ttm_resource_manager *man,
 }
 EXPORT_SYMBOL(ttm_resource_manager_debug);
 
+static void
+ttm_resource_cursor_check_bulk(struct ttm_resource_cursor *cursor,
+			       struct ttm_lru_item *next_lru)
+{
+	struct ttm_resource *next = ttm_lru_item_to_res(next_lru);
+	struct ttm_lru_bulk_move *bulk = NULL;
+	struct ttm_buffer_object *bo = next->bo;
+
+	lockdep_assert_held(&cursor->man->bdev->lru_lock);
+	if (bo && bo->resource == next)
+		bulk = bo->bulk_move;
+
+	if (cursor->bulk != bulk) {
+		if (bulk) {
+			list_move_tail(&cursor->bulk_link, &bulk->cursor_list);
+			cursor->mem_type = next->mem_type;
+		} else {
+			list_del_init(&cursor->bulk_link);
+		}
+		cursor->bulk = bulk;
+	}
+}
+
 /**
  * ttm_resource_manager_first() - Start iterating over the resources
  * of a resource manager
@@ -535,6 +621,7 @@  ttm_resource_manager_first(struct ttm_resource_manager *man,
 	cursor->priority = 0;
 	cursor->man = man;
 	ttm_lru_item_init(&cursor->hitch, TTM_LRU_HITCH);
+	INIT_LIST_HEAD(&cursor->bulk_link);
 	list_add(&cursor->hitch.link, &man->lru[cursor->priority]);
 
 	return ttm_resource_manager_next(cursor);
@@ -559,6 +646,7 @@  ttm_resource_manager_next(struct ttm_resource_cursor *cursor)
 		lru = &cursor->hitch;
 		list_for_each_entry_continue(lru, &man->lru[cursor->priority], link) {
 			if (ttm_lru_item_is_res(lru)) {
+				ttm_resource_cursor_check_bulk(cursor, lru);
 				list_move(&cursor->hitch.link, &lru->link);
 				return ttm_lru_item_to_res(lru);
 			}
@@ -568,6 +656,7 @@  ttm_resource_manager_next(struct ttm_resource_cursor *cursor)
 			break;
 
 		list_move(&cursor->hitch.link, &man->lru[cursor->priority]);
+		ttm_resource_cursor_clear_bulk(cursor);
 	}
 
 	ttm_resource_cursor_fini_locked(cursor);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 61d4d95a5377..226da3c74f9c 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1339,6 +1339,8 @@  struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 
 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
 
+	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
+
 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
 
@@ -1462,6 +1464,7 @@  struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 	mutex_destroy(&vm->snap_mutex);
 	for_each_tile(tile, xe, id)
 		xe_range_fence_tree_fini(&vm->rftree[id]);
+	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
 	kfree(vm);
 	if (flags & XE_VM_FLAG_LR_MODE)
 		xe_pm_runtime_put(xe);
@@ -1605,6 +1608,7 @@  static void vm_destroy_work_func(struct work_struct *w)
 		XE_WARN_ON(vm->pt_root[id]);
 
 	trace_xe_vm_free(vm);
+	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
 	kfree(vm);
 }
 
diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
index 8fac781f641e..571abb4861a6 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -269,26 +269,6 @@  ttm_lru_item_to_res(struct ttm_lru_item *item)
 	return container_of(item, struct ttm_resource, lru);
 }
 
-/**
- * struct ttm_resource_cursor
- *
- * @man: The resource manager currently being iterated over.
- * @hitch: A hitch list node inserted before the next resource
- * to iterate over.
- * @priority: the current priority
- *
- * Cursor to iterate over the resources in a manager.
- */
-struct ttm_resource_cursor {
-	struct ttm_resource_manager *man;
-	struct ttm_lru_item hitch;
-	unsigned int priority;
-};
-
-void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor);
-
-void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
-
 /**
  * struct ttm_lru_bulk_move_pos
  *
@@ -304,8 +284,9 @@  struct ttm_lru_bulk_move_pos {
 
 /**
  * struct ttm_lru_bulk_move
- *
  * @pos: first/last lru entry for resources in the each domain/priority
+ * @cursor_list: The list of cursors currently traversing any of
+ * the sublists of @pos. Protected by the ttm device's lru_lock.
  *
  * Container for the current bulk move state. Should be used with
  * ttm_lru_bulk_move_init() and ttm_bo_set_bulk_move().
@@ -315,8 +296,39 @@  struct ttm_lru_bulk_move_pos {
  */
 struct ttm_lru_bulk_move {
 	struct ttm_lru_bulk_move_pos pos[TTM_NUM_MEM_TYPES][TTM_MAX_BO_PRIORITY];
+	struct list_head cursor_list;
 };
 
+/**
+ * struct ttm_resource_cursor
+ * @man: The resource manager currently being iterated over
+ * @hitch: A hitch list node inserted before the next resource
+ * to iterate over.
+ * @bulk_link: A list link for the list of cursors traversing the
+ * bulk sublist of @bulk. Protected by the ttm device's lru_lock.
+ * @bulk: Pointer to struct ttm_lru_bulk_move whose subrange @hitch is
+ * inserted to. NULL if none. Never dereference this pointer since
+ * the struct ttm_lru_bulk_move object pointed to might have been
+ * freed. The pointer is only for comparison.
+ * @mem_type: The memory type of the LRU list being traversed.
+ * This field is valid iff @bulk != NULL.
+ * @priority: the current priority
+ *
+ * Cursor to iterate over the resources in a manager.
+ */
+struct ttm_resource_cursor {
+	struct ttm_resource_manager *man;
+	struct ttm_lru_item hitch;
+	struct list_head bulk_link;
+	struct ttm_lru_bulk_move *bulk;
+	unsigned int mem_type;
+	unsigned int priority;
+};
+
+void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor);
+
+void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor);
+
 /**
  * struct ttm_kmap_iter_iomap - Specialization for a struct io_mapping +
  * struct sg_table backed struct ttm_resource.
@@ -405,6 +417,8 @@  ttm_resource_manager_cleanup(struct ttm_resource_manager *man)
 
 void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk);
 void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk);
+void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
+			    struct ttm_lru_bulk_move *bulk);
 
 void ttm_resource_add_bulk_move(struct ttm_resource *res,
 				struct ttm_buffer_object *bo);