
drm/ttm: remove fence_lock

Message ID 507835EF.2020806@canonical.com (mailing list archive)
State New, archived

Commit Message

Maarten Lankhorst Oct. 12, 2012, 3:23 p.m. UTC
With the nouveau calls fixed, there were only 2 places left that used
fence_lock without a reservation; both are fixed in this patch:

ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other
way around.

ttm_bo_cleanup_refs is fixed by taking a reservation first, then a pointer
to the fence object, and backing off from the reservation if waiting has to
be performed.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
This patch should be applied only after all the previous patches I sent are applied,
since it depends on sync_obj_arg removal (kinda, probably fails to apply otherwise),
uses ttm_bo_is_reserved, and depends on nouveau wait behavior being fixed.
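
In condensed form (simplified from the diff; the reserve-failure,
ddestroy-list and no_wait_gpu paths are trimmed), the reworked wait/back-off
in ttm_bo_cleanup_refs is:

retry:
        spin_lock(&glob->lru_lock);
        ret = ttm_bo_reserve_locked(bo, interruptible, no_wait_reserve, false, 0);
        /* ... bail out on failure or if the bo already left the ddestroy list ... */

        ret = ttm_bo_wait(bo, false, false, true);      /* no_wait probe */
        if (ret) {
                /* Busy: ref the fence, back off, wait unreserved, then retry. */
                sync_obj = driver->sync_obj_ref(bo->sync_obj);
                atomic_set(&bo->reserved, 0);
                wake_up_all(&bo->event_queue);
                spin_unlock(&glob->lru_lock);

                ret = driver->sync_obj_wait(sync_obj, false, interruptible);
                driver->sync_obj_unref(&sync_obj);
                if (ret)
                        return ret;
                goto retry;
        }
        /* Idle: proceed with the actual cleanup while still reserved. */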

 drivers/gpu/drm/nouveau/nouveau_bo.c   |    4 -
 drivers/gpu/drm/nouveau/nouveau_gem.c  |   15 +---
 drivers/gpu/drm/radeon/radeon_object.c |    2 -
 drivers/gpu/drm/ttm/ttm_bo.c           |  124 ++++++++++++--------------------
 drivers/gpu/drm/ttm/ttm_bo_util.c      |    3 -
 drivers/gpu/drm/ttm/ttm_bo_vm.c        |    5 -
 drivers/gpu/drm/ttm/ttm_execbuf_util.c |    2 -
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c    |    3 -
 include/drm/ttm/ttm_bo_api.h           |   12 +--
 include/drm/ttm/ttm_bo_driver.h        |    3 -
 10 files changed, 52 insertions(+), 121 deletions(-)

Comments

Thomas Hellström (VMware) Oct. 18, 2012, 7:28 a.m. UTC | #1
Hi, Maarten,

As you know I have been having my doubts about this change.
To me it seems insane to be forced to read the fence pointer under a
reserved lock, simply because when you take the reserve lock, another
process may have it and there is a substantial chance that that process
will also be waiting for idle while holding the reserve lock.

In essence, to read the fence pointer, there is a large chance you will
be waiting for idle to be able to access it. That's why it's protected by
a separate spinlock in the first place.

So even if this change might not affect current drivers much, it's a change
to the TTM api that leads to an IMHO very poor design.

/Thomas


On 10/12/2012 05:23 PM, Maarten Lankhorst wrote:
> With the nouveau calls fixed there were only 2 places left that used
> fence_lock without a reservation, those are fixed in this patch:
>
> ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other
> way around.
>
> ttm_bo_cleanup_refs is fixed by taking a reservation first, then a pointer
> to the fence object and backs off from the reservation if waiting has to
> be performed.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> ---
> This patch should be applied only after all the previous patches I sent are applied,
> since it depends on sync_obj_arg removal (kinda, probably fails to apply otherwise),
> uses ttm_bo_is_reserved, and depends on nouveau wait behavior being fixed.
>
>   drivers/gpu/drm/nouveau/nouveau_bo.c   |    4 -
>   drivers/gpu/drm/nouveau/nouveau_gem.c  |   15 +---
>   drivers/gpu/drm/radeon/radeon_object.c |    2 -
>   drivers/gpu/drm/ttm/ttm_bo.c           |  124 ++++++++++++--------------------
>   drivers/gpu/drm/ttm/ttm_bo_util.c      |    3 -
>   drivers/gpu/drm/ttm/ttm_bo_vm.c        |    5 -
>   drivers/gpu/drm/ttm/ttm_execbuf_util.c |    2 -
>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.c    |    3 -
>   include/drm/ttm/ttm_bo_api.h           |   12 +--
>   include/drm/ttm/ttm_bo_driver.h        |    3 -
>   10 files changed, 52 insertions(+), 121 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index cee7448..9749c45 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -1449,10 +1449,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence)
>   	if (likely(fence))
>   		nouveau_fence_ref(fence);
>   
> -	spin_lock(&nvbo->bo.bdev->fence_lock);
>   	old_fence = nvbo->bo.sync_obj;
>   	nvbo->bo.sync_obj = fence;
> -	spin_unlock(&nvbo->bo.bdev->fence_lock);
>   
>   	nouveau_fence_unref(&old_fence);
>   }
> @@ -1552,9 +1550,7 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
>   	if (vma->node) {
>   		if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
>   			ttm_bo_reserve(&nvbo->bo, false, false, false, 0);
> -			spin_lock(&nvbo->bo.bdev->fence_lock);
>   			ttm_bo_wait(&nvbo->bo, false, false, false);
> -			spin_unlock(&nvbo->bo.bdev->fence_lock);
>   			ttm_bo_unreserve(&nvbo->bo);
>   			nouveau_vm_unmap(vma);
>   		}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 6d8391d..eaa700a 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -391,18 +391,11 @@ retry:
>   static int
>   validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo)
>   {
> -	struct nouveau_fence *fence = NULL;
> +	struct nouveau_fence *fence = nvbo->bo.sync_obj;
>   	int ret = 0;
>   
> -	spin_lock(&nvbo->bo.bdev->fence_lock);
> -	if (nvbo->bo.sync_obj)
> -		fence = nouveau_fence_ref(nvbo->bo.sync_obj);
> -	spin_unlock(&nvbo->bo.bdev->fence_lock);
> -
> -	if (fence) {
> +	if (fence)
>   		ret = nouveau_fence_sync(fence, chan);
> -		nouveau_fence_unref(&fence);
> -	}
>   
>   	return ret;
>   }
> @@ -614,9 +607,7 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
>   				data |= r->vor;
>   		}
>   
> -		spin_lock(&nvbo->bo.bdev->fence_lock);
>   		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
> -		spin_unlock(&nvbo->bo.bdev->fence_lock);
>   		if (ret) {
>   			NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret);
>   			break;
> @@ -848,11 +839,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
>   
>   	ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
>   	if (!ret) {
> -		spin_lock(&nvbo->bo.bdev->fence_lock);
>   		ret = ttm_bo_wait(&nvbo->bo, true, true, true);
>   		if (!no_wait && ret)
>   			fence = nouveau_fence_ref(nvbo->bo.sync_obj);
> -		spin_unlock(&nvbo->bo.bdev->fence_lock);
>   
>   		ttm_bo_unreserve(&nvbo->bo);
>   	}
> diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
> index 808c444..df430e4 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.c
> +++ b/drivers/gpu/drm/radeon/radeon_object.c
> @@ -613,12 +613,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait)
>   	r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
>   	if (unlikely(r != 0))
>   		return r;
> -	spin_lock(&bo->tbo.bdev->fence_lock);
>   	if (mem_type)
>   		*mem_type = bo->tbo.mem.mem_type;
>   	if (bo->tbo.sync_obj)
>   		r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
> -	spin_unlock(&bo->tbo.bdev->fence_lock);
>   	ttm_bo_unreserve(&bo->tbo);
>   	return r;
>   }
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index f6d7026..69fc29b 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -498,28 +498,23 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
>   {
>   	struct ttm_bo_device *bdev = bo->bdev;
>   	struct ttm_bo_global *glob = bo->glob;
> -	struct ttm_bo_driver *driver;
> +	struct ttm_bo_driver *driver = bdev->driver;
>   	void *sync_obj = NULL;
>   	int put_count;
>   	int ret;
>   
> -	spin_lock(&bdev->fence_lock);
> -	(void) ttm_bo_wait(bo, false, false, true);
> -	if (!bo->sync_obj) {
> -
> -		spin_lock(&glob->lru_lock);
> -
> -		/**
> -		 * Lock inversion between bo:reserve and bdev::fence_lock here,
> -		 * but that's OK, since we're only trylocking.
> -		 */
> -
> -		ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> +	spin_lock(&glob->lru_lock);
> +	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> +	if (!ret) {
> +		ret = ttm_bo_wait(bo, false, false, true);
>   
> -		if (unlikely(ret == -EBUSY))
> +		if (unlikely(ret == -EBUSY)) {
> +			sync_obj = driver->sync_obj_ref(bo->sync_obj);
> +			atomic_set(&bo->reserved, 0);
> +			wake_up_all(&bo->event_queue);
>   			goto queue;
> +		}
>   
> -		spin_unlock(&bdev->fence_lock);
>   		put_count = ttm_bo_del_from_lru(bo);
>   
>   		spin_unlock(&glob->lru_lock);
> @@ -528,18 +523,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
>   		ttm_bo_list_ref_sub(bo, put_count, true);
>   
>   		return;
> -	} else {
> -		spin_lock(&glob->lru_lock);
>   	}
>   queue:
> -	driver = bdev->driver;
> -	if (bo->sync_obj)
> -		sync_obj = driver->sync_obj_ref(bo->sync_obj);
> -
>   	kref_get(&bo->list_kref);
>   	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
>   	spin_unlock(&glob->lru_lock);
> -	spin_unlock(&bdev->fence_lock);
>   
>   	if (sync_obj) {
>   		driver->sync_obj_flush(sync_obj);
> @@ -565,25 +553,15 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
>   			       bool no_wait_gpu)
>   {
>   	struct ttm_bo_device *bdev = bo->bdev;
> +	struct ttm_bo_driver *driver = bdev->driver;
>   	struct ttm_bo_global *glob = bo->glob;
>   	int put_count;
>   	int ret = 0;
> +	void *sync_obj;
>   
>   retry:
> -	spin_lock(&bdev->fence_lock);
> -	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> -	spin_unlock(&bdev->fence_lock);
> -
> -	if (unlikely(ret != 0))
> -		return ret;
> -
>   	spin_lock(&glob->lru_lock);
>   
> -	if (unlikely(list_empty(&bo->ddestroy))) {
> -		spin_unlock(&glob->lru_lock);
> -		return 0;
> -	}
> -
>   	ret = ttm_bo_reserve_locked(bo, interruptible,
>   				    no_wait_reserve, false, 0);
>   
> @@ -592,18 +570,37 @@ retry:
>   		return ret;
>   	}
>   
> -	/**
> -	 * We can re-check for sync object without taking
> -	 * the bo::lock since setting the sync object requires
> -	 * also bo::reserved. A busy object at this point may
> -	 * be caused by another thread recently starting an accelerated
> -	 * eviction.
> -	 */
> +	if (unlikely(list_empty(&bo->ddestroy))) {
> +		atomic_set(&bo->reserved, 0);
> +		wake_up_all(&bo->event_queue);
> +		spin_unlock(&glob->lru_lock);
> +		return 0;
> +	}
> +	ret = ttm_bo_wait(bo, false, false, true);
> +
> +	if (ret) {
> +		if (no_wait_gpu) {
> +			atomic_set(&bo->reserved, 0);
> +			wake_up_all(&bo->event_queue);
> +			spin_unlock(&glob->lru_lock);
> +			return ret;
> +		}
>   
> -	if (unlikely(bo->sync_obj)) {
> +		/**
> +		 * Take a reference to the fence and unreserve, if the wait
> +		 * was succesful and no new sync_obj was attached,
> +		 * ttm_bo_wait in retry will return ret = 0, and end the loop.
> +		 */
> +
> +		sync_obj = driver->sync_obj_ref(&bo->sync_obj);
>   		atomic_set(&bo->reserved, 0);
>   		wake_up_all(&bo->event_queue);
>   		spin_unlock(&glob->lru_lock);
> +
> +		ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible);
> +		driver->sync_obj_unref(&sync_obj);
> +		if (ret)
> +			return ret;
>   		goto retry;
>   	}
>   
> @@ -735,9 +732,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
>   	struct ttm_placement placement;
>   	int ret = 0;
>   
> -	spin_lock(&bdev->fence_lock);
>   	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> -	spin_unlock(&bdev->fence_lock);
>   
>   	if (unlikely(ret != 0)) {
>   		if (ret != -ERESTARTSYS) {
> @@ -1054,7 +1049,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
>   {
>   	int ret = 0;
>   	struct ttm_mem_reg mem;
> -	struct ttm_bo_device *bdev = bo->bdev;
>   
>   	BUG_ON(!ttm_bo_is_reserved(bo));
>   
> @@ -1063,9 +1057,7 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
>   	 * Have the driver move function wait for idle when necessary,
>   	 * instead of doing it here.
>   	 */
> -	spin_lock(&bdev->fence_lock);
>   	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> -	spin_unlock(&bdev->fence_lock);
>   	if (ret)
>   		return ret;
>   	mem.num_pages = bo->num_pages;
> @@ -1554,7 +1546,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
>   	bdev->glob = glob;
>   	bdev->need_dma32 = need_dma32;
>   	bdev->val_seq = 0;
> -	spin_lock_init(&bdev->fence_lock);
>   	mutex_lock(&glob->device_list_mutex);
>   	list_add_tail(&bdev->device_list, &glob->device_list);
>   	mutex_unlock(&glob->device_list_mutex);
> @@ -1690,52 +1681,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
>   		bool lazy, bool interruptible, bool no_wait)
>   {
>   	struct ttm_bo_driver *driver = bo->bdev->driver;
> -	struct ttm_bo_device *bdev = bo->bdev;
> -	void *sync_obj;
>   	int ret = 0;
>   
> +	WARN_ON_ONCE(!ttm_bo_is_reserved(bo));
> +
>   	if (likely(bo->sync_obj == NULL))
>   		return 0;
>   
> -	while (bo->sync_obj) {
> -
> +	if (bo->sync_obj) {
>   		if (driver->sync_obj_signaled(bo->sync_obj)) {
>   			void *tmp_obj = bo->sync_obj;
>   			bo->sync_obj = NULL;
>   			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> -			spin_unlock(&bdev->fence_lock);
>   			driver->sync_obj_unref(&tmp_obj);
> -			spin_lock(&bdev->fence_lock);
> -			continue;
> +			return 0;
>   		}
>   
>   		if (no_wait)
>   			return -EBUSY;
>   
> -		sync_obj = driver->sync_obj_ref(bo->sync_obj);
> -		spin_unlock(&bdev->fence_lock);
> -		ret = driver->sync_obj_wait(sync_obj,
> +		ret = driver->sync_obj_wait(bo->sync_obj,
>   					    lazy, interruptible);
>   		if (unlikely(ret != 0)) {
> -			driver->sync_obj_unref(&sync_obj);
> -			spin_lock(&bdev->fence_lock);
>   			return ret;
>   		}
> -		spin_lock(&bdev->fence_lock);
> -		if (likely(bo->sync_obj == sync_obj)) {
> -			void *tmp_obj = bo->sync_obj;
> -			bo->sync_obj = NULL;
> -			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
> -				  &bo->priv_flags);
> -			spin_unlock(&bdev->fence_lock);
> -			driver->sync_obj_unref(&sync_obj);
> -			driver->sync_obj_unref(&tmp_obj);
> -			spin_lock(&bdev->fence_lock);
> -		} else {
> -			spin_unlock(&bdev->fence_lock);
> -			driver->sync_obj_unref(&sync_obj);
> -			spin_lock(&bdev->fence_lock);
> -		}
> +		driver->sync_obj_unref(&bo->sync_obj);
> +		clear_bit(TTM_BO_PRIV_FLAG_MOVING,
> +			  &bo->priv_flags);
>   	}
>   	return 0;
>   }
> @@ -1799,9 +1771,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
>   	 * Wait for GPU, then move to system cached.
>   	 */
>   
> -	spin_lock(&bo->bdev->fence_lock);
>   	ret = ttm_bo_wait(bo, false, false, false);
> -	spin_unlock(&bo->bdev->fence_lock);
>   
>   	if (unlikely(ret != 0))
>   		goto out;
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index d80d1e8..84d6e01 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -622,7 +622,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
>   	struct ttm_buffer_object *ghost_obj;
>   	void *tmp_obj = NULL;
>   
> -	spin_lock(&bdev->fence_lock);
>   	if (bo->sync_obj) {
>   		tmp_obj = bo->sync_obj;
>   		bo->sync_obj = NULL;
> @@ -630,7 +629,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
>   	bo->sync_obj = driver->sync_obj_ref(sync_obj);
>   	if (evict) {
>   		ret = ttm_bo_wait(bo, false, false, false);
> -		spin_unlock(&bdev->fence_lock);
>   		if (tmp_obj)
>   			driver->sync_obj_unref(&tmp_obj);
>   		if (ret)
> @@ -653,7 +651,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
>   		 */
>   
>   		set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> -		spin_unlock(&bdev->fence_lock);
>   		if (tmp_obj)
>   			driver->sync_obj_unref(&tmp_obj);
>   
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index a877813..55718c2 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -122,17 +122,14 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>   	 * move.
>   	 */
>   
> -	spin_lock(&bdev->fence_lock);
>   	if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
>   		ret = ttm_bo_wait(bo, false, true, false);
> -		spin_unlock(&bdev->fence_lock);
>   		if (unlikely(ret != 0)) {
>   			retval = (ret != -ERESTARTSYS) ?
>   			    VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
>   			goto out_unlock;
>   		}
> -	} else
> -		spin_unlock(&bdev->fence_lock);
> +	}
>   
>   	ret = ttm_mem_io_lock(man, true);
>   	if (unlikely(ret != 0)) {
> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> index 721886e..51b5e97 100644
> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> @@ -207,7 +207,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>   	driver = bdev->driver;
>   	glob = bo->glob;
>   
> -	spin_lock(&bdev->fence_lock);
>   	spin_lock(&glob->lru_lock);
>   
>   	list_for_each_entry(entry, list, head) {
> @@ -218,7 +217,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>   		entry->reserved = false;
>   	}
>   	spin_unlock(&glob->lru_lock);
> -	spin_unlock(&bdev->fence_lock);
>   
>   	list_for_each_entry(entry, list, head) {
>   		if (entry->old_sync_obj)
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> index ed3c1e7..e038c9a 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> @@ -251,13 +251,10 @@ static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv)
>   	volatile SVGA3dQueryResult *result;
>   	bool dummy;
>   	int ret;
> -	struct ttm_bo_device *bdev = &dev_priv->bdev;
>   	struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
>   
>   	ttm_bo_reserve(bo, false, false, false, 0);
> -	spin_lock(&bdev->fence_lock);
>   	ret = ttm_bo_wait(bo, false, false, false);
> -	spin_unlock(&bdev->fence_lock);
>   	if (unlikely(ret != 0))
>   		(void) vmw_fallback_wait(dev_priv, false, true, 0, false,
>   					 10*HZ);
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 816d9b1..6c69224 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -222,6 +222,8 @@ struct ttm_buffer_object {
>   	struct file *persistent_swap_storage;
>   	struct ttm_tt *ttm;
>   	bool evicted;
> +	void *sync_obj;
> +	unsigned long priv_flags;
>   
>   	/**
>   	 * Members protected by the bdev::lru_lock.
> @@ -242,16 +244,6 @@ struct ttm_buffer_object {
>   	atomic_t reserved;
>   
>   	/**
> -	 * Members protected by struct buffer_object_device::fence_lock
> -	 * In addition, setting sync_obj to anything else
> -	 * than NULL requires bo::reserved to be held. This allows for
> -	 * checking NULL while reserved but not holding the mentioned lock.
> -	 */
> -
> -	void *sync_obj;
> -	unsigned long priv_flags;
> -
> -	/**
>   	 * Members protected by the bdev::vm_lock
>   	 */
>   
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index 0c8c3b5..aac101b 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -515,8 +515,6 @@ struct ttm_bo_global {
>    *
>    * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
>    * @man: An array of mem_type_managers.
> - * @fence_lock: Protects the synchronizing members on *all* bos belonging
> - * to this device.
>    * @addr_space_mm: Range manager for the device address space.
>    * lru_lock: Spinlock that protects the buffer+device lru lists and
>    * ddestroy lists.
> @@ -539,7 +537,6 @@ struct ttm_bo_device {
>   	struct ttm_bo_driver *driver;
>   	rwlock_t vm_lock;
>   	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
> -	spinlock_t fence_lock;
>   	/*
>   	 * Protected by the vm lock.
>   	 */
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
Thomas Hellström (VMware) Oct. 18, 2012, 7:59 a.m. UTC | #2
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
> Hi, Maarten,
>
> As you know I have been having my doubts about this change.
> To me it seems insane to be forced to read the fence pointer under a
> reserved lock, simply because when you take the reserve lock, another
> process may have it and there is a substantial chance that that process
> will also be waiting for idle while holding the reserve lock.
>
> In essence, to read the fence pointer, there is a large chance you will
> be waiting for idle to be able to access it. That's why it's protected by
> a separate spinlock in the first place.
>
> So even if this change might not affect current driver much it's a change
> to the TTM api that leads to an IMHO very poor design.
>
> /Thomas
>

One way we could perhaps improve on this, if you think this is necessary,
is to build a bit on the initial rcu suggestion: still require reservation
when the fence pointer is modified, but also use rcu_assign_pointer() for
the fence pointer.

Anyone that wants quick access to the fence pointer would then use
rcu_dereference_x(), but if the fence is indeed signaled, that caller needs
to avoid setting the bo fence pointer to NULL.

A check for bo idle without taking any (bo) locks would then imply reading
the fence pointer in this fashion and, if it's non-NULL, checking whether
the fence is signaled.
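
To illustrate, a rough sketch only (the helper names are made up; the
sync_obj field would also need an __rcu annotation, and fences would have
to be freed in an rcu-safe way for the unlocked read to be valid):

#include <linux/rcupdate.h>
#include <drm/ttm/ttm_bo_driver.h>

/* Writer side: only while the bo is reserved, exactly as today. */
static inline void ttm_bo_set_sync_obj(struct ttm_buffer_object *bo, void *sync_obj)
{
        rcu_assign_pointer(bo->sync_obj, sync_obj);
}

/*
 * Reader side: no reservation taken. The reader must never clear
 * bo->sync_obj; only a reserved holder may do that.
 */
static inline bool ttm_bo_idle_unlocked(struct ttm_buffer_object *bo)
{
        struct ttm_bo_driver *driver = bo->bdev->driver;
        void *sync_obj;
        bool idle;

        rcu_read_lock();
        sync_obj = rcu_dereference(bo->sync_obj);
        idle = !sync_obj || driver->sync_obj_signaled(sync_obj);
        rcu_read_unlock();

        return idle;
}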

/Thomas

Maarten Lankhorst Oct. 18, 2012, 8:37 a.m. UTC | #3
Hey,

Op 18-10-12 09:59, Thomas Hellstrom schreef:
>
>
>
> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>> Hi, Maarten,
>>
>> As you know I have been having my doubts about this change.
>> To me it seems insane to be forced to read the fence pointer under a
>> reserved lock, simply because when you take the reserve lock, another
>> process may have it and there is a substantial chance that that process
>> will also be waiting for idle while holding the reserve lock.

I think it makes perfect sense; the only time you want to read the fence
is when you want to change the members protected by the reservation.

>> In essence, to read the fence pointer, there is a large chance you will
>> be waiting for idle to be able to access it. That's why it's protected by
>> a separate spinlock in the first place.
>>
>> So even if this change might not affect current driver much it's a change
>> to the TTM api that leads to an IMHO very poor design.

I would argue the opposite: no longer having a separate lock, with clear
semantics about when fencing is allowed, gives you a way to clean up the core
of ttm immensely.

There were only 2 functions affected by fence_lock removal, and they were
on buffer destruction: ttm_bo_cleanup_refs (and *_or_queue). But since
*_or_queue can simply change the order around, you only have to worry about
ttm_bo_cleanup_refs. That function is already ugly for other reasons, and
the followup patch I was suggesting cleaned up the ugliness there too.

The only thing done differently is backing off from the reservation early.
With the cleanup it won't even try to take the reservation again, since
nobody should set a new fence on the bo when it's dead. Instead, all
destruction is deferred until the list refcount drops to 0.

> One way we could perhaps improve on this, if you think this is necessary, is to
> build a bit on the initial rcu suggestion, still require reservation when the fence pointer is modified,
> but to also use rcu_assign_pointer() for the fence pointer.

This is massive overkill when the only time you read the fence pointer
without reservation is during buffer destruction. RCU is only good if
there are ~10x more reads than writes, and for us it's simply 50% reads,
50% writes.

> Anyone that wants quick access to the fence pointer would then use rcu_dereference_x(), but
> if the fence is indeed signaled, that caller needs to avoid setting the bo fence pointer to NULL.
>
> A check for bo idle without taking any (bo) locks would then mean imply reading the fence pointer
> in this fashion, and if it's non-NULL check whether the fence is signaled.

Sure it may look easier, but you add more overhead and complexity. I thought
you wanted to avoid overhead in the reservation path? RCU won't be the way
to do that.

~Maarten
Thomas Hellström (VMware) Oct. 18, 2012, 11:02 a.m. UTC | #4
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
> Hey,
>
> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>
>>
>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>> Hi, Maarten,
>>>
>>> As you know I have been having my doubts about this change.
>>> To me it seems insane to be forced to read the fence pointer under a
>>> reserved lock, simply because when you take the reserve lock, another
>>> process may have it and there is a substantial chance that that process
>>> will also be waiting for idle while holding the reserve lock.
> I think it makes perfect sense, the only times you want to read the fence
> is when you want to change the members protected by the reservation.

No, that's not true. A typical case (or the only case)
is where you want to do a map with no_wait semantics. You will want
to be able to access a buffer's results even if the eviction code
is about to schedule an unbind from the GPU, and have the buffer
reserved?

>
>>> In essence, to read the fence pointer, there is a large chance you will
>>> be waiting for idle to be able to access it. That's why it's protected by
>>> a separate spinlock in the first place.
>>>
>>> So even if this change might not affect current driver much it's a change
>>> to the TTM api that leads to an IMHO very poor design.
> I would argue the opposite, no longer having a separate lock, with clear
> semantics when fencing is allowed, gives you a way to clean up the core
> of ttm immensely.
>
> There were only 2 functions affected by fence lock removal and they were
> on buffer destruction, ttm_bo_cleanup_refs (and *_or_queue).

The actual code and the number of users are irrelevant here, since
we're discussing the implications of changing the API.

>> One way we could perhaps improve on this, if you think this is necessary, is to
>> build a bit on the initial rcu suggestion, still require reservation when the fence pointer is modified,
>> but to also use rcu_assign_pointer() for the fence pointer.
> This is a massive overkill when the only time you read the fence pointer
> without reservation is during buffer destruction. RCU is only good if
> there's ~10x more reads than writes, and for us it's simply 50% reads 50%
> writes..
>

I think you misunderstand me. I'm not suggesting going down the full RCU
path; I'm merely making sure that reads and writes to the bo's fence pointer
are atomic, using the RCU functions for this. I'm not suggesting any RCU
syncs. This means your patch can be kept largely as it is; just make sure
you do atomic writes to the fence pointers.

/Thomas
Maarten Lankhorst Oct. 18, 2012, 11:38 a.m. UTC | #5
Op 18-10-12 13:02, Thomas Hellstrom schreef:
> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>> Hey,
>>
>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>
>>>
>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>> Hi, Maarten,
>>>>
>>>> As you know I have been having my doubts about this change.
>>>> To me it seems insane to be forced to read the fence pointer under a
>>>> reserved lock, simply because when you take the reserve lock, another
>>>> process may have it and there is a substantial chance that that process
>>>> will also be waiting for idle while holding the reserve lock.
>> I think it makes perfect sense, the only times you want to read the fence
>> is when you want to change the members protected by the reservation.
>
> No, that's not true. A typical case (or the only case)
> is where you want to do a map with no_wait semantics. You will want
> to be able to access a buffer's results even if the eviction code
> is about to schedule an unbind from the GPU, and have the buffer
> reserved?
Well, either block on reserve or return -EBUSY if reserved; presumably the
former.

ttm_bo_vm_fault does the latter already, anyway.

You don't need to hold the reservation while performing the wait itself,
though: you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY,
and if so take a ref to the sync_obj member and then wait after unreserving.
You won't reset the sync_obj member to NULL in that case, but that should be
harmless. This will allow you to keep reservations fast and short. Maybe a
helper would be appropriate for this, since radeon and nouveau both seem to
do this.

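Roughly something like this (a sketch only; the name and error handling are
made up, and it assumes the reserved-only ttm_bo_wait semantics from this
patch):

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>

/* Wait for bo idle while holding the reservation only briefly. */
static int ttm_bo_wait_idle(struct ttm_buffer_object *bo, bool interruptible)
{
        struct ttm_bo_driver *driver = bo->bdev->driver;
        void *sync_obj;
        int ret;

        ret = ttm_bo_reserve(bo, interruptible, false, false, 0);
        if (ret)
                return ret;

        /* Fast path: already idle, or a signaled fence that gets cleared. */
        ret = ttm_bo_wait(bo, false, interruptible, true);
        if (ret != -EBUSY) {
                ttm_bo_unreserve(bo);
                return ret;
        }

        /* Busy: keep a reference so the actual wait happens unreserved. */
        sync_obj = driver->sync_obj_ref(bo->sync_obj);
        ttm_bo_unreserve(bo);

        ret = driver->sync_obj_wait(sync_obj, false, interruptible);
        driver->sync_obj_unref(&sync_obj);
        return ret;
}

That keeps the reservation window down to the pointer manipulation itself.
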
The next time someone wants to do a wait, it will go through the fast path
and unset the sync_obj member, since it's already signaled, or it's removed
when ttm_execbuf_util is used.

~Maarten
Thomas Hellström (VMware) Oct. 18, 2012, 11:55 a.m. UTC | #6
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>> Hey,
>>>
>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>
>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>> Hi, Maarten,
>>>>>
>>>>> As you know I have been having my doubts about this change.
>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>> process may have it and there is a substantial chance that that process
>>>>> will also be waiting for idle while holding the reserve lock.
>>> I think it makes perfect sense, the only times you want to read the fence
>>> is when you want to change the members protected by the reservation.
>> No, that's not true. A typical case (or the only case)
>> is where you want to do a map with no_wait semantics. You will want
>> to be able to access a buffer's results even if the eviction code
>> is about to schedule an unbind from the GPU, and have the buffer
>> reserved?
> Well either block on reserve or return -EBUSY if reserved, presumably the
> former..
>
> ttm_bo_vm_fault does the latter already, anyway

ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem; it's really
a waiting reserve, but for different reasons. Typically a user-space app will
keep asynchronous maps to TTM during a buffer lifetime, and the buffer
lifetime may be long, as user-space apps keep caches.
That's not the same as accessing a buffer after the GPU is done with it.

>
> You don't need to hold the reservation while performing the wait itself though,
> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
> take a ref to the sync_obj member and then wait after unreserving. You won't
> reset sync_obj member to NULL in that case, but that should be harmless.
> This will allow you to keep the reservations fast and short. Maybe a helper
> would be appropriate for this since radeon and nouveau both seem to do this.
>

The problem is that as long as other users are waiting for idle with the
reservation held, for example to switch GPU engine or to delete a GPU bind,
waiting for reserve will in many cases mean waiting for the GPU.

/Thomas
Maarten Lankhorst Oct. 18, 2012, 2:45 p.m. UTC | #7
Op 18-10-12 13:55, Thomas Hellstrom schreef:
> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>> Hey,
>>>>
>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>
>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>> Hi, Maarten,
>>>>>>
>>>>>> As you know I have been having my doubts about this change.
>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>> process may have it and there is a substantial chance that that process
>>>>>> will also be waiting for idle while holding the reserve lock.
>>>> I think it makes perfect sense, the only times you want to read the fence
>>>> is when you want to change the members protected by the reservation.
>>> No, that's not true. A typical case (or the only case)
>>> is where you want to do a map with no_wait semantics. You will want
>>> to be able to access a buffer's results even if the eviction code
>>> is about to schedule an unbind from the GPU, and have the buffer
>>> reserved?
>> Well either block on reserve or return -EBUSY if reserved, presumably the
>> former..
>>
>> ttm_bo_vm_fault does the latter already, anyway
>
> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
> a waiting reserve but for different reasons. Typically a user-space app will keep
> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
> be long as user-space apps keep caches.
> That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
>>
>> You don't need to hold the reservation while performing the wait itself though,
>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>> take a ref to the sync_obj member and then wait after unreserving. You won't
>> reset sync_obj member to NULL in that case, but that should be harmless.
>> This will allow you to keep the reservations fast and short. Maybe a helper
>> would be appropriate for this since radeon and nouveau both seem to do this.
>>
>
> The problem is that as long as other users are waiting for idle with reservation
> held, for example to switch GPU engine or to delete a GPU bind, waiting
> for reserve will in many case mean wait for GPU.
This example sounds inefficient; I know nouveau can do this, but it
essentially just stalls the GPU entirely. I guess you mean functions like
nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau
could be improved simply by fixing those cases up instead, since it stalls
the application completely for other uses too.

If reservations are held, it also becomes very easy to find all outliers;
I didn't hook this part of lockdep up yet, but I intend to. See
Documentation/lockstat.txt.

Lockstat becomes slightly trickier since we do multi-object reservation, but
we could follow the same path as lock_mutex_interruptible in the interrupted
case. If waiting on a reservation becomes a problem, I intend to make it a
very measurable problem. :-)

And the only reason I haven't fixed the nouveau function I mentioned is
because I wanted to see if I could make this show up as an issue, and
check how much it affects normal workloads.

All other places already did the reservation before wait, so it would be
really valuable to quantify how much of a problem this really is, and fix
up those pain points instead.

~Maarten
Thomas Hellström (VMware) Oct. 18, 2012, 4:43 p.m. UTC | #8
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>> Hey,
>>>>>
>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>> Hi, Maarten,
>>>>>>>
>>>>>>> As you know I have been having my doubts about this change.
>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>> is when you want to change the members protected by the reservation.
>>>> No, that's not true. A typical case (or the only case)
>>>> is where you want to do a map with no_wait semantics. You will want
>>>> to be able to access a buffer's results even if the eviction code
>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>> reserved?
>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>> former..
>>>
>>> ttm_bo_vm_fault does the latter already, anyway
>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>> a waiting reserve but for different reasons. Typically a user-space app will keep
>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>> be long as user-space apps keep caches.
>> That's not the same as accessing a buffer after the GPU is done with it.
> Ah indeed.
>>> You don't need to hold the reservation while performing the wait itself though,
>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>
>> The problem is that as long as other users are waiting for idle with reservation
>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>> for reserve will in many case mean wait for GPU.
> This example sounds inefficient, I know nouveau can do this, but this essentially
> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
> It wouldn't surprise me if performance in nouveau could be improved simply by
> fixing those cases up instead, since it stalls the application completely too for other uses.
>
Please see the Nouveau cpu_prep patch as well.

While there are a number of cases that can be fixed up, also in Radeon,
there's no way around the fact that reservation is a heavyweight lock
that, particularly on simpler hardware without support for fence ordering
with barriers and / or "semaphores" and accelerated eviction, will be
held while waiting for idle.

As such, it is unsuitable to protect read access to the fence pointer. 
If the intention is to keep a single fence pointer
I think the best solution is to allow reading the fence pointer outside 
reservation, but make sure this can be done
atomically. If the intention is to protect an array or list of fence 
pointers, I think a spinlock is the better solution.
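
For the single-pointer case, a minimal sketch of what I mean, essentially the
pattern this patch removes (whether the atomicity comes from a spinlock as
today or from an atomic pointer access is a detail):

	struct ttm_bo_driver *driver = bo->bdev->driver;
	void *sync_obj = NULL;
	int ret = 0;

	spin_lock(&bo->bdev->fence_lock);
	if (bo->sync_obj)
		sync_obj = driver->sync_obj_ref(bo->sync_obj);
	spin_unlock(&bo->bdev->fence_lock);

	if (sync_obj) {
		/* wait without holding the reservation or the spinlock */
		ret = driver->sync_obj_wait(sync_obj, false, interruptible);
		driver->sync_obj_unref(&sync_obj);
	}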

/Thomas
Jerome Glisse Oct. 18, 2012, 5:04 p.m. UTC | #9
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
> >Op 18-10-12 13:55, Thomas Hellstrom schreef:
> >>On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
> >>>Op 18-10-12 13:02, Thomas Hellstrom schreef:
> >>>>On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
> >>>>>Hey,
> >>>>>
> >>>>>Op 18-10-12 09:59, Thomas Hellstrom schreef:
> >>>>>>On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
> >>>>>>>Hi, Maarten,
> >>>>>>>
> >>>>>>>As you know I have been having my doubts about this change.
> >>>>>>>To me it seems insane to be forced to read the fence pointer under a
> >>>>>>>reserved lock, simply because when you take the reserve lock, another
> >>>>>>>process may have it and there is a substantial chance that that process
> >>>>>>>will also be waiting for idle while holding the reserve lock.
> >>>>>I think it makes perfect sense, the only times you want to read the fence
> >>>>>is when you want to change the members protected by the reservation.
> >>>>No, that's not true. A typical case (or the only case)
> >>>>is where you want to do a map with no_wait semantics. You will want
> >>>>to be able to access a buffer's results even if the eviction code
> >>>>is about to schedule an unbind from the GPU, and have the buffer
> >>>>reserved?
> >>>Well either block on reserve or return -EBUSY if reserved, presumably the
> >>>former..
> >>>
> >>>ttm_bo_vm_fault does the latter already, anyway
> >>ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
> >>a waiting reserve but for different reasons. Typically a user-space app will keep
> >>asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
> >>be long as user-space apps keep caches.
> >>That's not the same as accessing a buffer after the GPU is done with it.
> >Ah indeed.
> >>>You don't need to hold the reservation while performing the wait itself though,
> >>>you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
> >>>take a ref to the sync_obj member and then wait after unreserving. You won't
> >>>reset sync_obj member to NULL in that case, but that should be harmless.
> >>>This will allow you to keep the reservations fast and short. Maybe a helper
> >>>would be appropriate for this since radeon and nouveau both seem to do this.
> >>>
> >>The problem is that as long as other users are waiting for idle with reservation
> >>held, for example to switch GPU engine or to delete a GPU bind, waiting
> >>for reserve will in many case mean wait for GPU.
> >This example sounds inefficient, I know nouveau can do this, but this essentially
> >just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
> >It wouldn't surprise me if performance in nouveau could be improved simply by
> >fixing those cases up instead, since it stalls the application completely too for other uses.
> >
> Please see the Nouveau cpu_prep patch as well.
> 
> While there are a number of cases that can be fixed up, also in
> Radeon, there's no way around that reservation
> is a heavyweight lock that, particularly on simpler hardware without
> support for fence ordering
> with barriers and / or "semaphores" and accelerated eviction will be
> held while waiting for idle.
> 
> As such, it is unsuitable to protect read access to the fence
> pointer. If the intention is to keep a single fence pointer
> I think the best solution is to allow reading the fence pointer
> outside reservation, but make sure this can be done
> atomically. If the intention is to protect an array or list of fence
> pointers, I think a spinlock is the better solution.
> 
> /Thomas

Just wanted to point out that, like Thomas, I am concerned about having to
have the object reserved when waiting on its associated fence. I fear it
will hurt us somehow.

I will try to spend a couple of days stress testing your branch on radeon
to see if it hurts performance at all with current use cases.

Cheers,
Jerome
Dave Airlie March 20, 2014, 11:55 p.m. UTC | #10
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>> >Op 18-10-12 13:55, Thomas Hellstrom schreef:
>> >>On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>> >>>Op 18-10-12 13:02, Thomas Hellstrom schreef:
>> >>>>On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>> >>>>>Hey,
>> >>>>>
>> >>>>>Op 18-10-12 09:59, Thomas Hellstrom schreef:
>> >>>>>>On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>> >>>>>>>Hi, Maarten,
>> >>>>>>>
>> >>>>>>>As you know I have been having my doubts about this change.
>> >>>>>>>To me it seems insane to be forced to read the fence pointer under a
>> >>>>>>>reserved lock, simply because when you take the reserve lock, another
>> >>>>>>>process may have it and there is a substantial chance that that process
>> >>>>>>>will also be waiting for idle while holding the reserve lock.
>> >>>>>I think it makes perfect sense, the only times you want to read the fence
>> >>>>>is when you want to change the members protected by the reservation.
>> >>>>No, that's not true. A typical case (or the only case)
>> >>>>is where you want to do a map with no_wait semantics. You will want
>> >>>>to be able to access a buffer's results even if the eviction code
>> >>>>is about to schedule an unbind from the GPU, and have the buffer
>> >>>>reserved?
>> >>>Well either block on reserve or return -EBUSY if reserved, presumably the
>> >>>former..
>> >>>
>> >>>ttm_bo_vm_fault does the latter already, anyway
>> >>ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>> >>a waiting reserve but for different reasons. Typically a user-space app will keep
>> >>asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>> >>be long as user-space apps keep caches.
>> >>That's not the same as accessing a buffer after the GPU is done with it.
>> >Ah indeed.
>> >>>You don't need to hold the reservation while performing the wait itself though,
>> >>>you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>> >>>take a ref to the sync_obj member and then wait after unreserving. You won't
>> >>>reset sync_obj member to NULL in that case, but that should be harmless.
>> >>>This will allow you to keep the reservations fast and short. Maybe a helper
>> >>>would be appropriate for this since radeon and nouveau both seem to do this.
>> >>>
>> >>The problem is that as long as other users are waiting for idle with reservation
>> >>held, for example to switch GPU engine or to delete a GPU bind, waiting
>> >>for reserve will in many case mean wait for GPU.
>> >This example sounds inefficient, I know nouveau can do this, but this essentially
>> >just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>> >It wouldn't surprise me if performance in nouveau could be improved simply by
>> >fixing those cases up instead, since it stalls the application completely too for other uses.
>> >
>> Please see the Nouveau cpu_prep patch as well.
>>
>> While there are a number of cases that can be fixed up, also in
>> Radeon, there's no way around that reservation
>> is a heavyweight lock that, particularly on simpler hardware without
>> support for fence ordering
>> with barriers and / or "semaphores" and accelerated eviction will be
>> held while waiting for idle.
>>
>> As such, it is unsuitable to protect read access to the fence
>> pointer. If the intention is to keep a single fence pointer
>> I think the best solution is to allow reading the fence pointer
>> outside reservation, but make sure this can be done
>> atomically. If the intention is to protect an array or list of fence
>> pointers, I think a spinlock is the better solution.
>>
>> /Thomas
>
> Just wanted to point out that like Thomas i am concern about having to
> have object reserved when waiting on its associated fence. I fear it
> will hurt us somehow.
>
> I will try to spend couple days stress testing your branch on radeon
> trying to see if it hurts performance anyhow with current use case.
>

I've been trying to figure out what to do with Maarten's patches going
forward since they are essential for all kinds of SoC people,

However I'm still not 100% sure I believe that the problem is anything
more than a micro-optimisation, and possibly a premature one at that.
This needs someone to start suggesting benchmarks we can run and a
driver set to run them on, otherwise I'm starting to tend towards this
being an optimisation we can fix later,

The other option is to somehow merge this stuff under an option that
allows us to test it using a command line option, but I don't think
that is sane either,

So Maarten, Jerome, Thomas, please start discussing actual real world
tests you think merging this code will affect and then I can make a
better consideration.

Dave.
Thomas Hellström (VMware) March 21, 2014, 8:27 a.m. UTC | #11
On 03/21/2014 12:55 AM, Dave Airlie wrote:
> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>> Hey,
>>>>>>>>
>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>
>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>>>>> is when you want to change the members protected by the reservation.
>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>> is where you want to do a map with no_wait semantics. You will want
>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>> reserved?
>>>>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>>>>> former..
>>>>>>
>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>>>>> a waiting reserve but for different reasons. Typically a user-space app will keep
>>>>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>>>>> be long as user-space apps keep caches.
>>>>> That's not the same as accessing a buffer after the GPU is done with it.
>>>> Ah indeed.
>>>>>> You don't need to hold the reservation while performing the wait itself though,
>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>>>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>>>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>>>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>>>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>>>>
>>>>> The problem is that as long as other users are waiting for idle with reservation
>>>>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>>>>> for reserve will in many case mean wait for GPU.
>>>> This example sounds inefficient, I know nouveau can do this, but this essentially
>>>> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>>>> It wouldn't surprise me if performance in nouveau could be improved simply by
>>>> fixing those cases up instead, since it stalls the application completely too for other uses.
>>>>
>>> Please see the Nouveau cpu_prep patch as well.
>>>
>>> While there are a number of cases that can be fixed up, also in
>>> Radeon, there's no way around that reservation
>>> is a heavyweight lock that, particularly on simpler hardware without
>>> support for fence ordering
>>> with barriers and / or "semaphores" and accelerated eviction will be
>>> held while waiting for idle.
>>>
>>> As such, it is unsuitable to protect read access to the fence
>>> pointer. If the intention is to keep a single fence pointer
>>> I think the best solution is to allow reading the fence pointer
>>> outside reservation, but make sure this can be done
>>> atomically. If the intention is to protect an array or list of fence
>>> pointers, I think a spinlock is the better solution.
>>>
>>> /Thomas
>> Just wanted to point out that like Thomas i am concern about having to
>> have object reserved when waiting on its associated fence. I fear it
>> will hurt us somehow.
>>
>> I will try to spend couple days stress testing your branch on radeon
>> trying to see if it hurts performance anyhow with current use case.
>>
> I've been trying to figure out what to do with Maarten's patches going
> forward since they are essential for all kinds of SoC people,
>
> However I'm still not 100% sure I believe either the fact that the
> problem is anything more than a microoptimisation, and possibly a
> premature one at that, this needs someone to start suggesting
> benchmarks we can run and a driver set to run them on, otherwise I'm
> starting to tend towards we are taking about an optimisation we can
> fix later,
>
> The other option is to somehow merge this stuff under an option that
> allows us to test it using a command line option, but I don't think
> that is sane either,
>
> So Maarten, Jerome, Thomas, please start discussing actual real world
> tests you think merging this code will affect and then I can make a
> better consideration.
>
> Dave.

Dave,

This is IMHO a fundamental design discussion, not a micro-optimization.

I'm pretty sure all sorts of horrendous things could be done to the DRM
design without affecting real world application performance.

In TTM, data protection is primarily done with spinlocks. This serves two
purposes.

a) You can't unnecessarily hold a data protection lock while waiting for
the GPU, which is typically a very stupid thing to do (perhaps not so in
this particular case), but when the sleep-while-atomic or locking-inversion
kernel warning turns up, that should at least ring a bell. Trying to
implement dma-buf fencing using the TTM fencing callbacks will AFAICT
cause a locking inversion.

b) Spinlocks essentially go away on UP systems. The whole reservation
path was essentially lock-free on UP systems pre dma-buf integration,
with very few atomic locking operations even on SMP systems. This was
all prompted by a test some years ago (by Eric Anholt IIRC) showing that
locking operations turned up quite high in Intel driver profiling.

If we start protecting data with reservations, while we also export the
reservation locks so that people find them a convenient "serialize work
on this buffer" lock, all kinds of interesting things might happen, and
we might end up in a situation similar to protecting everything with
struct_mutex.

This is why I dislike this change. In particular when there is a very
simple remedy.

But if I can get an ACK to convert the reservation object fence pointers
and associated operations on them to be rcu-safe when I have some time
left, I'd be prepared to accept this patch series in its current state.
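
To be concrete about what rcu-safe would mean here (field and helper names
below are made up, purely to illustrate): the fence pointer would still only
be written while reserved, but published with rcu_assign_pointer() so readers
can use rcu_dereference() without reserving, and the final free of a fence
would go through kfree_rcu(). Roughly:

	/* write side, still done under the reservation only */
	old = rcu_dereference_protected(bo->fence, ttm_bo_is_reserved(bo));
	rcu_assign_pointer(bo->fence, new_fence);
	if (old)
		kref_put(&old->kref, fence_release_rcu); /* kfree_rcu() inside */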

/Thomas
Maarten Lankhorst March 21, 2014, 12:12 p.m. UTC | #12
Hey,

op 21-03-14 09:27, Thomas Hellstrom schreef:
> On 03/21/2014 12:55 AM, Dave Airlie wrote:
>> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
>>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>>> Hey,
>>>>>>>>>
>>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>>
>>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>>>>>> is when you want to change the members protected by the reservation.
>>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>>> is where you want to do a map with no_wait semantics. You will want
>>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>>> reserved?
>>>>>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>>>>>> former..
>>>>>>>
>>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>>>>>> a waiting reserve but for different reasons. Typically a user-space app will keep
>>>>>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>>>>>> be long as user-space apps keep caches.
>>>>>> That's not the same as accessing a buffer after the GPU is done with it.
>>>>> Ah indeed.
>>>>>>> You don't need to hold the reservation while performing the wait itself though,
>>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>>>>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>>>>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>>>>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>>>>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>>>>>
>>>>>> The problem is that as long as other users are waiting for idle with reservation
>>>>>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>>>>>> for reserve will in many case mean wait for GPU.
>>>>> This example sounds inefficient, I know nouveau can do this, but this essentially
>>>>> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>>>>> It wouldn't surprise me if performance in nouveau could be improved simply by
>>>>> fixing those cases up instead, since it stalls the application completely too for other uses.
>>>>>
>>>> Please see the Nouveau cpu_prep patch as well.
>>>>
>>>> While there are a number of cases that can be fixed up, also in
>>>> Radeon, there's no way around that reservation
>>>> is a heavyweight lock that, particularly on simpler hardware without
>>>> support for fence ordering
>>>> with barriers and / or "semaphores" and accelerated eviction will be
>>>> held while waiting for idle.
>>>>
>>>> As such, it is unsuitable to protect read access to the fence
>>>> pointer. If the intention is to keep a single fence pointer
>>>> I think the best solution is to allow reading the fence pointer
>>>> outside reservation, but make sure this can be done
>>>> atomically. If the intention is to protect an array or list of fence
>>>> pointers, I think a spinlock is the better solution.
>>>>
>>>> /Thomas
>>> Just wanted to point out that like Thomas i am concern about having to
>>> have object reserved when waiting on its associated fence. I fear it
>>> will hurt us somehow.
>>>
>>> I will try to spend couple days stress testing your branch on radeon
>>> trying to see if it hurts performance anyhow with current use case.
>>>
>> I've been trying to figure out what to do with Maarten's patches going
>> forward since they are essential for all kinds of SoC people,
>>
>> However I'm still not 100% sure I believe either the fact that the
>> problem is anything more than a microoptimisation, and possibly a
>> premature one at that, this needs someone to start suggesting
>> benchmarks we can run and a driver set to run them on, otherwise I'm
>> starting to tend towards we are taking about an optimisation we can
>> fix later,
>>
>> The other option is to somehow merge this stuff under an option that
>> allows us to test it using a command line option, but I don't think
>> that is sane either,
>>
>> So Maarten, Jerome, Thomas, please start discussing actual real world
>> tests you think merging this code will affect and then I can make a
>> better consideration.
>>
>> Dave.
> Dave,
>
> This is IMHO a fundamental design discussion, not a micro-optimization.
>
> I'm pretty sure all sorts of horrendous things could be done to the DRM
> design without affecting real world application performance.
>
> In TTM data protection is primarily done with spinlocks: This serves two
> purposes.
>
> a) You can't unnecessarily hold a data protection lock while waiting for
> GPU, which is typically a very stupid thing to do (perhaps not so in
> this particular case) but when the sleep while atomic or locking
> inversion kernel warning turns up, that should at least
> ring a bell. Trying to implement dma-buf fencing using the TTM fencing
> callbacks will AFAICT cause a locking inversion.
>
> b) Spinlocks essentially go away on UP systems. The whole reservation
> path was essentially lock-free on UP systems pre dma-buf integration,
> and with very few atomic locking operations even on SMP systems. It was
> all prompted by a test some years ago (by Eric Anholt IIRC) showing that
> locking operations turned up quite high on Intel driver profiling.
>
> If we start protecting data with reservations, when we also export the
> reservation locks, so that people find them a convenient "serialize work
> on this buffer" lock, all kind of interesting things might happen, and
> we might end up in a situation
> similar to protecting everything with struct_mutex.
>
> This is why I dislike this change. In particular when there is a very
> simple remedy.
>
> But if I can get an ACK to convert the reservation object fence pointers
> and associated operations on them to be rcu-safe when I have some time
> left, I'd be prepared to accept this patch series in it's current state.
RCU could be a useful way to deal with this. But in my case I've shown there are very few places where it's needed; core ttm does not need it at all.
This is why I wanted to leave it to individual drivers to implement it.

I think using kfree_rcu to free the fence in the driver itself should be enough,
and obtaining a reference in the driver would require something like this, similar to what's in rcuref.txt:

rcu_read_lock();
f = rcu_dereference(bo->fence);
if (f && !kref_get_unless_zero(&f->kref)) f = NULL;
rcu_read_unlock();

if (f) {
	// do stuff here
	...

	// free fence
	kref_put(&f->kref, fence_put_with_kfree_rcu);
}

Am I wrong, or is something like this enough to make sure the fence is still alive?
There might still be a small bug if bo->fence's refcount is decreased before bo->fence is set to NULL. I haven't checked core ttm, so that might need fixing.
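
If that does turn out to be a problem, clearing the pointer before dropping
the reference should be enough to close it (untested sketch, same made-up
names as above):

old = bo->fence;
rcu_assign_pointer(bo->fence, NULL);
if (old)
	kref_put(&old->kref, fence_put_with_kfree_rcu);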

I added some more people to CC. It might be worthwhile having this in the core fence code and deleting all fences with RCU, but I'm not completely certain about that yet.

~Maarten
Thomas Hellström (VMware) March 21, 2014, 1:04 p.m. UTC | #13
On 03/21/2014 01:12 PM, Maarten Lankhorst wrote:
> Hey,
>
> op 21-03-14 09:27, Thomas Hellstrom schreef:
>> On 03/21/2014 12:55 AM, Dave Airlie wrote:
>>> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com>
>>> wrote:
>>>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>>>> Hey,
>>>>>>>>>>
>>>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>>>
>>>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>>>> To me it seems insane to be forced to read the fence
>>>>>>>>>>>> pointer under a
>>>>>>>>>>>> reserved lock, simply because when you take the reserve
>>>>>>>>>>>> lock, another
>>>>>>>>>>>> process may have it and there is a substantial chance that
>>>>>>>>>>>> that process
>>>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>>>> I think it makes perfect sense, the only times you want to
>>>>>>>>>> read the fence
>>>>>>>>>> is when you want to change the members protected by the
>>>>>>>>>> reservation.
>>>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>>>> is where you want to do a map with no_wait semantics. You will
>>>>>>>>> want
>>>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>>>> reserved?
>>>>>>>> Well either block on reserve or return -EBUSY if reserved,
>>>>>>>> presumably the
>>>>>>>> former..
>>>>>>>>
>>>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem,
>>>>>>> it's really
>>>>>>> a waiting reserve but for different reasons. Typically a
>>>>>>> user-space app will keep
>>>>>>> asynchronous maps to TTM during a buffer lifetime, and the
>>>>>>> buffer lifetime may
>>>>>>> be long as user-space apps keep caches.
>>>>>>> That's not the same as accessing a buffer after the GPU is done
>>>>>>> with it.
>>>>>> Ah indeed.
>>>>>>>> You don't need to hold the reservation while performing the
>>>>>>>> wait itself though,
>>>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns
>>>>>>>> -EBUSY, and if so
>>>>>>>> take a ref to the sync_obj member and then wait after
>>>>>>>> unreserving. You won't
>>>>>>>> reset sync_obj member to NULL in that case, but that should be
>>>>>>>> harmless.
>>>>>>>> This will allow you to keep the reservations fast and short.
>>>>>>>> Maybe a helper
>>>>>>>> would be appropriate for this since radeon and nouveau both
>>>>>>>> seem to do this.
>>>>>>>>
>>>>>>> The problem is that as long as other users are waiting for idle
>>>>>>> with reservation
>>>>>>> held, for example to switch GPU engine or to delete a GPU bind,
>>>>>>> waiting
>>>>>>> for reserve will in many case mean wait for GPU.
>>>>>> This example sounds inefficient, I know nouveau can do this, but
>>>>>> this essentially
>>>>>> just stalls the gpu entirely. I think guess you mean functions
>>>>>> like nouveau_gem_object_close?
>>>>>> It wouldn't surprise me if performance in nouveau could be
>>>>>> improved simply by
>>>>>> fixing those cases up instead, since it stalls the application
>>>>>> completely too for other uses.
>>>>>>
>>>>> Please see the Nouveau cpu_prep patch as well.
>>>>>
>>>>> While there are a number of cases that can be fixed up, also in
>>>>> Radeon, there's no way around that reservation
>>>>> is a heavyweight lock that, particularly on simpler hardware without
>>>>> support for fence ordering
>>>>> with barriers and / or "semaphores" and accelerated eviction will be
>>>>> held while waiting for idle.
>>>>>
>>>>> As such, it is unsuitable to protect read access to the fence
>>>>> pointer. If the intention is to keep a single fence pointer
>>>>> I think the best solution is to allow reading the fence pointer
>>>>> outside reservation, but make sure this can be done
>>>>> atomically. If the intention is to protect an array or list of fence
>>>>> pointers, I think a spinlock is the better solution.
>>>>>
>>>>> /Thomas
>>>> Just wanted to point out that like Thomas i am concern about having to
>>>> have object reserved when waiting on its associated fence. I fear it
>>>> will hurt us somehow.
>>>>
>>>> I will try to spend couple days stress testing your branch on radeon
>>>> trying to see if it hurts performance anyhow with current use case.
>>>>
>>> I've been trying to figure out what to do with Maarten's patches going
>>> forward since they are essential for all kinds of SoC people,
>>>
>>> However I'm still not 100% sure I believe either the fact that the
>>> problem is anything more than a microoptimisation, and possibly a
>>> premature one at that, this needs someone to start suggesting
>>> benchmarks we can run and a driver set to run them on, otherwise I'm
>>> starting to tend towards we are taking about an optimisation we can
>>> fix later,
>>>
>>> The other option is to somehow merge this stuff under an option that
>>> allows us to test it using a command line option, but I don't think
>>> that is sane either,
>>>
>>> So Maarten, Jerome, Thomas, please start discussing actual real world
>>> tests you think merging this code will affect and then I can make a
>>> better consideration.
>>>
>>> Dave.
>> Dave,
>>
>> This is IMHO a fundamental design discussion, not a micro-optimization.
>>
>> I'm pretty sure all sorts of horrendous things could be done to the DRM
>> design without affecting real world application performance.
>>
>> In TTM data protection is primarily done with spinlocks: This serves two
>> purposes.
>>
>> a) You can't unnecessarily hold a data protection lock while waiting for
>> GPU, which is typically a very stupid thing to do (perhaps not so in
>> this particular case) but when the sleep while atomic or locking
>> inversion kernel warning turns up, that should at least
>> ring a bell. Trying to implement dma-buf fencing using the TTM fencing
>> callbacks will AFAICT cause a locking inversion.
>>
>> b) Spinlocks essentially go away on UP systems. The whole reservation
>> path was essentially lock-free on UP systems pre dma-buf integration,
>> and with very few atomic locking operations even on SMP systems. It was
>> all prompted by a test some years ago (by Eric Anholt IIRC) showing that
>> locking operations turned up quite high on Intel driver profiling.
>>
>> If we start protecting data with reservations, when we also export the
>> reservation locks, so that people find them a convenient "serialize work
>> on this buffer" lock, all kind of interesting things might happen, and
>> we might end up in a situation
>> similar to protecting everything with struct_mutex.
>>
>> This is why I dislike this change. In particular when there is a very
>> simple remedy.
>>
>> But if I can get an ACK to convert the reservation object fence pointers
>> and associated operations on them to be rcu-safe when I have some time
>> left, I'd be prepared to accept this patch series in it's current state.
> RCU could be a useful way to deal with this. But in my case I've shown
> there are very few places where it's needed, core ttm does not need it
> at all.
> This is why I wanted to leave it to individual drivers to implement it.
>
> I think kfree_rcu for free in the driver itself should be enough,
> and obtaining in the driver would require something like this, similar
> to whats in rcuref.txt:
>
> rcu_read_lock();
> f = rcu_dereference(bo->fence);
> if (f && !kref_get_unless-zero(&f->kref)) f = NULL;
> rcu_read_unlock();
>
> if (f) {
> // do stuff here
> ...
>
> // free fence
> kref_put(&f->kref, fence_put_with_kfree_rcu);
> }
>
> Am I wrong or is something like this enough to make sure fence is
> still alive?

No, you're correct.

And a bo check for idle would amount to the following (since we don't care
if the fence refcount is zero):

rcu_read_lock();
f = rcu_dereference(bo->fence);
signaled = !f || f->signaled;
rcu_read_unlock();

/Thomas

> There might still be a small bug when bo->fence's refcount is
> decreased before bo->fence is set to null. I haven't checked core ttm
> so that might need fixing.
>
> I added some more people to CC. It might be worthwhile having this in
> the core fence code and delete all fences with rcu, but I'm not
> completely certain about that yet.
>
> ~Maarten
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index cee7448..9749c45 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1449,10 +1449,8 @@  nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence)
 	if (likely(fence))
 		nouveau_fence_ref(fence);
 
-	spin_lock(&nvbo->bo.bdev->fence_lock);
 	old_fence = nvbo->bo.sync_obj;
 	nvbo->bo.sync_obj = fence;
-	spin_unlock(&nvbo->bo.bdev->fence_lock);
 
 	nouveau_fence_unref(&old_fence);
 }
@@ -1552,9 +1550,7 @@  nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
 	if (vma->node) {
 		if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
 			ttm_bo_reserve(&nvbo->bo, false, false, false, 0);
-			spin_lock(&nvbo->bo.bdev->fence_lock);
 			ttm_bo_wait(&nvbo->bo, false, false, false);
-			spin_unlock(&nvbo->bo.bdev->fence_lock);
 			ttm_bo_unreserve(&nvbo->bo);
 			nouveau_vm_unmap(vma);
 		}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 6d8391d..eaa700a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -391,18 +391,11 @@  retry:
 static int
 validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo)
 {
-	struct nouveau_fence *fence = NULL;
+	struct nouveau_fence *fence = nvbo->bo.sync_obj;
 	int ret = 0;
 
-	spin_lock(&nvbo->bo.bdev->fence_lock);
-	if (nvbo->bo.sync_obj)
-		fence = nouveau_fence_ref(nvbo->bo.sync_obj);
-	spin_unlock(&nvbo->bo.bdev->fence_lock);
-
-	if (fence) {
+	if (fence)
 		ret = nouveau_fence_sync(fence, chan);
-		nouveau_fence_unref(&fence);
-	}
 
 	return ret;
 }
@@ -614,9 +607,7 @@  nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
 				data |= r->vor;
 		}
 
-		spin_lock(&nvbo->bo.bdev->fence_lock);
 		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
-		spin_unlock(&nvbo->bo.bdev->fence_lock);
 		if (ret) {
 			NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret);
 			break;
@@ -848,11 +839,9 @@  nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
 
 	ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
 	if (!ret) {
-		spin_lock(&nvbo->bo.bdev->fence_lock);
 		ret = ttm_bo_wait(&nvbo->bo, true, true, true);
 		if (!no_wait && ret)
 			fence = nouveau_fence_ref(nvbo->bo.sync_obj);
-		spin_unlock(&nvbo->bo.bdev->fence_lock);
 
 		ttm_bo_unreserve(&nvbo->bo);
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 808c444..df430e4 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -613,12 +613,10 @@  int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait)
 	r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
 	if (unlikely(r != 0))
 		return r;
-	spin_lock(&bo->tbo.bdev->fence_lock);
 	if (mem_type)
 		*mem_type = bo->tbo.mem.mem_type;
 	if (bo->tbo.sync_obj)
 		r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
-	spin_unlock(&bo->tbo.bdev->fence_lock);
 	ttm_bo_unreserve(&bo->tbo);
 	return r;
 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index f6d7026..69fc29b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -498,28 +498,23 @@  static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
-	struct ttm_bo_driver *driver;
+	struct ttm_bo_driver *driver = bdev->driver;
 	void *sync_obj = NULL;
 	int put_count;
 	int ret;
 
-	spin_lock(&bdev->fence_lock);
-	(void) ttm_bo_wait(bo, false, false, true);
-	if (!bo->sync_obj) {
-
-		spin_lock(&glob->lru_lock);
-
-		/**
-		 * Lock inversion between bo:reserve and bdev::fence_lock here,
-		 * but that's OK, since we're only trylocking.
-		 */
-
-		ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+	spin_lock(&glob->lru_lock);
+	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+	if (!ret) {
+		ret = ttm_bo_wait(bo, false, false, true);
 
-		if (unlikely(ret == -EBUSY))
+		if (unlikely(ret == -EBUSY)) {
+			sync_obj = driver->sync_obj_ref(bo->sync_obj);
+			atomic_set(&bo->reserved, 0);
+			wake_up_all(&bo->event_queue);
 			goto queue;
+		}
 
-		spin_unlock(&bdev->fence_lock);
 		put_count = ttm_bo_del_from_lru(bo);
 
 		spin_unlock(&glob->lru_lock);
@@ -528,18 +523,11 @@  static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 		ttm_bo_list_ref_sub(bo, put_count, true);
 
 		return;
-	} else {
-		spin_lock(&glob->lru_lock);
 	}
 queue:
-	driver = bdev->driver;
-	if (bo->sync_obj)
-		sync_obj = driver->sync_obj_ref(bo->sync_obj);
-
 	kref_get(&bo->list_kref);
 	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
 	spin_unlock(&glob->lru_lock);
-	spin_unlock(&bdev->fence_lock);
 
 	if (sync_obj) {
 		driver->sync_obj_flush(sync_obj);
@@ -565,25 +553,15 @@  static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 			       bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
+	struct ttm_bo_driver *driver = bdev->driver;
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count;
 	int ret = 0;
+	void *sync_obj;
 
 retry:
-	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bdev->fence_lock);
-
-	if (unlikely(ret != 0))
-		return ret;
-
 	spin_lock(&glob->lru_lock);
 
-	if (unlikely(list_empty(&bo->ddestroy))) {
-		spin_unlock(&glob->lru_lock);
-		return 0;
-	}
-
 	ret = ttm_bo_reserve_locked(bo, interruptible,
 				    no_wait_reserve, false, 0);
 
@@ -592,18 +570,37 @@  retry:
 		return ret;
 	}
 
-	/**
-	 * We can re-check for sync object without taking
-	 * the bo::lock since setting the sync object requires
-	 * also bo::reserved. A busy object at this point may
-	 * be caused by another thread recently starting an accelerated
-	 * eviction.
-	 */
+	if (unlikely(list_empty(&bo->ddestroy))) {
+		atomic_set(&bo->reserved, 0);
+		wake_up_all(&bo->event_queue);
+		spin_unlock(&glob->lru_lock);
+		return 0;
+	}
+	ret = ttm_bo_wait(bo, false, false, true);
+
+	if (ret) {
+		if (no_wait_gpu) {
+			atomic_set(&bo->reserved, 0);
+			wake_up_all(&bo->event_queue);
+			spin_unlock(&glob->lru_lock);
+			return ret;
+		}
 
-	if (unlikely(bo->sync_obj)) {
+		/**
+		 * Take a reference to the fence and unreserve, if the wait
+		 * was succesful and no new sync_obj was attached,
+		 * ttm_bo_wait in retry will return ret = 0, and end the loop.
+		 */
+
+		sync_obj = driver->sync_obj_ref(&bo->sync_obj);
 		atomic_set(&bo->reserved, 0);
 		wake_up_all(&bo->event_queue);
 		spin_unlock(&glob->lru_lock);
+
+		ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible);
+		driver->sync_obj_unref(&sync_obj);
+		if (ret)
+			return ret;
 		goto retry;
 	}
 
@@ -735,9 +732,7 @@  static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
 	struct ttm_placement placement;
 	int ret = 0;
 
-	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0)) {
 		if (ret != -ERESTARTSYS) {
@@ -1054,7 +1049,6 @@  int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
 {
 	int ret = 0;
 	struct ttm_mem_reg mem;
-	struct ttm_bo_device *bdev = bo->bdev;
 
 	BUG_ON(!ttm_bo_is_reserved(bo));
 
@@ -1063,9 +1057,7 @@  int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
 	 * Have the driver move function wait for idle when necessary,
 	 * instead of doing it here.
 	 */
-	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bdev->fence_lock);
 	if (ret)
 		return ret;
 	mem.num_pages = bo->num_pages;
@@ -1554,7 +1546,6 @@  int ttm_bo_device_init(struct ttm_bo_device *bdev,
 	bdev->glob = glob;
 	bdev->need_dma32 = need_dma32;
 	bdev->val_seq = 0;
-	spin_lock_init(&bdev->fence_lock);
 	mutex_lock(&glob->device_list_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	mutex_unlock(&glob->device_list_mutex);
@@ -1690,52 +1681,33 @@  int ttm_bo_wait(struct ttm_buffer_object *bo,
 		bool lazy, bool interruptible, bool no_wait)
 {
 	struct ttm_bo_driver *driver = bo->bdev->driver;
-	struct ttm_bo_device *bdev = bo->bdev;
-	void *sync_obj;
 	int ret = 0;
 
+	WARN_ON_ONCE(!ttm_bo_is_reserved(bo));
+
 	if (likely(bo->sync_obj == NULL))
 		return 0;
 
-	while (bo->sync_obj) {
-
+	if (bo->sync_obj) {
 		if (driver->sync_obj_signaled(bo->sync_obj)) {
 			void *tmp_obj = bo->sync_obj;
 			bo->sync_obj = NULL;
 			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
-			continue;
+			return 0;
 		}
 
 		if (no_wait)
 			return -EBUSY;
 
-		sync_obj = driver->sync_obj_ref(bo->sync_obj);
-		spin_unlock(&bdev->fence_lock);
-		ret = driver->sync_obj_wait(sync_obj,
+		ret = driver->sync_obj_wait(bo->sync_obj,
 					    lazy, interruptible);
 		if (unlikely(ret != 0)) {
-			driver->sync_obj_unref(&sync_obj);
-			spin_lock(&bdev->fence_lock);
 			return ret;
 		}
-		spin_lock(&bdev->fence_lock);
-		if (likely(bo->sync_obj == sync_obj)) {
-			void *tmp_obj = bo->sync_obj;
-			bo->sync_obj = NULL;
-			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
-				  &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&sync_obj);
-			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
-		} else {
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&sync_obj);
-			spin_lock(&bdev->fence_lock);
-		}
+		driver->sync_obj_unref(&bo->sync_obj);
+		clear_bit(TTM_BO_PRIV_FLAG_MOVING,
+			  &bo->priv_flags);
 	}
 	return 0;
 }
@@ -1799,9 +1771,7 @@  static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 	 * Wait for GPU, then move to system cached.
 	 */
 
-	spin_lock(&bo->bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, false, false);
-	spin_unlock(&bo->bdev->fence_lock);
 
 	if (unlikely(ret != 0))
 		goto out;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index d80d1e8..84d6e01 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -622,7 +622,6 @@  int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 	struct ttm_buffer_object *ghost_obj;
 	void *tmp_obj = NULL;
 
-	spin_lock(&bdev->fence_lock);
 	if (bo->sync_obj) {
 		tmp_obj = bo->sync_obj;
 		bo->sync_obj = NULL;
@@ -630,7 +629,6 @@  int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 	bo->sync_obj = driver->sync_obj_ref(sync_obj);
 	if (evict) {
 		ret = ttm_bo_wait(bo, false, false, false);
-		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
 		if (ret)
@@ -653,7 +651,6 @@  int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 		 */
 
 		set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index a877813..55718c2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -122,17 +122,14 @@  static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * move.
 	 */
 
-	spin_lock(&bdev->fence_lock);
 	if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
 		ret = ttm_bo_wait(bo, false, true, false);
-		spin_unlock(&bdev->fence_lock);
 		if (unlikely(ret != 0)) {
 			retval = (ret != -ERESTARTSYS) ?
 			    VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
 			goto out_unlock;
 		}
-	} else
-		spin_unlock(&bdev->fence_lock);
+	}
 
 	ret = ttm_mem_io_lock(man, true);
 	if (unlikely(ret != 0)) {
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 721886e..51b5e97 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -207,7 +207,6 @@  void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 	driver = bdev->driver;
 	glob = bo->glob;
 
-	spin_lock(&bdev->fence_lock);
 	spin_lock(&glob->lru_lock);
 
 	list_for_each_entry(entry, list, head) {
@@ -218,7 +217,6 @@  void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 		entry->reserved = false;
 	}
 	spin_unlock(&glob->lru_lock);
-	spin_unlock(&bdev->fence_lock);
 
 	list_for_each_entry(entry, list, head) {
 		if (entry->old_sync_obj)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index ed3c1e7..e038c9a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -251,13 +251,10 @@  static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv)
 	volatile SVGA3dQueryResult *result;
 	bool dummy;
 	int ret;
-	struct ttm_bo_device *bdev = &dev_priv->bdev;
 	struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
 
 	ttm_bo_reserve(bo, false, false, false, 0);
-	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, false, false);
-	spin_unlock(&bdev->fence_lock);
 	if (unlikely(ret != 0))
 		(void) vmw_fallback_wait(dev_priv, false, true, 0, false,
 					 10*HZ);
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 816d9b1..6c69224 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -222,6 +222,8 @@  struct ttm_buffer_object {
 	struct file *persistent_swap_storage;
 	struct ttm_tt *ttm;
 	bool evicted;
+	void *sync_obj;
+	unsigned long priv_flags;
 
 	/**
 	 * Members protected by the bdev::lru_lock.
@@ -242,16 +244,6 @@  struct ttm_buffer_object {
 	atomic_t reserved;
 
 	/**
-	 * Members protected by struct buffer_object_device::fence_lock
-	 * In addition, setting sync_obj to anything else
-	 * than NULL requires bo::reserved to be held. This allows for
-	 * checking NULL while reserved but not holding the mentioned lock.
-	 */
-
-	void *sync_obj;
-	unsigned long priv_flags;
-
-	/**
 	 * Members protected by the bdev::vm_lock
 	 */
 
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 0c8c3b5..aac101b 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -515,8 +515,6 @@  struct ttm_bo_global {
  *
  * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
  * @man: An array of mem_type_managers.
- * @fence_lock: Protects the synchronizing members on *all* bos belonging
- * to this device.
  * @addr_space_mm: Range manager for the device address space.
  * lru_lock: Spinlock that protects the buffer+device lru lists and
  * ddestroy lists.
@@ -539,7 +537,6 @@  struct ttm_bo_device {
 	struct ttm_bo_driver *driver;
 	rwlock_t vm_lock;
 	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
-	spinlock_t fence_lock;
 	/*
 	 * Protected by the vm lock.
 	 */