
[04/10] drm/ttm: change fence_lock to inner lock, v3

Message ID 1352728811-21860-4-git-send-email-maarten.lankhorst@canonical.com (mailing list archive)
State New, archived

Commit Message

Maarten Lankhorst Nov. 12, 2012, 2 p.m. UTC
I changed the hierarchy to make fence_lock the innermost lock instead of
an outer lock. This simplifies things slightly and hopefully makes it
easier to make fence_lock global at some point, should that be needed.

To make things clearer, I changed the order around in ttm_bo_cleanup_refs
and ttm_bo_cleanup_refs_or_queue.

A reservation is taken first, then the fence lock is taken and a wait is attempted.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>

v2:
 - fix conflict with upstream race fix, simplifies ttm_bo_cleanup_refs
v3:
 - change removal of fence_lock to making it an inner lock instead
---
 drivers/gpu/drm/ttm/ttm_bo.c           | 95 ++++++++++++++++------------------
 drivers/gpu/drm/ttm/ttm_execbuf_util.c |  4 +-
 2 files changed, 48 insertions(+), 51 deletions(-)
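
For orientation, a rough sketch of the resulting lock nesting (not part of the
patch; function names and arguments are taken from the diff quoted in the
comments below, and error handling and LRU re-adding are omitted):

/* Sketch only: illustrates the new nesting, with fence_lock innermost. */
static void lock_order_sketch(struct ttm_buffer_object *bo)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_bo_global *glob = bo->glob;

	spin_lock(&glob->lru_lock);                     /* outer lock */
	if (ttm_bo_reserve_locked(bo, false, true, false, 0) == 0) {
		spin_lock(&bdev->fence_lock);           /* innermost lock */
		(void) ttm_bo_wait(bo, false, false, true);
		spin_unlock(&bdev->fence_lock);

		/* unreserve; re-adding the bo to the LRU is omitted here */
		atomic_set(&bo->reserved, 0);
		wake_up_all(&bo->event_queue);
	}
	spin_unlock(&glob->lru_lock);
}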

Comments

Thomas Hellstrom Nov. 19, 2012, 2:17 p.m. UTC | #1
Hi,

This patch looks mostly good, although I think ttm_bo_cleanup_refs 
becomes overly complicated:
Could this do, or am I missing something?


static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
                    bool interruptible,
                    bool no_wait_reserve,
                    bool no_wait_gpu)
{
     struct ttm_bo_device *bdev = bo->bdev;
     struct ttm_bo_global *glob = bo->glob;
     int put_count;
     int ret = 0;

     /*
      * First, reserve while making sure we're still on the
      * ddestroy list.
      */
retry_reserve:
     spin_lock(&glob->lru_lock);

     if (unlikely(list_empty(&bo->ddestroy))) {
         spin_unlock(&glob->lru_lock);
         return 0;
     }

     ret = ttm_bo_reserve_locked(bo, false, true, false, 0);

     if (unlikely(ret == -EBUSY)) {
         spin_unlock(&glob->lru_lock);
         if (likely(!no_wait_reserve))
             ret = ttm_bo_wait_unreserved(bo, interruptible);
         if (unlikely(ret != 0))
             return ret;

         goto retry_reserve;
     }

     BUG_ON(ret != 0);

     spin_lock(&bdev->fence_lock);
     ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
     spin_unlock(&bdev->fence_lock);

     if (unlikely(ret != 0)) {
         atomic_set(&bo->reserved, 0);
         wake_up_all(&bo->event_queue);
         spin_unlock(&glob->lru_lock);
         return ret;
     }

     put_count = ttm_bo_del_from_lru(bo);
     list_del_init(&bo->ddestroy);
     ++put_count;

     spin_unlock(&glob->lru_lock);
     ttm_bo_cleanup_memtype_use(bo);

     atomic_set(&bo->reserved, 0);
     wake_up_all(&bo->event_queue);
     ttm_bo_list_ref_sub(bo, put_count, true);

     return 0;
}


On 11/12/2012 03:00 PM, Maarten Lankhorst wrote:
> I changed the hierarchy to make fence_lock the most inner lock,
> instead of outer lock. This will simplify things slightly, and
> hopefully makes it easier to make fence_lock global at one point
> should it be needed.
>
> To make things clearer, I change the order around in ttm_bo_cleanup_refs
> and ttm_bo_cleanup_refs_or_queue.
>
> A reservation is taken first, then fence lock is taken and a wait is attempted.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>
> v2:
>   - fix conflict with upstream race fix, simplifies ttm_bo_cleanup_refs
> v3:
>   - change removal of fence_lock to making it a inner lock instead
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c           | 95 ++++++++++++++++------------------
>   drivers/gpu/drm/ttm/ttm_execbuf_util.c |  4 +-
>   2 files changed, 48 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index a3383a7..70285ff 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -478,28 +478,26 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
>   {
>   	struct ttm_bo_device *bdev = bo->bdev;
>   	struct ttm_bo_global *glob = bo->glob;
> -	struct ttm_bo_driver *driver;
> +	struct ttm_bo_driver *driver = bdev->driver;
>   	void *sync_obj = NULL;
>   	int put_count;
>   	int ret;
>   
> -	spin_lock(&bdev->fence_lock);
> -	(void) ttm_bo_wait(bo, false, false, true);
> -	if (!bo->sync_obj) {
> -
> -		spin_lock(&glob->lru_lock);
> -
> -		/**
> -		 * Lock inversion between bo:reserve and bdev::fence_lock here,
> -		 * but that's OK, since we're only trylocking.
> -		 */
> -
> -		ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> +	spin_lock(&glob->lru_lock);
> +	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> +	if (!ret) {
> +		spin_lock(&bdev->fence_lock);
> +		ret = ttm_bo_wait(bo, false, false, true);
>   
> -		if (unlikely(ret == -EBUSY))
> +		if (unlikely(ret == -EBUSY)) {
> +			sync_obj = driver->sync_obj_ref(bo->sync_obj);
> +			spin_unlock(&bdev->fence_lock);
> +			atomic_set(&bo->reserved, 0);
> +			wake_up_all(&bo->event_queue);
>   			goto queue;
> -
> +		}
>   		spin_unlock(&bdev->fence_lock);
> +
>   		put_count = ttm_bo_del_from_lru(bo);
>   
>   		atomic_set(&bo->reserved, 0);
> @@ -509,18 +507,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
>   		ttm_bo_list_ref_sub(bo, put_count, true);
>   
>   		return;
> -	} else {
> -		spin_lock(&glob->lru_lock);
>   	}
>   queue:
> -	driver = bdev->driver;
> -	if (bo->sync_obj)
> -		sync_obj = driver->sync_obj_ref(bo->sync_obj);
> -
>   	kref_get(&bo->list_kref);
>   	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
>   	spin_unlock(&glob->lru_lock);
> -	spin_unlock(&bdev->fence_lock);
>   
>   	if (sync_obj) {
>   		driver->sync_obj_flush(sync_obj);
> @@ -546,54 +537,60 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
>   			       bool no_wait_gpu)
>   {
>   	struct ttm_bo_device *bdev = bo->bdev;
> +	struct ttm_bo_driver *driver = bdev->driver;
>   	struct ttm_bo_global *glob = bo->glob;
>   	int put_count;
>   	int ret = 0;
> +	void *sync_obj;
>   
>   retry:
> -	spin_lock(&bdev->fence_lock);
> -	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> -	spin_unlock(&bdev->fence_lock);
> +	spin_lock(&glob->lru_lock);
>   
> -	if (unlikely(ret != 0))
> -		return ret;
> +	ret = ttm_bo_reserve_locked(bo, interruptible,
> +				    no_wait_reserve, false, 0);
>   
> -retry_reserve:
> -	spin_lock(&glob->lru_lock);
> +	if (unlikely(ret)) {
> +		spin_unlock(&glob->lru_lock);
> +		return ret;
> +	}
>   
>   	if (unlikely(list_empty(&bo->ddestroy))) {
> +		atomic_set(&bo->reserved, 0);
> +		wake_up_all(&bo->event_queue);
>   		spin_unlock(&glob->lru_lock);
>   		return 0;
>   	}
>   
> -	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> -
> -	if (unlikely(ret == -EBUSY)) {
> -		spin_unlock(&glob->lru_lock);
> -		if (likely(!no_wait_reserve))
> -			ret = ttm_bo_wait_unreserved(bo, interruptible);
> -		if (unlikely(ret != 0))
> +	spin_lock(&bdev->fence_lock);
> +	ret = ttm_bo_wait(bo, false, false, true);
> +	if (ret) {
> +		if (no_wait_gpu) {
> +			spin_unlock(&bdev->fence_lock);
> +			atomic_set(&bo->reserved, 0);
> +			wake_up_all(&bo->event_queue);
> +			spin_unlock(&glob->lru_lock);
>   			return ret;
> +		}
>   
> -		goto retry_reserve;
> -	}
> -
> -	BUG_ON(ret != 0);
> -
> -	/**
> -	 * We can re-check for sync object without taking
> -	 * the bo::lock since setting the sync object requires
> -	 * also bo::reserved. A busy object at this point may
> -	 * be caused by another thread recently starting an accelerated
> -	 * eviction.
> -	 */
> +		/**
> +		 * Take a reference to the fence and unreserve, if the wait
> +		 * was succesful and no new sync_obj was attached,
> +		 * ttm_bo_wait in retry will return ret = 0, and end the loop.
> +		 */
>   
> -	if (unlikely(bo->sync_obj)) {
> +		sync_obj = driver->sync_obj_ref(&bo->sync_obj);
> +		spin_unlock(&bdev->fence_lock);
>   		atomic_set(&bo->reserved, 0);
>   		wake_up_all(&bo->event_queue);
>   		spin_unlock(&glob->lru_lock);
> +
> +		ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible);
> +		driver->sync_obj_unref(&sync_obj);
> +		if (ret)
> +			return ret;
>   		goto retry;
>   	}
> +	spin_unlock(&bdev->fence_lock);
>   
>   	put_count = ttm_bo_del_from_lru(bo);
>   	list_del_init(&bo->ddestroy);
> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> index 1986d00..cd9e452 100644
> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> @@ -213,8 +213,8 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>   	driver = bdev->driver;
>   	glob = bo->glob;
>   
> -	spin_lock(&bdev->fence_lock);
>   	spin_lock(&glob->lru_lock);
> +	spin_lock(&bdev->fence_lock);
>   
>   	list_for_each_entry(entry, list, head) {
>   		bo = entry->bo;
> @@ -223,8 +223,8 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>   		ttm_bo_unreserve_locked(bo);
>   		entry->reserved = false;
>   	}
> -	spin_unlock(&glob->lru_lock);
>   	spin_unlock(&bdev->fence_lock);
> +	spin_unlock(&glob->lru_lock);
>   
>   	list_for_each_entry(entry, list, head) {
>   		if (entry->old_sync_obj)
Thomas Hellstrom Nov. 19, 2012, 3:04 p.m. UTC | #2
On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
> Hi,
>
> This patch looks mostly good, although I think ttm_bo_cleanup_refs 
> becomes overly complicated:
> Could this do, or am I missing something?
>

Actually, my version is bad, because ttm_bo_wait() is called with the 
lru lock held.

/Thomas


Maarten Lankhorst Nov. 19, 2012, 3:33 p.m. UTC | #3
Op 19-11-12 16:04, Thomas Hellstrom schreef:
> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>> Hi,
>>
>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>> Could this do, or am I missing something?
>>
>
> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>
> /Thomas
Oh, digging through it made me remember why I had to release the reservation early and
had to allow move_notify to be called without a reservation.

Fortunately move_notify gets a NULL parameter in that case, which is the only time that
happens, so you can still do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
move_notify handler.

Patch 05/10 removed the loop and assumed no new fence could be attached after the driver
has declared the bo dead.

However, at that point it may no longer hold a reservation to confirm this, which is why
I moved the cleanup to the release_list handler. It could still be done in
ttm_bo_release, but we no longer have a reservation after we waited. Getting
a reservation can fail if the bo is imported, for example.

While it is true that in that case a new fence may be attached as well, that
would be less harmful since that operation wouldn't involve this device, so the
ttm bo can still be removed. When that time comes I should probably
fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)

I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
itself. If that is too paranoid, those WARN_ONs could be dropped; I prefer to leave them
in for a kernel release or two. But according to the rules, that would be the only time you
could attach a new fence and trigger the WARN_ON for now.
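
A minimal sketch of the check described above, as it might sit at the start of
ttm_bo_reserve() and ttm_eu_reserve_buffers() (the helper name is made up for
illustration; the actual placement in the series may differ):

/*
 * Sketch only: a bo whose last reference is gone must not be reserved
 * to queue new work on the device, so warn if the refcount is zero.
 */
static inline void ttm_bo_warn_if_dead(struct ttm_buffer_object *bo)
{
	WARN_ON(!atomic_read(&bo->kref.refcount));
}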

~Maarten
Thomas Hellström (VMware) Nov. 20, 2012, 7:48 a.m. UTC | #4
On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>> Hi,
>>>
>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>> Could this do, or am I missing something?
>>>
>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>
>> /Thomas
> Oh digging through it made me remember why I had to release the reservation early and
> had to allow move_notify to be called without reservation.
>
> Fortunately move_notify has a NULL parameter, which is the only time that happens,
> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
> move_notify handler.
>
> 05/10 removed the loop and assumed no new fence could be attached after the driver has
> declared the bo dead.
>
> However, at that point it may no longer hold a reservation to confirm this, that's why
> I moved the cleanup to be done in the release_list handler. It could still be done in
> ttm_bo_release, but we no longer have a reservation after we waited. Getting
> a reservation can fail if the bo is imported for example.
>
> While it would be true that in that case a new fence may be attached as well, that
> would be less harmful since that operation wouldn't involve this device, so the
> ttm bo can still be removed in that case. When that time comes I should probably
> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>
> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
> in for a kernel release or 2. But according to the rules that would be the only time you
> could attach a new fence and trigger the WARN_ON for now..

Hmm, I'd appreciate it if you could group patches with functional changes
that depend on each other together, along with a "this is done because ..."
explanation, which makes it much easier to review (and to follow the commit
history in case something goes terribly wrong and we need to revert).

Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot
any culprits.

In general, as long as a bo is on an LRU list, we must be able to attach
fences because of accelerated eviction.

/Thomas

> ~Maarten
>
Maarten Lankhorst Nov. 20, 2012, 11:33 a.m. UTC | #5
Op 20-11-12 08:48, Thomas Hellstrom schreef:
> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>> Hi,
>>>>
>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>> Could this do, or am I missing something?
>>>>
>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>
>>> /Thomas
>> Oh digging through it made me remember why I had to release the reservation early and
>> had to allow move_notify to be called without reservation.
>>
>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>> move_notify handler.
>>
>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>> declared the bo dead.
>>
>> However, at that point it may no longer hold a reservation to confirm this, that's why
>> I moved the cleanup to be done in the release_list handler. It could still be done in
>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>> a reservation can fail if the bo is imported for example.
>>
>> While it would be true that in that case a new fence may be attached as well, that
>> would be less harmful since that operation wouldn't involve this device, so the
>> ttm bo can still be removed in that case. When that time comes I should probably
>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>
>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>> in for a kernel release or 2. But according to the rules that would be the only time you
>> could attach a new fence and trigger the WARN_ON for now..
>
> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
> something goes terribly wrong and we need to revert).
>
> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>
> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
I thought it was deliberately designed in such a way that it is kept on the lru list,
but since it's also on the ddestroy list it won't start accelerated eviction:
the eviction path branches into cleanup_refs early, and lru_lock still protects all the list entries.

Of course any previous acceleration may still be in flight, but since we take a reservation before waiting,
we're already sure that any previous acceleration command has finished fencing, and no new one can
start since the bo appears on the ddestroy list, which would force it to perform the same wait.

The wait is legal, and no new fences can be attached.

I do agree all those patches probably need a lot longer commit messages to explain this, though. :-)

~Maarten
Maarten Lankhorst Nov. 20, 2012, 11:59 a.m. UTC | #6
Op 20-11-12 12:33, Maarten Lankhorst schreef:
> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>> Hi,
>>>>>
>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>> Could this do, or am I missing something?
>>>>>
>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>
>>>> /Thomas
>>> Oh digging through it made me remember why I had to release the reservation early and
>>> had to allow move_notify to be called without reservation.
>>>
>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>> move_notify handler.
>>>
>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>> declared the bo dead.
>>>
>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>> a reservation can fail if the bo is imported for example.
>>>
>>> While it would be true that in that case a new fence may be attached as well, that
>>> would be less harmful since that operation wouldn't involve this device, so the
>>> ttm bo can still be removed in that case. When that time comes I should probably
>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>
>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>> could attach a new fence and trigger the WARN_ON for now..
>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>> something goes terribly wrong and we need to revert).
>>
>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>
>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
> I thought it was deliberately designed in such a way that it was kept on the lru list,
> but since it's also on the ddestroy list it won't start accelerated eviction,
> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>
> Of course any previous acceleration may still happen, but since we take a reservation first before waiting,
> we're already sure that any previous acceleration command has finished fencing, and no new one can
> start since it appears on the ddestroy list which would force it to perform the same wait.
>
> The wait is legal, and no new fences can be attached.
>
> I do agree all those patches probably needs a lot longer commit message to explain it though. :-)
>
Or maybe an alternative patch...

We could move the checks. There are only two places that are allowed to hold
reservations at that point, right?

ttm_bo_swapout and evict_mem_first.

If cleanup_refs_or_queue fails because the reservation fails, that must mean the bo is reserved in one of those two places.
If it succeeds, we can remove the bo from the lru and swap lists, and if the wait fails, move it to the ddestroy list.

The unreserve in swapout doesn't add it back to any lists, so no special handling is needed there.
The unreserve in evict_mem_first does, but we could take the lock before unreserving, and only
re-add the bo to the swap/lru lists when it's not on ddestroy.

That way we wouldn't need to call ttm_bo_cleanup_refs from multiple places,
and the cleanup would only ever need to be done in ttm_bo_delayed_delete, without races.

I thought it was a feature that the bo still appeared on the lru list after death, so evict_mem_first could
wait on it, but if it's an annoyance it could easily be fixed like that.

But even if it's a feature to be preserved, evict_mem_first and swapout could be modified to check
the ddestroy list first for buffers to destroy. In that case those functions would explicitly prefer waiting for
destruction of bos before queueing new work to swap out or evict bos; a rough sketch of that idea follows.
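
A rough sketch of that idea (the helper is hypothetical and not part of this
series; it reuses ttm_bo_cleanup_refs and the list/kref helpers that appear
elsewhere in this thread, and mirrors the pattern of ttm_bo_delayed_delete):

/* Sketch only: drain the ddestroy list before queueing new eviction work. */
static int ttm_bo_prefer_delayed_destroy(struct ttm_bo_device *bdev)
{
	struct ttm_bo_global *glob = bdev->glob;
	struct ttm_buffer_object *bo;
	int ret = 0;

	spin_lock(&glob->lru_lock);
	while (!ret && !list_empty(&bdev->ddestroy)) {
		bo = list_first_entry(&bdev->ddestroy,
				      struct ttm_buffer_object, ddestroy);
		kref_get(&bo->list_kref);
		spin_unlock(&glob->lru_lock);

		/* not interruptible, wait for reserve and for the GPU */
		ret = ttm_bo_cleanup_refs(bo, false, false, false);

		kref_put(&bo->list_kref, ttm_bo_release_list);
		spin_lock(&glob->lru_lock);
	}
	spin_unlock(&glob->lru_lock);

	return ret;
}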

~Maarten
Thomas Hellstrom Nov. 20, 2012, 12:03 p.m. UTC | #7
On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>> Hi,
>>>>>
>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>> Could this do, or am I missing something?
>>>>>
>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>
>>>> /Thomas
>>> Oh digging through it made me remember why I had to release the reservation early and
>>> had to allow move_notify to be called without reservation.
>>>
>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>> move_notify handler.
>>>
>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>> declared the bo dead.
>>>
>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>> a reservation can fail if the bo is imported for example.
>>>
>>> While it would be true that in that case a new fence may be attached as well, that
>>> would be less harmful since that operation wouldn't involve this device, so the
>>> ttm bo can still be removed in that case. When that time comes I should probably
>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>
>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>> could attach a new fence and trigger the WARN_ON for now..
>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>> something goes terribly wrong and we need to revert).
>>
>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>
>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
> I thought it was deliberately designed in such a way that it was kept on the lru list,
> but since it's also on the ddestroy list it won't start accelerated eviction,
> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
I used bad wording. I meant that unbinding might be accelerated, but we currently
(quite inefficiently) do synchronized unbinding, assuming that only the CPU can do
that. When we start to support unsynchronized moves, we need to be able to attach
fences at least at the last move_notify(bo, NULL);

/Thomas
Maarten Lankhorst Nov. 20, 2012, 1:13 p.m. UTC | #8
Op 20-11-12 13:03, Thomas Hellstrom schreef:
> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>> Hi,
>>>>>>
>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>> Could this do, or am I missing something?
>>>>>>
>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>
>>>>> /Thomas
>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>> had to allow move_notify to be called without reservation.
>>>>
>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>> move_notify handler.
>>>>
>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>> declared the bo dead.
>>>>
>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>> a reservation can fail if the bo is imported for example.
>>>>
>>>> While it would be true that in that case a new fence may be attached as well, that
>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>
>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>> could attach a new fence and trigger the WARN_ON for now..
>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>> something goes terribly wrong and we need to revert).
>>>
>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>
>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>> but since it's also on the ddestroy list it won't start accelerated eviction,
>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
Would you, in that case, need to wait for fence_wait to complete before calling move_notify?

If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
called by one thread before checking the fence pointer and performing the wait. At that point you still hold the
lru_lock though, so it shouldn't be too hard to make something safe.

~Maarten
Thomas Hellstrom Nov. 20, 2012, 3:08 p.m. UTC | #9
On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>> Could this do, or am I missing something?
>>>>>>>
>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>
>>>>>> /Thomas
>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>> had to allow move_notify to be called without reservation.
>>>>>
>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>> move_notify handler.
>>>>>
>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>> declared the bo dead.
>>>>>
>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>> a reservation can fail if the bo is imported for example.
>>>>>
>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>
>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>> something goes terribly wrong and we need to revert).
>>>>
>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>
>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>
> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
> lru_lock though, so it shouldn't be too hard to make something safe.

I think a driver that wants to implement asynchronous moves typically doesn't
want to wait before calling move_notify, but may wait in move_notify or move.
Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL),
attach a fence and then use the normal delayed destroy to wait on that fence
before destroying the buffer.

Otherwise, since binds / unbinds are handled in the GPU command stream,
there's never any need to wait for moves except when there's a CPU
access.

/Thomas


>
> ~Maarten
>
Maarten Lankhorst Nov. 21, 2012, 11:38 a.m. UTC | #10
Hey,

Op 20-11-12 16:08, Thomas Hellstrom schreef:
> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>> Could this do, or am I missing something?
>>>>>>>>
>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>
>>>>>>> /Thomas
>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>> had to allow move_notify to be called without reservation.
>>>>>>
>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>> move_notify handler.
>>>>>>
>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>> declared the bo dead.
>>>>>>
>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>
>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>
>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>> something goes terribly wrong and we need to revert).
>>>>>
>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>
>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>
>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>> lru_lock though, so it shouldn't be too hard to make something safe.
>
> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>
> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
> access.
Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
But I guess it should be possible to attach that as work to the fence when it signals, and I
may already want to do something like that for performance reasons in a different place,
so I guess it doesn't matter.

Is calling move_notify(bo, NULL) legal, and a no-op the second time? That would save a flag in the bo to check whether
it has already been called, although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise (see the sketch below).

move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
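
A sketch of the flag-based guard mentioned above (both the flag and the helper
are made up for illustration; they are not existing TTM definitions), so that a
second move_notify(bo, NULL) becomes a no-op:

/* Hypothetical private-flag bit, chosen not to clash with existing bits. */
#define TTM_BO_PRIV_FLAG_DEAD_NOTIFIED 4

/* Sketch only: issue the final move_notify(bo, NULL) at most once. */
static void ttm_bo_notify_dead_once(struct ttm_buffer_object *bo)
{
	struct ttm_bo_device *bdev = bo->bdev;

	if (test_and_set_bit(TTM_BO_PRIV_FLAG_DEAD_NOTIFIED, &bo->priv_flags))
		return;		/* already notified */

	if (bdev->driver->move_notify)
		bdev->driver->move_notify(bo, NULL);
}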

~Maarten
Thomas Hellstrom Nov. 21, 2012, 12:42 p.m. UTC | #11
On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
> Hey,
>
> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>
>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>
>>>>>>>> /Thomas
>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>
>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>> move_notify handler.
>>>>>>>
>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>> declared the bo dead.
>>>>>>>
>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>
>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>
>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>> something goes terribly wrong and we need to revert).
>>>>>>
>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>
>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>
>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>> lru_lock though, so it shouldn't be too hard to make something safe.
>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>
>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>> access.
> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
> may want to do something like that already for performance reasons in a different place,
> so I guess it doesn't matter.

Actions to be performed on fence signaling tend to be very CPU-consuming, I think
due to the context switches involved. We had to replace that in the old psb driver
and batch things like TTM does instead.

Also remember that TTM fences are not required to signal in finite time
unless fence_flush is called.

I think nouveau doesn't use fence IRQs to signal its fences.

>
> Is calling move_notify(bo, NULL) legal and a noop the second time?

I see no fundamental reason why it shouldn't be OK, although we might 
need to patch drivers to cope with it.

>   That would save a flag in the bo to check if it's called already,
> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>
> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.

I don't think that's a good idea. Drivers sleeping in move_notify will 
need to release the spinlock, and that means it's
better to release it before move_notify is called.

/Thomas


>
> ~Maarten
>
Maarten Lankhorst Nov. 21, 2012, 1:12 p.m. UTC | #12
Op 21-11-12 13:42, Thomas Hellstrom schreef:
> On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
>> Hey,
>>
>> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>>
>>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>>
>>>>>>>>> /Thomas
>>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>>
>>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>>> move_notify handler.
>>>>>>>>
>>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>>> declared the bo dead.
>>>>>>>>
>>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>>
>>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>>
>>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>>> something goes terribly wrong and we need to revert).
>>>>>>>
>>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>>
>>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>>
>>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>>> lru_lock though, so it shouldn't be too hard to make something safe.
>>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>>
>>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>>> access.
>> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
>> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
>> may want to do something like that already for performance reasons in a different place,
>> so I guess it doesn't matter.
>
> Actions to be performed on fence signaling tend to be very cpu consuming, I think due to the context switches involved.
> We had to replace that in the old psb driver and batch things like TTM does instead.
>
> Also remember that TTM fences are not required to signal in finite time unless fence_flush is called.
>
> I think nouveau doesn't use fence irqs to signal its fences.
>
>>
>> Is calling move_notify(bo, NULL) legal and a noop the second time?
>
> I see no fundamental reason why it shouldn't be OK, although we might need to patch drivers to cope with it.
>
>>   That would save a flag in the bo to check if it's called already,
>> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>>
>> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
>
> I don't think that's a good idea. Drivers sleeping in move_notify will need to release the spinlock, and that means it's
> better to release it before move_notify is called.
Is the only sleeping being done on fences? In that case we might wish to split it up in 2 pieces for destruction,
the piece that runs immediately, and a piece to run after the new fence has signaled (current behavior).

Nouveau needs the final move_notify unmap to be called after the object is idle, like it is now. It doesn't need
to attach a new fence.
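
As a minimal sketch (not from this patch set), a driver-side move_notify could tolerate a second
move_notify(bo, NULL) with a private flag; example_bo, its unbound field and the example_vm_*
helpers below are invented names, and the reservation check mirrors the BUG_ON suggested earlier
in the thread:

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>

struct example_bo {
	struct ttm_buffer_object base;
	bool unbound;		/* set once the final unmap has run */
};

static void example_vm_map(struct example_bo *ebo, struct ttm_mem_reg *mem);
static void example_vm_unmap(struct example_bo *ebo);

static void example_bo_move_notify(struct ttm_buffer_object *bo,
				   struct ttm_mem_reg *new_mem)
{
	struct example_bo *ebo = container_of(bo, struct example_bo, base);

	if (new_mem == NULL) {
		/* Destruction path: tolerate being called a second time. */
		if (ebo->unbound)
			return;
		example_vm_unmap(ebo);
		ebo->unbound = true;
		return;
	}

	/* Regular moves are still done with the reservation held. */
	BUG_ON(!atomic_read(&bo->reserved));
	example_vm_map(ebo, new_mem);
}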

~Maarten
Thomas Hellstrom Nov. 21, 2012, 1:27 p.m. UTC | #13
On 11/21/2012 02:12 PM, Maarten Lankhorst wrote:
> Op 21-11-12 13:42, Thomas Hellstrom schreef:
>> On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
>>> Hey,
>>>
>>> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>>>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>>>> Hi,
>>>>>>>>>>>
>>>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>>>
>>>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>>>
>>>>>>>>>> /Thomas
>>>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>>>
>>>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>>>> move_notify handler.
>>>>>>>>>
>>>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>>>> declared the bo dead.
>>>>>>>>>
>>>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>>>
>>>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>>>
>>>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>>>> something goes terribly wrong and we need to revert).
>>>>>>>>
>>>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>>>
>>>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>>>
>>>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>>>> lru_lock though, so it shouldn't be too hard to make something safe.
>>>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>>>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>>>
>>>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>>>> access.
>>> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
>>> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
>>> may want to do something like that already for performance reasons in a different place,
>>> so I guess it doesn't matter.
>> Actions to be performed on fence signaling tend to be very cpu consuming, I think due to the context switches involved.
>> We had to replace that in the old psb driver and batch things like TTM does instead.
>>
>> Also remember that TTM fences are not required to signal in finite time unless fence_flush is called.
>>
>> I think nouveau doesn't use fence irqs to signal its fences.
>>
>>> Is calling move_notify(bo, NULL) legal and a noop the second time?
>> I see no fundamental reason why it shouldn't be OK, although we might need to patch drivers to cope with it.
>>
>>>    That would save a flag in the bo to check if it's called already,
>>> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>>>
>>> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
>> I don't think that's a good idea. Drivers sleeping in move_notify will need to release the spinlock, and that means it's
>> better to release it before move_notify is called.
> Is the only sleeping being done on fences? In that case we might wish to split it up in 2 pieces for destruction,
> the piece that runs immediately, and a piece to run after the new fence has signaled (current behavior).
>
> Nouveau needs the final move_notify unmap to be called after object is idle, like it is now. It doesn't need
> to attach a new fence.

In that case it might be best to worry about asynchronous stuff later?
We will eventually implement it on the new vmwgfx hardware revision, but 
it's not ready yet.

/Thomas



> ~Maarten
>
Maarten Lankhorst Nov. 22, 2012, 3:51 p.m. UTC | #14
Op 21-11-12 14:27, Thomas Hellstrom schreef:
> On 11/21/2012 02:12 PM, Maarten Lankhorst wrote:
>> Op 21-11-12 13:42, Thomas Hellstrom schreef:
>>> On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
>>>> Hey,
>>>>
>>>> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>>>>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>>>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>>>>> Hi,
>>>>>>>>>>>>
>>>>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>>>>
>>>>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>>>>
>>>>>>>>>>> /Thomas
>>>>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>>>>
>>>>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>>>>> move_notify handler.
>>>>>>>>>>
>>>>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>>>>> declared the bo dead.
>>>>>>>>>>
>>>>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>>>>
>>>>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>>>>
>>>>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>>>>> something goes terribly wrong and we need to revert).
>>>>>>>>>
>>>>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>>>>
>>>>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>>>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>>>>
>>>>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>>>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>>>>> lru_lock though, so it shouldn't be too hard to make something safe.
>>>>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>>>>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>>>>
>>>>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>>>>> access.
>>>> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
>>>> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
>>>> may want to do something like that already for performance reasons in a different place,
>>>> so I guess it doesn't matter.
>>> Actions to be performed on fence signaling tend to be very cpu consuming, I think due to the context switches involved.
>>> We had to replace that in the old psb driver and batch things like TTM does instead.
>>>
>>> Also remember that TTM fences are not required to signal in finite time unless fence_flush is called.
>>>
>>> I think nouveau doesn't use fence irqs to signal its fences.
>>>
>>>> Is calling move_notify(bo, NULL) legal and a noop the second time?
>>> I see no fundamental reason why it shouldn't be OK, although we might need to patch drivers to cope with it.
>>>
>>>>    That would save a flag in the bo to check if it's called already,
>>>> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>>>>
>>>> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
>>> I don't think that's a good idea. Drivers sleeping in move_notify will need to release the spinlock, and that means it's
>>> better to release it before move_notify is called.
>> Is the only sleeping being done on fences? In that case we might wish to split it up in 2 pieces for destruction,
>> the piece that runs immediately, and a piece to run after the new fence has signaled (current behavior).
>>
>> Nouveau needs the final move_notify unmap to be called after object is idle, like it is now. It doesn't need
>> to attach a new fence.
>
> In that case it might be best to worry about asynchronous stuff later?
> We will eventually implement it on the new vmwgfx hardware revision, but it's not ready yet.
>
> /Thomas
Ok sounds good.

In that case what do you want me to change from the first 4 patches apart from more verbose commit messages?
- 03/10 I got that I need to re-add the list_empty check after -EBUSY was returned in evict_mem_first.

Also PATCH 05/10 cleans up the spinning in ttm_bo_cleanup_refs, so I hope it's ok that it's a bit
ugly in 04/10, as long as it doesn't result in any new bugs being introduced.

~Maarten

PS: I did a plain rebase of my git tree to deal with the conflicts in drm-next.
Thomas Hellstrom Nov. 22, 2012, 8:29 p.m. UTC | #15
On 11/22/2012 04:51 PM, Maarten Lankhorst wrote:
> Op 21-11-12 14:27, Thomas Hellstrom schreef:
>> On 11/21/2012 02:12 PM, Maarten Lankhorst wrote:
>>> Op 21-11-12 13:42, Thomas Hellstrom schreef:
>>>> On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
>>>>> Hey,
>>>>>
>>>>> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>>>>>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>>>>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>>>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>>>>>> Hi,
>>>>>>>>>>>>>
>>>>>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>>>>>
>>>>>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>>>>>
>>>>>>>>>>>> /Thomas
>>>>>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>>>>>
>>>>>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>>>>>> move_notify handler.
>>>>>>>>>>>
>>>>>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>>>>>> declared the bo dead.
>>>>>>>>>>>
>>>>>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>>>>>
>>>>>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>>>>>
>>>>>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>>>>>> something goes terribly wrong and we need to revert).
>>>>>>>>>>
>>>>>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>>>>>
>>>>>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>>>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>>>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>>>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>>>>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>>>>>
>>>>>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>>>>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>>>>>> lru_lock though, so it shouldn't be too hard to make something safe.
>>>>>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>>>>>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>>>>>
>>>>>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>>>>>> access.
>>>>> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
>>>>> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
>>>>> may want to do something like that already for performance reasons in a different place,
>>>>> so I guess it doesn't matter.
>>>> Actions to be performed on fence signaling tend to be very cpu consuming, I think due to the context switches involved.
>>>> We had to replace that in the old psb driver and batch things like TTM does instead.
>>>>
>>>> Also remember that TTM fences are not required to signal in finite time unless fence_flush is called.
>>>>
>>>> I think nouveau doesn't use fence irqs to signal its fences.
>>>>
>>>>> Is calling move_notify(bo, NULL) legal and a noop the second time?
>>>> I see no fundamental reason why it shouldn't be OK, although we might need to patch drivers to cope with it.
>>>>
>>>>>     That would save a flag in the bo to check if it's called already,
>>>>> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>>>>>
>>>>> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
>>>> I don't think that's a good idea. Drivers sleeping in move_notify will need to release the spinlock, and that means it's
>>>> better to release it before move_notify is called.
>>> Is the only sleeping being done on fences? In that case we might wish to split it up in 2 pieces for destruction,
>>> the piece that runs immediately, and a piece to run after the new fence has signaled (current behavior).
>>>
>>> Nouveau needs the final move_notify unmap to be called after object is idle, like it is now. It doesn't need
>>> to attach a new fence.
>> In that case it might be best to worry about asynchronous stuff later?
>> We will eventually implement it on the new vmwgfx hardware revision, but it's not ready yet.
>>
>> /Thomas
> Ok sounds good.
>
> In that case what do you want me to change from the first 4 patches apart from more verbose commit messages?
> - 03/10 I got that I need to re-add the list_empty check after -EBUSY was returned in evict_mem_first.
>
> Also PATCH 05/10 cleans up the spinning in ttm_bo_cleanup_refs, so I hope it's ok that it's a big
> ugly in 04/10, as long as it doesn't result in any new bugs being introduced.
>
> ~Maarten
>
> PS: I did a plain rebase of my git tree to deal with the conflicts in drm-next.
>

Maarten, it seems to me the purposes of the patches are the following 
(not necessarily in the correct order).

1) Change fence lock locking order w.r.t. the LRU lock (sketched below) - should be 
a trivial and very small change.
2) Change reservations from lists to always be trylock, skipping already 
reserved bos.
3) Remove the lru lock around reservations.
4) Various optimizations / cleanups.
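
A rough sketch of what item 1 amounts to (mirroring the ttm_eu_fence_buffer_objects hunk
in the patch below; no new API is assumed, example_lock_nesting is just an invented name):
fence_lock becomes the innermost lock, so any path that needs both locks now takes the
lru_lock first.

#include <drm/ttm/ttm_bo_driver.h>

static void example_lock_nesting(struct ttm_bo_device *bdev,
				 struct ttm_bo_global *glob)
{
	/*
	 * Old nesting: fence_lock outside, lru_lock inside.
	 * New nesting: lru_lock outside, fence_lock inside.
	 */
	spin_lock(&glob->lru_lock);
	spin_lock(&bdev->fence_lock);

	/* ... look at bo->sync_obj and/or the LRU lists here ... */

	spin_unlock(&bdev->fence_lock);
	spin_unlock(&glob->lru_lock);
}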

If you could reorganize and make 4 patch series like this, it would be 
much easier to follow what happens and why, and would make it much 
easier for me to review. It seems to me if patch series 1-3 focus on the 
intended changes and the intended changes only, they would be quite small?

/Thomas
Maarten Lankhorst Nov. 27, 2012, 12:35 p.m. UTC | #16
Op 22-11-12 21:29, Thomas Hellstrom schreef:
> On 11/22/2012 04:51 PM, Maarten Lankhorst wrote:
>> Op 21-11-12 14:27, Thomas Hellstrom schreef:
>>> On 11/21/2012 02:12 PM, Maarten Lankhorst wrote:
>>>> Op 21-11-12 13:42, Thomas Hellstrom schreef:
>>>>> On 11/21/2012 12:38 PM, Maarten Lankhorst wrote:
>>>>>> Hey,
>>>>>>
>>>>>> Op 20-11-12 16:08, Thomas Hellstrom schreef:
>>>>>>> On 11/20/2012 02:13 PM, Maarten Lankhorst wrote:
>>>>>>>> Op 20-11-12 13:03, Thomas Hellstrom schreef:
>>>>>>>>> On 11/20/2012 12:33 PM, Maarten Lankhorst wrote:
>>>>>>>>>> Op 20-11-12 08:48, Thomas Hellstrom schreef:
>>>>>>>>>>> On 11/19/2012 04:33 PM, Maarten Lankhorst wrote:
>>>>>>>>>>>> Op 19-11-12 16:04, Thomas Hellstrom schreef:
>>>>>>>>>>>>> On 11/19/2012 03:17 PM, Thomas Hellstrom wrote:
>>>>>>>>>>>>>> Hi,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This patch looks mostly good, although I think ttm_bo_cleanup_refs becomes overly complicated:
>>>>>>>>>>>>>> Could this do, or am I missing something?
>>>>>>>>>>>>>>
>>>>>>>>>>>>> Actually, my version is bad, because ttm_bo_wait() is called with the lru lock held.
>>>>>>>>>>>>>
>>>>>>>>>>>>> /Thomas
>>>>>>>>>>>> Oh digging through it made me remember why I had to release the reservation early and
>>>>>>>>>>>> had to allow move_notify to be called without reservation.
>>>>>>>>>>>>
>>>>>>>>>>>> Fortunately move_notify has a NULL parameter, which is the only time that happens,
>>>>>>>>>>>> so you can still check do BUG_ON(mem != NULL && !ttm_bo_reserved(bo)); in your
>>>>>>>>>>>> move_notify handler.
>>>>>>>>>>>>
>>>>>>>>>>>> 05/10 removed the loop and assumed no new fence could be attached after the driver has
>>>>>>>>>>>> declared the bo dead.
>>>>>>>>>>>>
>>>>>>>>>>>> However, at that point it may no longer hold a reservation to confirm this, that's why
>>>>>>>>>>>> I moved the cleanup to be done in the release_list handler. It could still be done in
>>>>>>>>>>>> ttm_bo_release, but we no longer have a reservation after we waited. Getting
>>>>>>>>>>>> a reservation can fail if the bo is imported for example.
>>>>>>>>>>>>
>>>>>>>>>>>> While it would be true that in that case a new fence may be attached as well, that
>>>>>>>>>>>> would be less harmful since that operation wouldn't involve this device, so the
>>>>>>>>>>>> ttm bo can still be removed in that case. When that time comes I should probably
>>>>>>>>>>>> fix up that WARN_ON(ret) in ttm_bo_cleanup_refs. :-)
>>>>>>>>>>>>
>>>>>>>>>>>> I did add a WARN_ON(!atomic_read(&bo->kref.refcount)); to
>>>>>>>>>>>> ttm_bo_reserve and ttm_eu_reserve_buffers to be sure nothing is done on the device
>>>>>>>>>>>> itself. If that is too paranoid, those WARN_ON's could be dropped. I prefer to leave them
>>>>>>>>>>>> in for a kernel release or 2. But according to the rules that would be the only time you
>>>>>>>>>>>> could attach a new fence and trigger the WARN_ON for now..
>>>>>>>>>>> Hmm, I'd appreciate if you could group patches with functional changes that depend on eachother togeteher,
>>>>>>>>>>> and "this is done because ...", which makes it much easier to review, (and to follow the commit history in case
>>>>>>>>>>> something goes terribly wrong and we need to revert).
>>>>>>>>>>>
>>>>>>>>>>> Meanwhile I'll take a look at the final ttm_bo.c and see if I can spot any culprits.
>>>>>>>>>>>
>>>>>>>>>>> In general, as long as a bo is on a LRU list, we must be able to attach fences because of accelerated eviction.
>>>>>>>>>> I thought it was deliberately designed in such a way that it was kept on the lru list,
>>>>>>>>>> but since it's also on the ddestroy list it won't start accelerated eviction,
>>>>>>>>>> since it branches into cleanup_refs early, and lru_lock still protects all the list entries.
>>>>>>>>> I used bad wording. I meant that unbinding might be accelerated, but  currently (quite inefficiently)
>>>>>>>>> do synchronized unbinding, assuming that only the CPU can do that. When we start to support
>>>>>>>>> unsynchronized moves, we need to be able to attach fences at least at the last move_notify(bo, NULL);
>>>>>>>> Would you need to wait in that case on fence_wait being completed before calling move_notify?
>>>>>>>>
>>>>>>>> If not, you would still only need to perform one wait, but you'd have to make sure move_notify only gets
>>>>>>>> called by 1 thread before checking the fence pointer and performing a wait. At that point you still hold the
>>>>>>>> lru_lock though, so it shouldn't be too hard to make something safe.
>>>>>>> I think typically a driver that wants to implement asynchronous moves don't want to wait before calling
>>>>>>> move_notify, but may wait in move_notify or move. Typically (upcoming vmwgfx) it would invalidate the buffer in move_notify(bo, NULL), attach a fence and then use the normal delayed destroy to wait on that fence before destroying the buffer.
>>>>>>>
>>>>>>> Otherwise, since binds / unbinds are handled in the GPU command stream there's never any need to wait for moves except when there's a CPU
>>>>>>> access.
>>>>>> Well, nouveau actually needs fence_wait to finish first, since vm changes are out of band.
>>>>>> But I guess it should be possible to attach it as work to the fence when it's signaled, and I
>>>>>> may want to do something like that already for performance reasons in a different place,
>>>>>> so I guess it doesn't matter.
>>>>> Actions to be performed on fence signaling tend to be very cpu consuming, I think due to the context switches involved.
>>>>> We had to replace that in the old psb driver and batch things like TTM does instead.
>>>>>
>>>>> Also remember that TTM fences are not required to signal in finite time unless fence_flush is called.
>>>>>
>>>>> I think nouveau doesn't use fence irqs to signal its fences.
>>>>>
>>>>>> Is calling move_notify(bo, NULL) legal and a noop the second time?
>>>>> I see no fundamental reason why it shouldn't be OK, although we might need to patch drivers to cope with it.
>>>>>
>>>>>>     That would save a flag in the bo to check if it's called already,
>>>>>> although I suppose we could always define a TTM_BO_PRIV_FLAG_* for it otherwise.
>>>>>>
>>>>>> move_notify might end up being called with the lru_lock held, but that shouldn't be a problem.
>>>>> I don't think that's a good idea. Drivers sleeping in move_notify will need to release the spinlock, and that means it's
>>>>> better to release it before move_notify is called.
>>>> Is the only sleeping being done on fences? In that case we might wish to split it up in 2 pieces for destruction,
>>>> the piece that runs immediately, and a piece to run after the new fence has signaled (current behavior).
>>>>
>>>> Nouveau needs the final move_notify unmap to be called after object is idle, like it is now. It doesn't need
>>>> to attach a new fence.
>>> In that case it might be best to worry about asynchronous stuff later?
>>> We will eventually implement it on the new vmwgfx hardware revision, but it's not ready yet.
>>>
>>> /Thomas
>> Ok sounds good.
>>
>> In that case what do you want me to change from the first 4 patches apart from more verbose commit messages?
>> - 03/10 I got that I need to re-add the list_empty check after -EBUSY was returned in evict_mem_first.
>>
>> Also PATCH 05/10 cleans up the spinning in ttm_bo_cleanup_refs, so I hope it's ok that it's a big
>> ugly in 04/10, as long as it doesn't result in any new bugs being introduced.
>>
>> ~Maarten
>>
>> PS: I did a plain rebase of my git tree to deal with the conflicts in drm-next.
>>
>
> Maarten, it seems to me the purpose of the patches are the following (not necessarily the correct order).
>
> 1) Change fence lock locking order w r t LRU lock - should be a trivial and very small change.
Hm yeah, this seems to be small in itself if I only do that.

> 2) Change reservations from lists to always be trylock, skipping already reserved bos.
Yeah, but unfortunately this was easier to do after some of the cleanups.

> 3) Remove the lru lock around reservations.
This is a separate patch, but unfortunately dependent on all previous optimizations/cleanups.
> 4) Various optimizations / cleanups.
This was a bit harder; some of the changes are a lot easier with the cleanups/optimizations done first.

I need the cleanup_refs changes before the reservation trylock change, since they cause cleanup_refs to be
called with the reservation and lru lock held; this prevents that function from ever blocking on anything
other than the wait.

The real reason those patches are in this order is that some patches can only be done after some
previous changes have been made first. However, I can decrease the number of changes slightly;
I was thinking of this:

0. change cleanup_refs_or_queue order of reservation and wait check
1. fence_lock <-> lru_lock nesting change, needed for patch 3
- small patch if I only focus on the inversion itself, but I fear there is no race-free way to do this
without squashing it with patch 0. Otherwise there would be a race where we waited on the
previous fence and a new fence was attached between unlocking the fence lock and locking the lru lock.
Both patches only touch ttm_bo_cleanup_refs_or_queue (and 2 lines in ttm_eu_fence_buffer_objects),
so it won't affect reviewability much if it's done in 1 patch only.

2. fix radeon move_notify to be callable without reservation, needed for patch 3
- small patch

3. call ttm_bo_cleanup_refs with the reservation and lru lock held, drop the looping in ttm_bo_cleanup_refs; makes patches 4 and 5 easier
- instead of touching this function multiple times, just change it to its final, cleaner form first and be done with it.

4. loop the trylocking in swapout
5. loop the trylocking in ttm_mem_evict_first (a rough sketch follows after this list)
6. drop now unused no_wait_reserve argument from ttm_mem_evict_first
- separate patch since it's just touching a lot of functions without any functional change

Bonus series; these are probably independent patches, but they might depend on the above being applied first:
7. clean up ttm_bo_force_list_clean's taking the lru lock twice for every bo
- depends loosely on the no_wait_reserve argument being dropped to apply cleanly, no real dependency otherwise
8. lru lock is now no longer needed to protect reservations, cleanup
- depends on the whole series
9. replace calls to ttm_bo_wait_unreserved with ttm_bo_reserve_slowpath, which maps a lot better to mutexes
- Probably best if I split this one up into 4 patches: first introduce the new function, make nouveau/ttm_eu use it, then drop the old function.
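
A rough sketch of the trylock loop items 4 and 5 are after, not the posted code; it is written as
if it lived inside ttm_bo.c (where ttm_bo_reserve_locked is in scope, called with the same arguments
as in this patch) and assumes the manager's LRU list head is man->lru. BOs that are already reserved
are simply skipped instead of being waited on:

static struct ttm_buffer_object *
example_reserve_first_unlocked_bo(struct ttm_bo_global *glob,
				  struct ttm_mem_type_manager *man)
{
	struct ttm_buffer_object *bo;
	int ret = -EBUSY;

	spin_lock(&glob->lru_lock);
	list_for_each_entry(bo, &man->lru, lru) {
		/* Trylock only: -EBUSY means another thread holds it, skip. */
		ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
		if (!ret)
			break;
	}

	if (ret) {
		/* Everything on the LRU was busy (or the list was empty). */
		spin_unlock(&glob->lru_lock);
		return NULL;
	}

	/* Return with the bo reserved and the lru_lock still held. */
	return bo;
}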
> If you could reorganize and make 4 patch series like this, it would be much easier to follow what happens and why, and would make it much easier for me to review. It seems to me if patch series 1-3 focus on the intended changes and the intended changes only, they would be quite small?
Would the above patches 1 to 6 in 1 series be ok too?

If so, a completely untested version is up at my git tree http://cgit.freedesktop.org/~mlankhorst/linux/log/

"drm/ttm: change fence_lock to inner lock" up to "drm/ttm: remove no_wait_reserve, v2"

At this point those patches are not even boot-tested, so I'll do some testing first before
resubmitting those.

The last 3 changes can be reviewed independently, but the last 2 changes
are probably best committed in that order, otherwise I need to fix up reserve_slowpath too.

~Maarten

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index a3383a7..70285ff 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -478,28 +478,26 @@  static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
-	struct ttm_bo_driver *driver;
+	struct ttm_bo_driver *driver = bdev->driver;
 	void *sync_obj = NULL;
 	int put_count;
 	int ret;
 
-	spin_lock(&bdev->fence_lock);
-	(void) ttm_bo_wait(bo, false, false, true);
-	if (!bo->sync_obj) {
-
-		spin_lock(&glob->lru_lock);
-
-		/**
-		 * Lock inversion between bo:reserve and bdev::fence_lock here,
-		 * but that's OK, since we're only trylocking.
-		 */
-
-		ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+	spin_lock(&glob->lru_lock);
+	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+	if (!ret) {
+		spin_lock(&bdev->fence_lock);
+		ret = ttm_bo_wait(bo, false, false, true);
 
-		if (unlikely(ret == -EBUSY))
+		if (unlikely(ret == -EBUSY)) {
+			sync_obj = driver->sync_obj_ref(bo->sync_obj);
+			spin_unlock(&bdev->fence_lock);
+			atomic_set(&bo->reserved, 0);
+			wake_up_all(&bo->event_queue);
 			goto queue;
-
+		}
 		spin_unlock(&bdev->fence_lock);
+
 		put_count = ttm_bo_del_from_lru(bo);
 
 		atomic_set(&bo->reserved, 0);
@@ -509,18 +507,11 @@  static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 		ttm_bo_list_ref_sub(bo, put_count, true);
 
 		return;
-	} else {
-		spin_lock(&glob->lru_lock);
 	}
 queue:
-	driver = bdev->driver;
-	if (bo->sync_obj)
-		sync_obj = driver->sync_obj_ref(bo->sync_obj);
-
 	kref_get(&bo->list_kref);
 	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
 	spin_unlock(&glob->lru_lock);
-	spin_unlock(&bdev->fence_lock);
 
 	if (sync_obj) {
 		driver->sync_obj_flush(sync_obj);
@@ -546,54 +537,60 @@  static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 			       bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
+	struct ttm_bo_driver *driver = bdev->driver;
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count;
 	int ret = 0;
+	void *sync_obj;
 
 retry:
-	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bdev->fence_lock);
+	spin_lock(&glob->lru_lock);
 
-	if (unlikely(ret != 0))
-		return ret;
+	ret = ttm_bo_reserve_locked(bo, interruptible,
+				    no_wait_reserve, false, 0);
 
-retry_reserve:
-	spin_lock(&glob->lru_lock);
+	if (unlikely(ret)) {
+		spin_unlock(&glob->lru_lock);
+		return ret;
+	}
 
 	if (unlikely(list_empty(&bo->ddestroy))) {
+		atomic_set(&bo->reserved, 0);
+		wake_up_all(&bo->event_queue);
 		spin_unlock(&glob->lru_lock);
 		return 0;
 	}
 
-	ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
-
-	if (unlikely(ret == -EBUSY)) {
-		spin_unlock(&glob->lru_lock);
-		if (likely(!no_wait_reserve))
-			ret = ttm_bo_wait_unreserved(bo, interruptible);
-		if (unlikely(ret != 0))
+	spin_lock(&bdev->fence_lock);
+	ret = ttm_bo_wait(bo, false, false, true);
+	if (ret) {
+		if (no_wait_gpu) {
+			spin_unlock(&bdev->fence_lock);
+			atomic_set(&bo->reserved, 0);
+			wake_up_all(&bo->event_queue);
+			spin_unlock(&glob->lru_lock);
 			return ret;
+		}
 
-		goto retry_reserve;
-	}
-
-	BUG_ON(ret != 0);
-
-	/**
-	 * We can re-check for sync object without taking
-	 * the bo::lock since setting the sync object requires
-	 * also bo::reserved. A busy object at this point may
-	 * be caused by another thread recently starting an accelerated
-	 * eviction.
-	 */
+		/**
+		 * Take a reference to the fence and unreserve, if the wait
+		 * was successful and no new sync_obj was attached,
+		 * ttm_bo_wait in retry will return ret = 0, and end the loop.
+		 */
 
-	if (unlikely(bo->sync_obj)) {
+		sync_obj = driver->sync_obj_ref(bo->sync_obj);
+		spin_unlock(&bdev->fence_lock);
 		atomic_set(&bo->reserved, 0);
 		wake_up_all(&bo->event_queue);
 		spin_unlock(&glob->lru_lock);
+
+		ret = driver->sync_obj_wait(sync_obj, false, interruptible);
+		driver->sync_obj_unref(&sync_obj);
+		if (ret)
+			return ret;
 		goto retry;
 	}
+	spin_unlock(&bdev->fence_lock);
 
 	put_count = ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 1986d00..cd9e452 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -213,8 +213,8 @@  void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 	driver = bdev->driver;
 	glob = bo->glob;
 
-	spin_lock(&bdev->fence_lock);
 	spin_lock(&glob->lru_lock);
+	spin_lock(&bdev->fence_lock);
 
 	list_for_each_entry(entry, list, head) {
 		bo = entry->bo;
@@ -223,8 +223,8 @@  void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 		ttm_bo_unreserve_locked(bo);
 		entry->reserved = false;
 	}
-	spin_unlock(&glob->lru_lock);
 	spin_unlock(&bdev->fence_lock);
+	spin_unlock(&glob->lru_lock);
 
 	list_for_each_entry(entry, list, head) {
 		if (entry->old_sync_obj)