diff mbox

[RFC,6/9] drm/i915: Delay the freeing of requests until retire time

Message ID 1437143483-6234-7-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison July 17, 2015, 2:31 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The request structure is reference counted. When the count reached
zero, the request was immediately freed and all associated objects
were unreferenced/deallocated. This meant that the driver mutex lock
must be held at the point where the count reaches zero. This was fine
while all references were held internally to the driver. However, the
plan is to allow the underlying fence object (and hence the request
itself) to be returned to other drivers and to userland. External
users cannot be expected to acquire a driver private mutex lock.

Rather than attempt to disentangle the request structure from the
driver mutex lock, the decision was to defer the free code until a
later (safer) point. Hence this patch changes the unreference callback
to merely move the request onto a delayed free list. The driver's
retire worker thread will then process the list and actually call the
free function on the requests.

[new patch in series]

For: VIZ-5190
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 22 +++---------------
 drivers/gpu/drm/i915/i915_gem.c         | 41 +++++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/intel_display.c    |  2 +-
 drivers/gpu/drm/i915/intel_lrc.c        |  2 ++
 drivers/gpu/drm/i915/intel_pm.c         |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++++
 7 files changed, 50 insertions(+), 25 deletions(-)

Comments

Tvrtko Ursulin July 23, 2015, 2:25 p.m. UTC | #1
Hi,

On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> The request structure is reference counted. When the count reached
> zero, the request was immediately freed and all associated objects
> were unrefereced/unallocated. This meant that the driver mutex lock
> must be held at the point where the count reaches zero. This was fine
> while all references were held internally to the driver. However, the
> plan is to allow the underlying fence object (and hence the request
> itself) to be returned to other drivers and to userland. External
> users cannot be expected to acquire a driver private mutex lock.
>
> Rather than attempt to disentangle the request structure from the
> driver mutex lock, the decsion was to defer the free code until a
> later (safer) point. Hence this patch changes the unreference callback
> to merely move the request onto a delayed free list. The driver's
> retire worker thread will then process the list and actually call the
> free function on the requests.
>
> [new patch in series]
>
> For: VIZ-5190
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         | 22 +++---------------
>   drivers/gpu/drm/i915/i915_gem.c         | 41 +++++++++++++++++++++++++++++----
>   drivers/gpu/drm/i915/intel_display.c    |  2 +-
>   drivers/gpu/drm/i915/intel_lrc.c        |  2 ++
>   drivers/gpu/drm/i915/intel_pm.c         |  2 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++++
>   7 files changed, 50 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 88a4746..61c3db2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2161,14 +2161,9 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old,
>    * initial reference taken using kref_init
>    */
>   struct drm_i915_gem_request {
> -	/**
> -	 * Underlying object for implementing the signal/wait stuff.
> -	 * NB: Never return this fence object to user land! It is unsafe to
> -	 * let anything outside of the i915 driver get hold of the fence
> -	 * object as the clean up when decrementing the reference count
> -	 * requires holding the driver mutex lock.
> -	 */
> +	/** Underlying object for implementing the signal/wait stuff. */
>   	struct fence fence;
> +	struct list_head delay_free_list;

Maybe call this delay_free_link to continue the established convention.

>
>   	/** On Which ring this request was generated */
>   	struct drm_i915_private *i915;
> @@ -2281,21 +2276,10 @@ i915_gem_request_reference(struct drm_i915_gem_request *req)
>   static inline void
>   i915_gem_request_unreference(struct drm_i915_gem_request *req)
>   {
> -	WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
> -	fence_put(&req->fence);
> -}
> -
> -static inline void
> -i915_gem_request_unreference__unlocked(struct drm_i915_gem_request *req)
> -{
> -	struct drm_device *dev;
> -
>   	if (!req)
>   		return;
>
> -	dev = req->ring->dev;
> -	if (kref_put_mutex(&req->fence.refcount, fence_release, &dev->struct_mutex))
> -		mutex_unlock(&dev->struct_mutex);
> +	fence_put(&req->fence);
>   }
>
>   static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index af79716..482835a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2616,10 +2616,27 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv,
>   	}
>   }
>
> -static void i915_gem_request_free(struct fence *req_fence)
> +static void i915_gem_request_release(struct fence *req_fence)
>   {
>   	struct drm_i915_gem_request *req = container_of(req_fence,
>   						 typeof(*req), fence);
> +	struct intel_engine_cs *ring = req->ring;
> +	struct drm_i915_private *dev_priv = to_i915(ring->dev);
> +	unsigned long flags;
> +
> +	/*
> +	 * Need to add the request to a deferred dereference list to be
> +	 * processed at a mutex lock safe time.
> +	 */
> +	spin_lock_irqsave(&ring->delayed_free_lock, flags);

At the moment there is no request unreferencing from irq handlers right? 
Unless (or until) you plan to add that you could use simple spin_lock 
here. (And in the i915_gem_retire_requests_ring.)

> +	list_add_tail(&req->delay_free_list, &ring->delayed_free_list);
> +	spin_unlock_irqrestore(&ring->delayed_free_lock, flags);
> +
> +	queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);

Have you decided to re-use the retire worker just for convenience or for
some other reason as well?

I found it a bit unexpected and thought a dedicated request free worker
would be cleaner, but I don't know, not a strong opinion.

> +}
> +
> +static void i915_gem_request_free(struct drm_i915_gem_request *req)
> +{
>   	struct intel_context *ctx = req->ctx;
>
>   	BUG_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
> @@ -2696,7 +2713,7 @@ static const struct fence_ops i915_gem_request_fops = {
>   	.enable_signaling	= i915_gem_request_enable_signaling,
>   	.signaled		= i915_gem_request_is_completed,
>   	.wait			= fence_default_wait,
> -	.release		= i915_gem_request_free,
> +	.release		= i915_gem_request_release,
>   	.fence_value_str	= i915_fence_value_str,
>   	.timeline_value_str	= i915_fence_timeline_value_str,
>   };
> @@ -2992,6 +3009,21 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>   		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>   	}
>
> +	while (!list_empty(&ring->delayed_free_list)) {
> +		struct drm_i915_gem_request *request;
> +		unsigned long flags;
> +
> +		request = list_first_entry(&ring->delayed_free_list,
> +					   struct drm_i915_gem_request,
> +					   delay_free_list);

Need a spinlock to sample list head here. Then maybe move it on a 
temporary list and do the freeing afterwards.

Regards,

Tvrtko
John Harrison Oct. 28, 2015, 1 p.m. UTC | #2
On 23/07/2015 15:25, Tvrtko Ursulin wrote:
>
> Hi,
>
> On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The request structure is reference counted. When the count reached
>> zero, the request was immediately freed and all associated objects
>> were unrefereced/unallocated. This meant that the driver mutex lock
>> must be held at the point where the count reaches zero. This was fine
>> while all references were held internally to the driver. However, the
>> plan is to allow the underlying fence object (and hence the request
>> itself) to be returned to other drivers and to userland. External
>> users cannot be expected to acquire a driver private mutex lock.
>>
>> Rather than attempt to disentangle the request structure from the
>> driver mutex lock, the decsion was to defer the free code until a
>> later (safer) point. Hence this patch changes the unreference callback
>> to merely move the request onto a delayed free list. The driver's
>> retire worker thread will then process the list and actually call the
>> free function on the requests.
>>
>> [new patch in series]
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h         | 22 +++---------------
>>   drivers/gpu/drm/i915/i915_gem.c         | 41 
>> +++++++++++++++++++++++++++++----
>>   drivers/gpu/drm/i915/intel_display.c    |  2 +-
>>   drivers/gpu/drm/i915/intel_lrc.c        |  2 ++
>>   drivers/gpu/drm/i915/intel_pm.c         |  2 +-
>>   drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++++
>>   7 files changed, 50 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h 
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 88a4746..61c3db2 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2161,14 +2161,9 @@ void i915_gem_track_fb(struct 
>> drm_i915_gem_object *old,
>>    * initial reference taken using kref_init
>>    */
>>   struct drm_i915_gem_request {
>> -    /**
>> -     * Underlying object for implementing the signal/wait stuff.
>> -     * NB: Never return this fence object to user land! It is unsafe to
>> -     * let anything outside of the i915 driver get hold of the fence
>> -     * object as the clean up when decrementing the reference count
>> -     * requires holding the driver mutex lock.
>> -     */
>> +    /** Underlying object for implementing the signal/wait stuff. */
>>       struct fence fence;
>> +    struct list_head delay_free_list;
>
> Maybe call this delay_free_link to continue the established convention.
>
>>
>>       /** On Which ring this request was generated */
>>       struct drm_i915_private *i915;
>> @@ -2281,21 +2276,10 @@ i915_gem_request_reference(struct 
>> drm_i915_gem_request *req)
>>   static inline void
>>   i915_gem_request_unreference(struct drm_i915_gem_request *req)
>>   {
>> - WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
>> -    fence_put(&req->fence);
>> -}
>> -
>> -static inline void
>> -i915_gem_request_unreference__unlocked(struct drm_i915_gem_request 
>> *req)
>> -{
>> -    struct drm_device *dev;
>> -
>>       if (!req)
>>           return;
>>
>> -    dev = req->ring->dev;
>> -    if (kref_put_mutex(&req->fence.refcount, fence_release, 
>> &dev->struct_mutex))
>> -        mutex_unlock(&dev->struct_mutex);
>> +    fence_put(&req->fence);
>>   }
>>
>>   static inline void i915_gem_request_assign(struct 
>> drm_i915_gem_request **pdst,
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index af79716..482835a 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2616,10 +2616,27 @@ static void i915_set_reset_status(struct 
>> drm_i915_private *dev_priv,
>>       }
>>   }
>>
>> -static void i915_gem_request_free(struct fence *req_fence)
>> +static void i915_gem_request_release(struct fence *req_fence)
>>   {
>>       struct drm_i915_gem_request *req = container_of(req_fence,
>>                            typeof(*req), fence);
>> +    struct intel_engine_cs *ring = req->ring;
>> +    struct drm_i915_private *dev_priv = to_i915(ring->dev);
>> +    unsigned long flags;
>> +
>> +    /*
>> +     * Need to add the request to a deferred dereference list to be
>> +     * processed at a mutex lock safe time.
>> +     */
>> +    spin_lock_irqsave(&ring->delayed_free_lock, flags);
>
> At the moment there is no request unreferencing from irq handlers 
> right? Unless (or until) you plan to add that you could use simple 
> spin_lock here. (And in the i915_gem_retire_requests_ring.)

I don't believe there is an unreference at IRQ time at this precise 
moment. However, there certainly have been in various other iterations 
of the code (including one on the display side that has since 
disappeared due to changes by others completely unrelated to this work). 
So I would be nervous about not making it IRQ compatible. It seems like 
a bug waiting to happen.


>> + list_add_tail(&req->delay_free_list, &ring->delayed_free_list);
>> +    spin_unlock_irqrestore(&ring->delayed_free_lock, flags);
>> +
>> +    queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
>
> Have you decided to re-use the retire worker just for convenience of 
> for some other reason as well?

It seemed like the logical place to do this. It is a periodic function 
that cleans up requests. Converting the unreference into a free isn't a 
hugely time critical thing so adding an entire dedicated work handler 
seems like overkill. Plus, retire requests is usually the place that 
releases the last reference so it makes sense to do the free at the end 
of that code.

>
> I found it a bit unexpected and though dedicated request free worker 
> would be cleaner, but I don't know, not a strong opinion.
>
>> +}
>> +
>> +static void i915_gem_request_free(struct drm_i915_gem_request *req)
>> +{
>>       struct intel_context *ctx = req->ctx;
>>
>> BUG_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
>> @@ -2696,7 +2713,7 @@ static const struct fence_ops 
>> i915_gem_request_fops = {
>>       .enable_signaling    = i915_gem_request_enable_signaling,
>>       .signaled        = i915_gem_request_is_completed,
>>       .wait            = fence_default_wait,
>> -    .release        = i915_gem_request_free,
>> +    .release        = i915_gem_request_release,
>>       .fence_value_str    = i915_fence_value_str,
>>       .timeline_value_str    = i915_fence_timeline_value_str,
>>   };
>> @@ -2992,6 +3009,21 @@ i915_gem_retire_requests_ring(struct 
>> intel_engine_cs *ring)
>>           i915_gem_request_assign(&ring->trace_irq_req, NULL);
>>       }
>>
>> +    while (!list_empty(&ring->delayed_free_list)) {
>> +        struct drm_i915_gem_request *request;
>> +        unsigned long flags;
>> +
>> +        request = list_first_entry(&ring->delayed_free_list,
>> +                       struct drm_i915_gem_request,
>> +                       delay_free_list);
>
> Need a spinlock to sample list head here. Then maybe move it on a 
> temporary list and do the freeing afterwards.

Not necessary. The only other usage of the list is to add to it. So this 
code can't pull an entry that gets removed beneath its feet. Either the 
list empty test will return true and nothing further happens or there is 
definitely a node on the list and list_first_entry() will return 
something sane. The spinlock is only required when actually deleting 
that node.


>
> Regards,
>
> Tvrtko
Tvrtko Ursulin Oct. 28, 2015, 1:42 p.m. UTC | #3
On 28/10/15 13:00, John Harrison wrote:
> On 23/07/2015 15:25, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> The request structure is reference counted. When the count reached
>>> zero, the request was immediately freed and all associated objects
>>> were unrefereced/unallocated. This meant that the driver mutex lock
>>> must be held at the point where the count reaches zero. This was fine
>>> while all references were held internally to the driver. However, the
>>> plan is to allow the underlying fence object (and hence the request
>>> itself) to be returned to other drivers and to userland. External
>>> users cannot be expected to acquire a driver private mutex lock.
>>>
>>> Rather than attempt to disentangle the request structure from the
>>> driver mutex lock, the decsion was to defer the free code until a
>>> later (safer) point. Hence this patch changes the unreference callback
>>> to merely move the request onto a delayed free list. The driver's
>>> retire worker thread will then process the list and actually call the
>>> free function on the requests.
>>>
>>> [new patch in series]
>>>
>>> For: VIZ-5190
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h         | 22 +++---------------
>>>   drivers/gpu/drm/i915/i915_gem.c         | 41
>>> +++++++++++++++++++++++++++++----
>>>   drivers/gpu/drm/i915/intel_display.c    |  2 +-
>>>   drivers/gpu/drm/i915/intel_lrc.c        |  2 ++
>>>   drivers/gpu/drm/i915/intel_pm.c         |  2 +-
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++++
>>>   7 files changed, 50 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>>> b/drivers/gpu/drm/i915/i915_drv.h
>>> index 88a4746..61c3db2 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -2161,14 +2161,9 @@ void i915_gem_track_fb(struct
>>> drm_i915_gem_object *old,
>>>    * initial reference taken using kref_init
>>>    */
>>>   struct drm_i915_gem_request {
>>> -    /**
>>> -     * Underlying object for implementing the signal/wait stuff.
>>> -     * NB: Never return this fence object to user land! It is unsafe to
>>> -     * let anything outside of the i915 driver get hold of the fence
>>> -     * object as the clean up when decrementing the reference count
>>> -     * requires holding the driver mutex lock.
>>> -     */
>>> +    /** Underlying object for implementing the signal/wait stuff. */
>>>       struct fence fence;
>>> +    struct list_head delay_free_list;
>>
>> Maybe call this delay_free_link to continue the established convention.
>>
>>>
>>>       /** On Which ring this request was generated */
>>>       struct drm_i915_private *i915;
>>> @@ -2281,21 +2276,10 @@ i915_gem_request_reference(struct
>>> drm_i915_gem_request *req)
>>>   static inline void
>>>   i915_gem_request_unreference(struct drm_i915_gem_request *req)
>>>   {
>>> - WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
>>> -    fence_put(&req->fence);
>>> -}
>>> -
>>> -static inline void
>>> -i915_gem_request_unreference__unlocked(struct drm_i915_gem_request
>>> *req)
>>> -{
>>> -    struct drm_device *dev;
>>> -
>>>       if (!req)
>>>           return;
>>>
>>> -    dev = req->ring->dev;
>>> -    if (kref_put_mutex(&req->fence.refcount, fence_release,
>>> &dev->struct_mutex))
>>> -        mutex_unlock(&dev->struct_mutex);
>>> +    fence_put(&req->fence);
>>>   }
>>>
>>>   static inline void i915_gem_request_assign(struct
>>> drm_i915_gem_request **pdst,
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>>> b/drivers/gpu/drm/i915/i915_gem.c
>>> index af79716..482835a 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -2616,10 +2616,27 @@ static void i915_set_reset_status(struct
>>> drm_i915_private *dev_priv,
>>>       }
>>>   }
>>>
>>> -static void i915_gem_request_free(struct fence *req_fence)
>>> +static void i915_gem_request_release(struct fence *req_fence)
>>>   {
>>>       struct drm_i915_gem_request *req = container_of(req_fence,
>>>                            typeof(*req), fence);
>>> +    struct intel_engine_cs *ring = req->ring;
>>> +    struct drm_i915_private *dev_priv = to_i915(ring->dev);
>>> +    unsigned long flags;
>>> +
>>> +    /*
>>> +     * Need to add the request to a deferred dereference list to be
>>> +     * processed at a mutex lock safe time.
>>> +     */
>>> +    spin_lock_irqsave(&ring->delayed_free_lock, flags);
>>
>> At the moment there is no request unreferencing from irq handlers
>> right? Unless (or until) you plan to add that you could use simple
>> spin_lock here. (And in the i915_gem_retire_requests_ring.)
>
> I don't believe there is an unreference at IRQ time at this precise
> moment. However, there certainly have been in various other iterations
> of the code (including one on the display side that has since
> disappeared due to changes by others completely unrelated to this work).
> So I would be nervous about not making it IRQ compatible. It seems like
> a bug waiting to happen.

I think it is bad to take the cost of disabling interrupts for nothing.

Once the unthinkable happens and driver is re-designed so that it is 
possible to unreference from IRQ context it could be added.

>>> @@ -2992,6 +3009,21 @@ i915_gem_retire_requests_ring(struct
>>> intel_engine_cs *ring)
>>>           i915_gem_request_assign(&ring->trace_irq_req, NULL);
>>>       }
>>>
>>> +    while (!list_empty(&ring->delayed_free_list)) {
>>> +        struct drm_i915_gem_request *request;
>>> +        unsigned long flags;
>>> +
>>> +        request = list_first_entry(&ring->delayed_free_list,
>>> +                       struct drm_i915_gem_request,
>>> +                       delay_free_list);
>>
>> Need a spinlock to sample list head here. Then maybe move it on a
>> temporary list and do the freeing afterwards.
>
> Not necessary. The only other usage of the list is to add to it. So this
> code can't pull an entry that gets removed beneath its feet. Either the
> list empty test will return true and nothing further happens or there is
> definitely a node on the list and list_first_entry() will return
> something sane. The spinlock is only required when actually deleting
> that node.

NAK! :D

It only works because you know how lists are implemented. Say if 
list_empty checked for head->prev == head, and list_first_entry 
obviously uses head->next, then depending on ordering in list_add, you 
could grab some garbage, take the lock and dereference that garbage.

I see no gain in doing this trickery and it is fragile. And you still 
lock/unlock once per loop.

Why not use the common pattern of replacing the list under the lock and 
then operating on your local copy unrestricted?

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 88a4746..61c3db2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2161,14 +2161,9 @@  void i915_gem_track_fb(struct drm_i915_gem_object *old,
  * initial reference taken using kref_init
  */
 struct drm_i915_gem_request {
-	/**
-	 * Underlying object for implementing the signal/wait stuff.
-	 * NB: Never return this fence object to user land! It is unsafe to
-	 * let anything outside of the i915 driver get hold of the fence
-	 * object as the clean up when decrementing the reference count
-	 * requires holding the driver mutex lock.
-	 */
+	/** Underlying object for implementing the signal/wait stuff. */
 	struct fence fence;
+	struct list_head delay_free_list;
 
 	/** On Which ring this request was generated */
 	struct drm_i915_private *i915;
@@ -2281,21 +2276,10 @@  i915_gem_request_reference(struct drm_i915_gem_request *req)
 static inline void
 i915_gem_request_unreference(struct drm_i915_gem_request *req)
 {
-	WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
-	fence_put(&req->fence);
-}
-
-static inline void
-i915_gem_request_unreference__unlocked(struct drm_i915_gem_request *req)
-{
-	struct drm_device *dev;
-
 	if (!req)
 		return;
 
-	dev = req->ring->dev;
-	if (kref_put_mutex(&req->fence.refcount, fence_release, &dev->struct_mutex))
-		mutex_unlock(&dev->struct_mutex);
+	fence_put(&req->fence);
 }
 
 static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index af79716..482835a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2616,10 +2616,27 @@  static void i915_set_reset_status(struct drm_i915_private *dev_priv,
 	}
 }
 
-static void i915_gem_request_free(struct fence *req_fence)
+static void i915_gem_request_release(struct fence *req_fence)
 {
 	struct drm_i915_gem_request *req = container_of(req_fence,
 						 typeof(*req), fence);
+	struct intel_engine_cs *ring = req->ring;
+	struct drm_i915_private *dev_priv = to_i915(ring->dev);
+	unsigned long flags;
+
+	/*
+	 * Need to add the request to a deferred dereference list to be
+	 * processed at a mutex lock safe time.
+	 */
+	spin_lock_irqsave(&ring->delayed_free_lock, flags);
+	list_add_tail(&req->delay_free_list, &ring->delayed_free_list);
+	spin_unlock_irqrestore(&ring->delayed_free_lock, flags);
+
+	queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
+}
+
+static void i915_gem_request_free(struct drm_i915_gem_request *req)
+{
 	struct intel_context *ctx = req->ctx;
 
 	BUG_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
@@ -2696,7 +2713,7 @@  static const struct fence_ops i915_gem_request_fops = {
 	.enable_signaling	= i915_gem_request_enable_signaling,
 	.signaled		= i915_gem_request_is_completed,
 	.wait			= fence_default_wait,
-	.release		= i915_gem_request_free,
+	.release		= i915_gem_request_release,
 	.fence_value_str	= i915_fence_value_str,
 	.timeline_value_str	= i915_fence_timeline_value_str,
 };
@@ -2992,6 +3009,21 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 		i915_gem_request_assign(&ring->trace_irq_req, NULL);
 	}
 
+	while (!list_empty(&ring->delayed_free_list)) {
+		struct drm_i915_gem_request *request;
+		unsigned long flags;
+
+		request = list_first_entry(&ring->delayed_free_list,
+					   struct drm_i915_gem_request,
+					   delay_free_list);
+
+		spin_lock_irqsave(&ring->delayed_free_lock, flags);
+		list_del(&request->delay_free_list);
+		spin_unlock_irqrestore(&ring->delayed_free_lock, flags);
+
+		i915_gem_request_free(request);
+	}
+
 	WARN_ON(i915_verify_lists(ring->dev));
 }
 
@@ -3182,7 +3214,7 @@  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			ret = __i915_wait_request(req[i], reset_counter, true,
 						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
 						  file->driver_priv);
-		i915_gem_request_unreference__unlocked(req[i]);
+		i915_gem_request_unreference(req[i]);
 	}
 	return ret;
 
@@ -4425,7 +4457,7 @@  i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (ret == 0)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
 
-	i915_gem_request_unreference__unlocked(target);
+	i915_gem_request_unreference(target);
 
 	return ret;
 }
@@ -5313,6 +5345,7 @@  init_ring_lists(struct intel_engine_cs *ring)
 {
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
+	INIT_LIST_HEAD(&ring->delayed_free_list);
 }
 
 void i915_init_vm(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 93ac43c..59541ad 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11289,7 +11289,7 @@  static void intel_mmio_flip_work_func(struct work_struct *work)
 
 	intel_do_mmio_flip(mmio_flip->crtc);
 
-	i915_gem_request_unreference__unlocked(mmio_flip->req);
+	i915_gem_request_unreference(mmio_flip->req);
 	kfree(mmio_flip);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8f255de..9ee80f5 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1808,7 +1808,9 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
+	INIT_LIST_HEAD(&ring->delayed_free_list);
 	spin_lock_init(&ring->fence_lock);
+	spin_lock_init(&ring->delayed_free_lock);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	init_waitqueue_head(&ring->irq_queue);
 
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index cb08d9e..52b1c37 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -7317,7 +7317,7 @@  static void __intel_rps_boost_work(struct work_struct *work)
 		gen6_rps_boost(to_i915(req->ring->dev), NULL,
 			       req->emitted_jiffies);
 
-	i915_gem_request_unreference__unlocked(req);
+	i915_gem_request_unreference(req);
 	kfree(boost);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d1ced30..11494a3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2040,7 +2040,9 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	INIT_LIST_HEAD(&ring->execlist_queue);
+	INIT_LIST_HEAD(&ring->delayed_free_list);
 	spin_lock_init(&ring->fence_lock);
+	spin_lock_init(&ring->delayed_free_lock);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	ringbuf->size = 32 * PAGE_SIZE;
 	ringbuf->ring = ring;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e2eebc0..68173a3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -299,6 +299,10 @@  struct  intel_engine_cs {
 	 */
 	u32 last_submitted_seqno;
 
+	/* deferred free list to allow unreferencing requests outside the driver */
+	struct list_head delayed_free_list;
+	spinlock_t delayed_free_lock;
+
 	bool gpu_caches_dirty;
 
 	wait_queue_head_t irq_queue;