diff mbox

[v3,17/28] drm/i915: Convert 'trace_irq' to use requests rather than seqnos

Message ID 1416854990-1920-18-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison Nov. 24, 2014, 6:49 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

Updated the trace_irq code to use requests instead of seqnos. This includes
reference counting the request object to ensure it sticks around when required.
Note that getting access to the reference counting functions means moving the
inline i915_trace_irq_get() function from intel_ringbuffer.h to i915_drv.h.

For: VIZ-4377
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Thomas Daniel <Thomas.Daniel@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    7 +++++++
 drivers/gpu/drm/i915/i915_gem.c         |    7 ++++---
 drivers/gpu/drm/i915/i915_trace.h       |    2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |    8 +-------
 4 files changed, 13 insertions(+), 11 deletions(-)

Comments

Daniel Vetter Nov. 26, 2014, 1:24 p.m. UTC | #1
On Mon, Nov 24, 2014 at 06:49:39PM +0000, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> Updated the trace_irq code to use requests instead of seqnos. This includes
> reference counting the request object to ensure it sticks around when required.
> Note that getting access to the reference counting functions means moving the
> inline i915_trace_irq_get() function from intel_ringbuffer.h to i915_drv.h.
> 
> For: VIZ-4377
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Reviewed-by: Thomas Daniel <Thomas.Daniel@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |    7 +++++++
>  drivers/gpu/drm/i915/i915_gem.c         |    7 ++++---
>  drivers/gpu/drm/i915/i915_trace.h       |    2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    8 +-------
>  4 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8bfdac6..831fae2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>  	}
>  }
>  
> +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
> +				      struct drm_i915_gem_request *req)
> +{
> +	if (ring->trace_irq_req == NULL && ring->irq_get(ring))
> +		i915_gem_request_assign(&ring->trace_irq_req, req);

This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
ot protected at all by anything. But that was nothing troublesome since we
didn't hang a real resource of it.

But now there's a refcounted request in that pointer, which means if we
race we leak. I'll skip this patch for now.
-Daniel

> +}
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 69c3e50..69bddcb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2810,10 +2810,11 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>  		i915_gem_free_request(request);
>  	}
>  
> -	if (unlikely(ring->trace_irq_seqno &&
> -		     i915_seqno_passed(seqno, ring->trace_irq_seqno))) {
> +	if (unlikely(ring->trace_irq_req &&
> +		     i915_seqno_passed(seqno,
> +			 i915_gem_request_get_seqno(ring->trace_irq_req)))) {
>  		ring->irq_put(ring);
> -		ring->trace_irq_seqno = 0;
> +		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>  	}
>  
>  	while (!list_empty(&ring->delayed_free_list)) {
> diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
> index 2c0327b..2ade958 100644
> --- a/drivers/gpu/drm/i915/i915_trace.h
> +++ b/drivers/gpu/drm/i915/i915_trace.h
> @@ -369,7 +369,7 @@ TRACE_EVENT(i915_gem_ring_dispatch,
>  			   __entry->ring = ring->id;
>  			   __entry->seqno = i915_gem_request_get_seqno(req);
>  			   __entry->flags = flags;
> -			   i915_trace_irq_get(ring, __entry->seqno);
> +			   i915_trace_irq_get(ring, req);
>  			   ),
>  
>  	    TP_printk("dev=%u, ring=%u, seqno=%u, flags=%x",
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index cd1a9f9..c8b84de 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -142,7 +142,7 @@ struct  intel_engine_cs {
>  
>  	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
>  	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
> -	u32		trace_irq_seqno;
> +	struct drm_i915_gem_request *trace_irq_req;
>  	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
>  	void		(*irq_put)(struct intel_engine_cs *ring);
>  
> @@ -444,10 +444,4 @@ intel_ring_get_request(struct intel_engine_cs *ring)
>  	return ring->outstanding_lazy_request;
>  }
>  
> -static inline void i915_trace_irq_get(struct intel_engine_cs *ring, u32 seqno)
> -{
> -	if (ring->trace_irq_seqno == 0 && ring->irq_get(ring))
> -		ring->trace_irq_seqno = seqno;
> -}
> -
>  #endif /* _INTEL_RINGBUFFER_H_ */
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
John Harrison Nov. 26, 2014, 2:12 p.m. UTC | #2
On 26/11/2014 13:24, Daniel Vetter wrote:
> On Mon, Nov 24, 2014 at 06:49:39PM +0000, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> Updated the trace_irq code to use requests instead of seqnos. This includes
>> reference counting the request object to ensure it sticks around when required.
>> Note that getting access to the reference counting functions means moving the
>> inline i915_trace_irq_get() function from intel_ringbuffer.h to i915_drv.h.
>>
>> For: VIZ-4377
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> Reviewed-by: Thomas Daniel <Thomas.Daniel@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h         |    7 +++++++
>>   drivers/gpu/drm/i915/i915_gem.c         |    7 ++++---
>>   drivers/gpu/drm/i915/i915_trace.h       |    2 +-
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |    8 +-------
>>   4 files changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 8bfdac6..831fae2 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>>   	}
>>   }
>>   
>> +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
>> +				      struct drm_i915_gem_request *req)
>> +{
>> +	if (ring->trace_irq_req == NULL && ring->irq_get(ring))
>> +		i915_gem_request_assign(&ring->trace_irq_req, req);
> This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
> ot protected at all by anything. But that was nothing troublesome since we
> didn't hang a real resource of it.
>
> But now there's a refcounted request in that pointer, which means if we
> race we leak. I'll skip this patch for now.
> -Daniel

Race how? The assignment only ever occurs from inside execbuffer 
submission at which point the driver mutex lock is held. Therefore it is 
very definitely protected. Not doing the reference count means that 
there is now the possibility of a dangling pointer and thus the 
possibility of going bang with a kernel oops.

>
>> +}
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 69c3e50..69bddcb 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2810,10 +2810,11 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>>   		i915_gem_free_request(request);
>>   	}
>>   
>> -	if (unlikely(ring->trace_irq_seqno &&
>> -		     i915_seqno_passed(seqno, ring->trace_irq_seqno))) {
>> +	if (unlikely(ring->trace_irq_req &&
>> +		     i915_seqno_passed(seqno,
>> +			 i915_gem_request_get_seqno(ring->trace_irq_req)))) {
>>   		ring->irq_put(ring);
>> -		ring->trace_irq_seqno = 0;
>> +		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>>   	}
>>   
>>   	while (!list_empty(&ring->delayed_free_list)) {
>> diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
>> index 2c0327b..2ade958 100644
>> --- a/drivers/gpu/drm/i915/i915_trace.h
>> +++ b/drivers/gpu/drm/i915/i915_trace.h
>> @@ -369,7 +369,7 @@ TRACE_EVENT(i915_gem_ring_dispatch,
>>   			   __entry->ring = ring->id;
>>   			   __entry->seqno = i915_gem_request_get_seqno(req);
>>   			   __entry->flags = flags;
>> -			   i915_trace_irq_get(ring, __entry->seqno);
>> +			   i915_trace_irq_get(ring, req);
>>   			   ),
>>   
>>   	    TP_printk("dev=%u, ring=%u, seqno=%u, flags=%x",
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index cd1a9f9..c8b84de 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -142,7 +142,7 @@ struct  intel_engine_cs {
>>   
>>   	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
>>   	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
>> -	u32		trace_irq_seqno;
>> +	struct drm_i915_gem_request *trace_irq_req;
>>   	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
>>   	void		(*irq_put)(struct intel_engine_cs *ring);
>>   
>> @@ -444,10 +444,4 @@ intel_ring_get_request(struct intel_engine_cs *ring)
>>   	return ring->outstanding_lazy_request;
>>   }
>>   
>> -static inline void i915_trace_irq_get(struct intel_engine_cs *ring, u32 seqno)
>> -{
>> -	if (ring->trace_irq_seqno == 0 && ring->irq_get(ring))
>> -		ring->trace_irq_seqno = seqno;
>> -}
>> -
>>   #endif /* _INTEL_RINGBUFFER_H_ */
>> -- 
>> 1.7.9.5
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Nov. 26, 2014, 2:31 p.m. UTC | #3
On Wed, Nov 26, 2014 at 02:12:53PM +0000, John Harrison wrote:
> On 26/11/2014 13:24, Daniel Vetter wrote:
> >On Mon, Nov 24, 2014 at 06:49:39PM +0000, John.C.Harrison@Intel.com wrote:
> >>From: John Harrison <John.C.Harrison@Intel.com>
> >>
> >>Updated the trace_irq code to use requests instead of seqnos. This includes
> >>reference counting the request object to ensure it sticks around when required.
> >>Note that getting access to the reference counting functions means moving the
> >>inline i915_trace_irq_get() function from intel_ringbuffer.h to i915_drv.h.
> >>
> >>For: VIZ-4377
> >>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> >>Reviewed-by: Thomas Daniel <Thomas.Daniel@intel.com>
> >>---
> >>  drivers/gpu/drm/i915/i915_drv.h         |    7 +++++++
> >>  drivers/gpu/drm/i915/i915_gem.c         |    7 ++++---
> >>  drivers/gpu/drm/i915/i915_trace.h       |    2 +-
> >>  drivers/gpu/drm/i915/intel_ringbuffer.h |    8 +-------
> >>  4 files changed, 13 insertions(+), 11 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >>index 8bfdac6..831fae2 100644
> >>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>@@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
> >>  	}
> >>  }
> >>+static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
> >>+				      struct drm_i915_gem_request *req)
> >>+{
> >>+	if (ring->trace_irq_req == NULL && ring->irq_get(ring))
> >>+		i915_gem_request_assign(&ring->trace_irq_req, req);
> >This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
> >ot protected at all by anything. But that was nothing troublesome since we
> >didn't hang a real resource of it.
> >
> >But now there's a refcounted request in that pointer, which means if we
> >race we leak. I'll skip this patch for now.
> >-Daniel
> 
> Race how? The assignment only ever occurs from inside execbuffer
> submission at which point the driver mutex lock is held. Therefore
> it is very definitely protected. Not doing the reference count means
> that there is now the possibility of a dangling pointer and thus the
> possibility of going bang with a kernel oops.

It's suspicious because the code is broken and you didn't read my patch.
 
> >
> >>+}
> >>+
> >>  #endif
> >>diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>index 69c3e50..69bddcb 100644
> >>--- a/drivers/gpu/drm/i915/i915_gem.c
> >>+++ b/drivers/gpu/drm/i915/i915_gem.c
> >>@@ -2810,10 +2810,11 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
> >>  		i915_gem_free_request(request);
> >>  	}
> >>-	if (unlikely(ring->trace_irq_seqno &&
> >>-		     i915_seqno_passed(seqno, ring->trace_irq_seqno))) {
> >>+	if (unlikely(ring->trace_irq_req &&
> >>+		     i915_seqno_passed(seqno,
> >>+			 i915_gem_request_get_seqno(ring->trace_irq_req)))) {

It simply does not need transitioning to requests.
-Chris
Daniel Vetter Nov. 26, 2014, 2:42 p.m. UTC | #4
On Wed, Nov 26, 2014 at 3:12 PM, John Harrison
<John.C.Harrison@intel.com> wrote:
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>>> b/drivers/gpu/drm/i915/i915_drv.h
>>> index 8bfdac6..831fae2 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long
>>> timestamp_jiffies, int to_wait_ms)
>>>         }
>>>   }
>>>   +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
>>> +                                     struct drm_i915_gem_request *req)
>>> +{
>>> +       if (ring->trace_irq_req == NULL && ring->irq_get(ring))
>>> +               i915_gem_request_assign(&ring->trace_irq_req, req);
>>
>> This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
>> ot protected at all by anything. But that was nothing troublesome since we
>> didn't hang a real resource of it.
>>
>> But now there's a refcounted request in that pointer, which means if we
>> race we leak. I'll skip this patch for now.
>> -Daniel
>
>
> Race how? The assignment only ever occurs from inside execbuffer submission
> at which point the driver mutex lock is held. Therefore it is very
> definitely protected. Not doing the reference count means that there is now
> the possibility of a dangling pointer and thus the possibility of going bang
> with a kernel oops.

Hm, ->trace_irq_seqno is indeed always touched from the a calling
context with dev->struct_mutex held. Somehow I've misrembered that
since the realtime/tracing folks are really freaked out about what
we're doing here. But from that pov your patch doesn't really make
things worse, so I'll pull it in.

Btw I don't see the oops really without this patch. What would blow up?
-Daniel
John Harrison Nov. 26, 2014, 2:58 p.m. UTC | #5
On 26/11/2014 14:42, Daniel Vetter wrote:
> On Wed, Nov 26, 2014 at 3:12 PM, John Harrison
> <John.C.Harrison@intel.com> wrote:
>>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>>>> b/drivers/gpu/drm/i915/i915_drv.h
>>>> index 8bfdac6..831fae2 100644
>>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>>> @@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long
>>>> timestamp_jiffies, int to_wait_ms)
>>>>          }
>>>>    }
>>>>    +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
>>>> +                                     struct drm_i915_gem_request *req)
>>>> +{
>>>> +       if (ring->trace_irq_req == NULL && ring->irq_get(ring))
>>>> +               i915_gem_request_assign(&ring->trace_irq_req, req);
>>> This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
>>> ot protected at all by anything. But that was nothing troublesome since we
>>> didn't hang a real resource of it.
>>>
>>> But now there's a refcounted request in that pointer, which means if we
>>> race we leak. I'll skip this patch for now.
>>> -Daniel
>>
>> Race how? The assignment only ever occurs from inside execbuffer submission
>> at which point the driver mutex lock is held. Therefore it is very
>> definitely protected. Not doing the reference count means that there is now
>> the possibility of a dangling pointer and thus the possibility of going bang
>> with a kernel oops.
> Hm, ->trace_irq_seqno is indeed always touched from the a calling
> context with dev->struct_mutex held. Somehow I've misrembered that
> since the realtime/tracing folks are really freaked out about what
> we're doing here. But from that pov your patch doesn't really make
> things worse, so I'll pull it in.
>
> Btw I don't see the oops really without this patch. What would blow up?
> -Daniel

The sole access (and clear to null) of the trace pointer is done from 
retire requests after the requests have been retired. Thus the request 
structure could have just been freed immediately before it is used. The 
code could be re-ordered to be safer but I'm not entirely sure what the 
trace pointer is for or what it might potentially be used for in the 
future. With the reference counting, the ordering is irrelevant. If the 
pointer exists then it is safe to use.

The point is that anywhere that keeps a copy of a request pointer really 
should reference count that copy. Otherwise there is the possibility 
that the pointer could become stale. Either now or with future code 
changes. If the copy is always done with the request_assign() function 
then the pointer is guaranteed safe for all time.
Daniel Vetter Nov. 26, 2014, 6:23 p.m. UTC | #6
On Wed, Nov 26, 2014 at 02:58:49PM +0000, John Harrison wrote:
> On 26/11/2014 14:42, Daniel Vetter wrote:
> >On Wed, Nov 26, 2014 at 3:12 PM, John Harrison
> ><John.C.Harrison@intel.com> wrote:
> >>>>diff --git a/drivers/gpu/drm/i915/i915_drv.h
> >>>>b/drivers/gpu/drm/i915/i915_drv.h
> >>>>index 8bfdac6..831fae2 100644
> >>>>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>>>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>>>@@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long
> >>>>timestamp_jiffies, int to_wait_ms)
> >>>>         }
> >>>>   }
> >>>>   +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
> >>>>+                                     struct drm_i915_gem_request *req)
> >>>>+{
> >>>>+       if (ring->trace_irq_req == NULL && ring->irq_get(ring))
> >>>>+               i915_gem_request_assign(&ring->trace_irq_req, req);
> >>>This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
> >>>ot protected at all by anything. But that was nothing troublesome since we
> >>>didn't hang a real resource of it.
> >>>
> >>>But now there's a refcounted request in that pointer, which means if we
> >>>race we leak. I'll skip this patch for now.
> >>>-Daniel
> >>
> >>Race how? The assignment only ever occurs from inside execbuffer submission
> >>at which point the driver mutex lock is held. Therefore it is very
> >>definitely protected. Not doing the reference count means that there is now
> >>the possibility of a dangling pointer and thus the possibility of going bang
> >>with a kernel oops.
> >Hm, ->trace_irq_seqno is indeed always touched from the a calling
> >context with dev->struct_mutex held. Somehow I've misrembered that
> >since the realtime/tracing folks are really freaked out about what
> >we're doing here. But from that pov your patch doesn't really make
> >things worse, so I'll pull it in.
> >
> >Btw I don't see the oops really without this patch. What would blow up?
> >-Daniel
> 
> The sole access (and clear to null) of the trace pointer is done from retire
> requests after the requests have been retired. Thus the request structure
> could have just been freed immediately before it is used. The code could be
> re-ordered to be safer but I'm not entirely sure what the trace pointer is
> for or what it might potentially be used for in the future. With the
> reference counting, the ordering is irrelevant. If the pointer exists then
> it is safe to use.
> 
> The point is that anywhere that keeps a copy of a request pointer really
> should reference count that copy. Otherwise there is the possibility that
> the pointer could become stale. Either now or with future code changes. If
> the copy is always done with the request_assign() function then the pointer
> is guaranteed safe for all time.

Oh, I guess you've misunderstood what I've done. Ive dropped the entire
patch here, not just the refcounting. Dropping the refcounting alone is
obviously oops-worthy.
-Daniel
Daniel Vetter Nov. 26, 2014, 6:25 p.m. UTC | #7
On Wed, Nov 26, 2014 at 07:23:36PM +0100, Daniel Vetter wrote:
> On Wed, Nov 26, 2014 at 02:58:49PM +0000, John Harrison wrote:
> > On 26/11/2014 14:42, Daniel Vetter wrote:
> > >On Wed, Nov 26, 2014 at 3:12 PM, John Harrison
> > ><John.C.Harrison@intel.com> wrote:
> > >>>>diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > >>>>b/drivers/gpu/drm/i915/i915_drv.h
> > >>>>index 8bfdac6..831fae2 100644
> > >>>>--- a/drivers/gpu/drm/i915/i915_drv.h
> > >>>>+++ b/drivers/gpu/drm/i915/i915_drv.h
> > >>>>@@ -3130,4 +3130,11 @@ wait_remaining_ms_from_jiffies(unsigned long
> > >>>>timestamp_jiffies, int to_wait_ms)
> > >>>>         }
> > >>>>   }
> > >>>>   +static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
> > >>>>+                                     struct drm_i915_gem_request *req)
> > >>>>+{
> > >>>>+       if (ring->trace_irq_req == NULL && ring->irq_get(ring))
> > >>>>+               i915_gem_request_assign(&ring->trace_irq_req, req);
> > >>>This looks a bit suspiciuos. Thus far ring->trace_irq_req was essentially
> > >>>ot protected at all by anything. But that was nothing troublesome since we
> > >>>didn't hang a real resource of it.
> > >>>
> > >>>But now there's a refcounted request in that pointer, which means if we
> > >>>race we leak. I'll skip this patch for now.
> > >>>-Daniel
> > >>
> > >>Race how? The assignment only ever occurs from inside execbuffer submission
> > >>at which point the driver mutex lock is held. Therefore it is very
> > >>definitely protected. Not doing the reference count means that there is now
> > >>the possibility of a dangling pointer and thus the possibility of going bang
> > >>with a kernel oops.
> > >Hm, ->trace_irq_seqno is indeed always touched from the a calling
> > >context with dev->struct_mutex held. Somehow I've misrembered that
> > >since the realtime/tracing folks are really freaked out about what
> > >we're doing here. But from that pov your patch doesn't really make
> > >things worse, so I'll pull it in.
> > >
> > >Btw I don't see the oops really without this patch. What would blow up?
> > >-Daniel
> > 
> > The sole access (and clear to null) of the trace pointer is done from retire
> > requests after the requests have been retired. Thus the request structure
> > could have just been freed immediately before it is used. The code could be
> > re-ordered to be safer but I'm not entirely sure what the trace pointer is
> > for or what it might potentially be used for in the future. With the
> > reference counting, the ordering is irrelevant. If the pointer exists then
> > it is safe to use.
> > 
> > The point is that anywhere that keeps a copy of a request pointer really
> > should reference count that copy. Otherwise there is the possibility that
> > the pointer could become stale. Either now or with future code changes. If
> > the copy is always done with the request_assign() function then the pointer
> > is guaranteed safe for all time.
> 
> Oh, I guess you've misunderstood what I've done. Ive dropped the entire
> patch here, not just the refcounting. Dropping the refcounting alone is
> obviously oops-worthy.

Trying to clarify more: The problem I've thought I've seen is _not_ with
the unref/ref done without holding struct mutex. But with different people
accessing ring->trace_irq_* without locks. Somehow I've thought we've had
no common lock thus far for that - iirc there was a bit of that code in
the irq handler once, but I can't find it any more.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8bfdac6..831fae2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3130,4 +3130,11 @@  wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
 	}
 }
 
+static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
+				      struct drm_i915_gem_request *req)
+{
+	if (ring->trace_irq_req == NULL && ring->irq_get(ring))
+		i915_gem_request_assign(&ring->trace_irq_req, req);
+}
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 69c3e50..69bddcb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2810,10 +2810,11 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 		i915_gem_free_request(request);
 	}
 
-	if (unlikely(ring->trace_irq_seqno &&
-		     i915_seqno_passed(seqno, ring->trace_irq_seqno))) {
+	if (unlikely(ring->trace_irq_req &&
+		     i915_seqno_passed(seqno,
+			 i915_gem_request_get_seqno(ring->trace_irq_req)))) {
 		ring->irq_put(ring);
-		ring->trace_irq_seqno = 0;
+		i915_gem_request_assign(&ring->trace_irq_req, NULL);
 	}
 
 	while (!list_empty(&ring->delayed_free_list)) {
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 2c0327b..2ade958 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -369,7 +369,7 @@  TRACE_EVENT(i915_gem_ring_dispatch,
 			   __entry->ring = ring->id;
 			   __entry->seqno = i915_gem_request_get_seqno(req);
 			   __entry->flags = flags;
-			   i915_trace_irq_get(ring, __entry->seqno);
+			   i915_trace_irq_get(ring, req);
 			   ),
 
 	    TP_printk("dev=%u, ring=%u, seqno=%u, flags=%x",
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index cd1a9f9..c8b84de 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -142,7 +142,7 @@  struct  intel_engine_cs {
 
 	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
 	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
-	u32		trace_irq_seqno;
+	struct drm_i915_gem_request *trace_irq_req;
 	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
 	void		(*irq_put)(struct intel_engine_cs *ring);
 
@@ -444,10 +444,4 @@  intel_ring_get_request(struct intel_engine_cs *ring)
 	return ring->outstanding_lazy_request;
 }
 
-static inline void i915_trace_irq_get(struct intel_engine_cs *ring, u32 seqno)
-{
-	if (ring->trace_irq_seqno == 0 && ring->irq_get(ring))
-		ring->trace_irq_seqno = seqno;
-}
-
 #endif /* _INTEL_RINGBUFFER_H_ */