diff mbox series

[5/7] drm/i915: handle interrupts from the OA unit

Message ID 20190116153622.32576-6-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/perf: add OA interrupt support | expand

Commit Message

Lionel Landwerlin Jan. 16, 2019, 3:36 p.m. UTC
The OA unit can notify us through an interrupt when its circular
buffer is half full, and we would like to give applications the
ability to use this interrupt to avoid periodic CPU checks on the OA
buffer.

This change wires up the interrupt to the i915-perf stream and leaves
it ignored for now.

v2: Use spin_lock_irq() to access the IMR register on Haswell (Chris)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 21 +++++++++++++
 drivers/gpu/drm/i915/i915_irq.c         | 39 ++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_perf.c        | 26 +++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h         |  7 +++++
 drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
 5 files changed, 88 insertions(+), 7 deletions(-)
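
For context, a sketch of how a userspace client might consume such a
stream, blocking in poll() rather than re-reading the OA buffer on a
timer. This is not part of the patch; the metrics-set id and OA format
below are placeholders (a real client reads the metrics-set id from
sysfs):

#include <poll.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <drm/i915_drm.h>

static int open_oa_stream(int drm_fd)
{
	uint64_t properties[] = {
		DRM_I915_PERF_PROP_SAMPLE_OA, 1,
		DRM_I915_PERF_PROP_OA_METRICS_SET, 1,  /* placeholder id */
		DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
		DRM_I915_PERF_PROP_OA_EXPONENT, 16,    /* sampling period */
	};
	struct drm_i915_perf_open_param param = {
		.flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK,
		.num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
		.properties_ptr = (uintptr_t)properties,
	};

	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
}

static void read_reports(int stream_fd)
{
	char buf[128 * 1024];
	struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };

	/* With this series, the OA half-full interrupt (or the hrtimer)
	 * is what eventually wakes this poll(). */
	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		while (read(stream_fd, buf, sizeof(buf)) > 0)
			; /* parse OA sample records here */
	}
}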

Comments

Chris Wilson Jan. 16, 2019, 3:52 p.m. UTC | #1
Quoting Lionel Landwerlin (2019-01-16 15:36:20)
> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>                         wait_queue_head_t poll_wq;
>                         bool pollin;
>  
> +                       /**
> +                        * Atomic counter incremented by the interrupt
> +                        * handling code for each OA half full interrupt
> +                        * received.
> +                        */
> +                       atomic64_t half_full_count;
> +
> +                       /**
> +                        * Copy of the atomic half_full_count that was last
> +                        * processed in the i915-perf driver. If both counters
> +                        * differ, there is data available to read in the OA
> +                        * buffer.
> +                        */
> +                       u64 half_full_count_last;

Eh? But why a relatively expensive atomic64. You only need one bit, and
reading the tail pointer from WB memory should just be cheap. You should
be able to sample the perf ringbuffer pointers very cheaply... What am I
missing?
-Chris
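
(A sketch of the scheme Chris is suggesting — sample only the tail the
hardware wrote back to WB memory, with no MMIO read and no atomic; the
struct and field names are hypothetical, not the actual i915 code:)

struct oa_ring {
	u32 head;	/* last offset consumed by the reader */
	u32 tail;	/* last offset the OA unit wrote back to WB memory */
};

static bool oa_data_available(const struct oa_ring *oa)
{
	/* A plain load is enough; no atomic64, no register read. */
	return READ_ONCE(oa->tail) != oa->head;
}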
Lionel Landwerlin Jan. 16, 2019, 3:58 p.m. UTC | #2
On 16/01/2019 15:52, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>>                          wait_queue_head_t poll_wq;
>>                          bool pollin;
>>   
>> +                       /**
>> +                        * Atomic counter incremented by the interrupt
>> +                        * handling code for each OA half full interrupt
>> +                        * received.
>> +                        */
>> +                       atomic64_t half_full_count;
>> +
>> +                       /**
>> +                        * Copy of the atomic half_full_count that was last
>> +                        * processed in the i915-perf driver. If both counters
>> +                        * differ, there is data available to read in the OA
>> +                        * buffer.
>> +                        */
>> +                       u64 half_full_count_last;
> Eh? But why a relatively expensive atomic64. You only need one bit, and
> reading the tail pointer from WB memory should just be cheap. You should
> be able to sample the perf ringbuffer pointers very cheaply... What am I
> missing?
> -Chris
>
Fair comment.

The thing is, with this series there are 2 mechanisms that notify the poll_wq.

One is the hrtimer that kicks in at regular intervals and polls the 
register with the workaround.

The other is the interrupt, which doesn't read the registers or apply the workaround.


Maybe a better way to do it would be to have 2 wait queues: one triggers 
a workqueue for the oa_buffer_check after the interrupt, the other 
notifies the existing poll_wq.


-

Lionel
Lionel Landwerlin Jan. 16, 2019, 4:02 p.m. UTC | #3
On 16/01/2019 15:58, Lionel Landwerlin wrote:
> On 16/01/2019 15:52, Chris Wilson wrote:
>> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
>>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>>>                          wait_queue_head_t poll_wq;
>>>                          bool pollin;
>>>   +                       /**
>>> +                        * Atomic counter incremented by the interrupt
>>> +                        * handling code for each OA half full 
>>> interrupt
>>> +                        * received.
>>> +                        */
>>> +                       atomic64_t half_full_count;
>>> +
>>> +                       /**
>>> +                        * Copy of the atomic half_full_count that 
>>> was last
>>> +                        * processed in the i915-perf driver. If 
>>> both counters
>>> +                        * differ, there is data available to read 
>>> in the OA
>>> +                        * buffer.
>>> +                        */
>>> +                       u64 half_full_count_last;
>> Eh? But why a relatively expensive atomic64. You only need one bit, and
>> reading the tail pointer from WB memory should just be cheap. You should
>> be able to sample the perf ringbuffer pointers very cheaply... What am I
>> missing?
>> -Chris
>>
> Fair comment.
>
> The thing is with this series there are 2 mechanism that notify the 
> poll_wq.
>
> One is the hrtimer that kicks in at regular interval and polls the 
> register with the workaround.
>
> The other is the interrupt which doesn't read the registers and 
> workaround.


What I meant is that I need a way to know which one did the wake up, so 
I can call oa_check_buffer() in the interrupt case; that's what the 
atomic64 is there for.
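
(The interrupt half below is notify_perfmon_buffer_half_full() from the
patch; oa_woken_by_interrupt() is a hypothetical helper showing how the
poll path would consume the counter:)

static void notify_perfmon_buffer_half_full(struct drm_i915_private *i915)
{
	atomic64_inc(&i915->perf.oa.half_full_count);
	wake_up_all(&i915->perf.oa.poll_wq);
}

/* Poll side: the wakeup came from the interrupt iff the counter moved
 * since we last looked, in which case the OA buffer must be re-checked. */
static bool oa_woken_by_interrupt(struct drm_i915_private *i915)
{
	u64 count = atomic64_read(&i915->perf.oa.half_full_count);

	if (count == i915->perf.oa.half_full_count_last)
		return false;

	i915->perf.oa.half_full_count_last = count;
	return true;
}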


>
>
> Maybe a better way to do it would be to have 2 wait queues, one 
> triggers a workqueue for the oa_buffer_check after the interrupt, the 
> other notifies the existing poll_wq.
>
>
> -
>
> Lionel
>
Chris Wilson Jan. 16, 2019, 4:05 p.m. UTC | #4
Quoting Lionel Landwerlin (2019-01-16 15:58:00)
> On 16/01/2019 15:52, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2019-01-16 15:36:20)
> >> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
> >>                          wait_queue_head_t poll_wq;
> >>                          bool pollin;
> >>   
> >> +                       /**
> >> +                        * Atomic counter incremented by the interrupt
> >> +                        * handling code for each OA half full interrupt
> >> +                        * received.
> >> +                        */
> >> +                       atomic64_t half_full_count;
> >> +
> >> +                       /**
> >> +                        * Copy of the atomic half_full_count that was last
> >> +                        * processed in the i915-perf driver. If both counters
> >> +                        * differ, there is data available to read in the OA
> >> +                        * buffer.
> >> +                        */
> >> +                       u64 half_full_count_last;
> > Eh? But why a relatively expensive atomic64. You only need one bit, and
> > reading the tail pointer from WB memory should just be cheap. You should
> > be able to sample the perf ringbuffer pointers very cheaply... What am I
> > missing?
> > -Chris
> >
> Fair comment.
> 
> The thing is with this series there are 2 mechanism that notify the poll_wq.
> 
> One is the hrtimer that kicks in at regular interval and polls the 
> register with the workaround.
> 
> The other is the interrupt which doesn't read the registers and workaround.

What's the complication with the workaround?

> Maybe a better way to do it would be to have 2 wait queues, one triggers 
> a workqueue for the oa_buffer_check after the interrupt, the other 
> notifies the existing poll_wq.

Ok, I was expecting that the hrtimer, the interrupt and general signal
handling (spurious wakeups) all fed into the same mechanism to sample the
ringbuffer and notify the client if ready. (And I presume that we sample
from the client's process context anyway to save on the context switch.)
-Chris
Chris Wilson Jan. 16, 2019, 4:09 p.m. UTC | #5
Quoting Lionel Landwerlin (2019-01-16 16:02:51)
> On 16/01/2019 15:58, Lionel Landwerlin wrote:
> > On 16/01/2019 15:52, Chris Wilson wrote:
> >> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
> >>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
> >>>                          wait_queue_head_t poll_wq;
> >>>                          bool pollin;
> >>>   +                       /**
> >>> +                        * Atomic counter incremented by the interrupt
> >>> +                        * handling code for each OA half full 
> >>> interrupt
> >>> +                        * received.
> >>> +                        */
> >>> +                       atomic64_t half_full_count;
> >>> +
> >>> +                       /**
> >>> +                        * Copy of the atomic half_full_count that 
> >>> was last
> >>> +                        * processed in the i915-perf driver. If 
> >>> both counters
> >>> +                        * differ, there is data available to read 
> >>> in the OA
> >>> +                        * buffer.
> >>> +                        */
> >>> +                       u64 half_full_count_last;
> >> Eh? But why a relatively expensive atomic64. You only need one bit, and
> >> reading the tail pointer from WB memory should just be cheap. You should
> >> be able to sample the perf ringbuffer pointers very cheaply... What am I
> >> missing?
> >> -Chris
> >>
> > Fair comment.
> >
> > The thing is with this series there are 2 mechanism that notify the 
> > poll_wq.
> >
> > One is the hrtimer that kicks in at regular interval and polls the 
> > register with the workaround.
> >
> > The other is the interrupt which doesn't read the registers and 
> > workaround.
> 
> 
> What I meant is that I need a way to know which one did the wake up so I 
> can call oa_check_buffer() in the interrupt case, that's wait the 
> atomic64 is there for.

I thought oa_check_buffer() was just "is there any data for me?"
-Chris
Lionel Landwerlin Jan. 16, 2019, 4:25 p.m. UTC | #6
On 16/01/2019 16:05, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-01-16 15:58:00)
>> On 16/01/2019 15:52, Chris Wilson wrote:
>>> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
>>>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>>>>                           wait_queue_head_t poll_wq;
>>>>                           bool pollin;
>>>>    
>>>> +                       /**
>>>> +                        * Atomic counter incremented by the interrupt
>>>> +                        * handling code for each OA half full interrupt
>>>> +                        * received.
>>>> +                        */
>>>> +                       atomic64_t half_full_count;
>>>> +
>>>> +                       /**
>>>> +                        * Copy of the atomic half_full_count that was last
>>>> +                        * processed in the i915-perf driver. If both counters
>>>> +                        * differ, there is data available to read in the OA
>>>> +                        * buffer.
>>>> +                        */
>>>> +                       u64 half_full_count_last;
>>> Eh? But why a relatively expensive atomic64. You only need one bit, and
>>> reading the tail pointer from WB memory should just be cheap. You should
>>> be able to sample the perf ringbuffer pointers very cheaply... What am I
>>> missing?
>>> -Chris
>>>
>> Fair comment.
>>
>> The thing is with this series there are 2 mechanism that notify the poll_wq.
>>
>> One is the hrtimer that kicks in at regular interval and polls the
>> register with the workaround.
>>
>> The other is the interrupt which doesn't read the registers and workaround.
> What's the complication with the workaround?


It's a bit more than just looking at registers; we actually have to look 
at the content of the buffer to figure out what landed in memory.

The register values are not synchronized with the memory writes...


There is a comment in the code (i915_perf_poll_locked) about not 
checking the register after each wakeup because that may be a very hot path.

The atomic64 sounded like a lesser evil.


>
>> Maybe a better way to do it would be to have 2 wait queues, one triggers
>> a workqueue for the oa_buffer_check after the interrupt, the other
>> notifies the existing poll_wq.
> Ok, I was expecting that both the hrtimer, interrupt and general signal
> handling (spurious wakeups) fed into the same mechanism to sample the
> ringbuffer and notify the client if ready. (And I presume that we sample
> from the client's process context anyway to save on the context switch.)


Correct, the client only deals with the output of oa_buffer_check().

The interrupt is just missing the oa_buffer_check() step, which maybe we 
can do with a 2-stage thing.

Either timer+buffer_check -> wakeup

Or interrupt -> workqueue+buffer_check -> wakeup


-

Lionel


> -Chris
>
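
(A sketch of the interrupt -> workqueue option being discussed, with
made-up names — the hard irq only schedules work, and the worker does
the buffer check in process context before notifying poll_wq;
buffer_check_work and oa_buffer_check() are hypothetical:)

static void oa_half_full_irq(struct drm_i915_private *i915)
{
	/* Cheap in hard-irq context: defer the expensive buffer check. */
	schedule_work(&i915->perf.oa.buffer_check_work);
}

static void oa_buffer_check_work_fn(struct work_struct *work)
{
	struct drm_i915_private *i915 =
		container_of(work, struct drm_i915_private,
			     perf.oa.buffer_check_work);

	/* oa_buffer_check() stands in for the existing head/tail check
	 * plus the memory-landing workaround. */
	if (oa_buffer_check(i915)) {
		i915->perf.oa.pollin = true;
		wake_up_all(&i915->perf.oa.poll_wq);
	}
}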
Chris Wilson Jan. 16, 2019, 4:31 p.m. UTC | #7
Quoting Lionel Landwerlin (2019-01-16 16:25:26)
> On 16/01/2019 16:05, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2019-01-16 15:58:00)
> >> On 16/01/2019 15:52, Chris Wilson wrote:
> >>> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
> >>>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
> >>>>                           wait_queue_head_t poll_wq;
> >>>>                           bool pollin;
> >>>>    
> >>>> +                       /**
> >>>> +                        * Atomic counter incremented by the interrupt
> >>>> +                        * handling code for each OA half full interrupt
> >>>> +                        * received.
> >>>> +                        */
> >>>> +                       atomic64_t half_full_count;
> >>>> +
> >>>> +                       /**
> >>>> +                        * Copy of the atomic half_full_count that was last
> >>>> +                        * processed in the i915-perf driver. If both counters
> >>>> +                        * differ, there is data available to read in the OA
> >>>> +                        * buffer.
> >>>> +                        */
> >>>> +                       u64 half_full_count_last;
> >>> Eh? But why a relatively expensive atomic64. You only need one bit, and
> >>> reading the tail pointer from WB memory should just be cheap. You should
> >>> be able to sample the perf ringbuffer pointers very cheaply... What am I
> >>> missing?
> >>> -Chris
> >>>
> >> Fair comment.
> >>
> >> The thing is with this series there are 2 mechanism that notify the poll_wq.
> >>
> >> One is the hrtimer that kicks in at regular interval and polls the
> >> register with the workaround.
> >>
> >> The other is the interrupt which doesn't read the registers and workaround.
> > What's the complication with the workaround?
> 
> 
> It's a bit more than just looking at registers, we actually have to look 
> at the content of the buffer to figure out what landed in memory.
> 
> The register values are not synchronized with the memory writes...

I don't want to look at registers at all for polling, and you shouldn't
need to since communication is via a ringbuf.
 
> There is a comment in the code (i915_perf_poll_locked) about not 
> checking the register after each wakeup because that may be a very hot path.
> 
> The atomic64 sounded like a lesser evil.

I'm clearly not understanding something here...

Does the hardware not do:
	update ringbuf data;
	wmb() (or post to global observation point in their parlance)
	update ringbuf tail

Then we just need to sample the ringbuf tail and compare against how far
we read last time?
-Chris
Lionel Landwerlin Jan. 16, 2019, 6:04 p.m. UTC | #8
On 16/01/2019 16:31, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-01-16 16:25:26)
>> On 16/01/2019 16:05, Chris Wilson wrote:
>>> Quoting Lionel Landwerlin (2019-01-16 15:58:00)
>>>> On 16/01/2019 15:52, Chris Wilson wrote:
>>>>> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
>>>>>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>>>>>>                            wait_queue_head_t poll_wq;
>>>>>>                            bool pollin;
>>>>>>     
>>>>>> +                       /**
>>>>>> +                        * Atomic counter incremented by the interrupt
>>>>>> +                        * handling code for each OA half full interrupt
>>>>>> +                        * received.
>>>>>> +                        */
>>>>>> +                       atomic64_t half_full_count;
>>>>>> +
>>>>>> +                       /**
>>>>>> +                        * Copy of the atomic half_full_count that was last
>>>>>> +                        * processed in the i915-perf driver. If both counters
>>>>>> +                        * differ, there is data available to read in the OA
>>>>>> +                        * buffer.
>>>>>> +                        */
>>>>>> +                       u64 half_full_count_last;
>>>>> Eh? But why a relatively expensive atomic64. You only need one bit, and
>>>>> reading the tail pointer from WB memory should just be cheap. You should
>>>>> be able to sample the perf ringbuffer pointers very cheaply... What am I
>>>>> missing?
>>>>> -Chris
>>>>>
>>>> Fair comment.
>>>>
>>>> The thing is with this series there are 2 mechanism that notify the poll_wq.
>>>>
>>>> One is the hrtimer that kicks in at regular interval and polls the
>>>> register with the workaround.
>>>>
>>>> The other is the interrupt which doesn't read the registers and workaround.
>>> What's the complication with the workaround?
>>
>> It's a bit more than just looking at registers, we actually have to look
>> at the content of the buffer to figure out what landed in memory.
>>
>> The register values are not synchronized with the memory writes...
> I don't want to look at registers at all for polling, and you shouldn't
> need to since communication is via a ringbuf.
>   
>> There is a comment in the code (i915_perf_poll_locked) about not
>> checking the register after each wakeup because that may be a very hot path.
>>
>> The atomic64 sounded like a lesser evil.
> I'm clearly not understanding something here...
>
> Does the hardware not do:
> 	update ringbuf data;
> 	wmb() (or post to global observation point in their parlance)
> 	update ringbuf tail


As far as I understand, the OA unit:

     sends its memory write requests to the memory controller

     immediately updates the ringbuf tail, without waiting for the
previous requests to complete



>
> Then we just need to sample the ringbuf tail and compare against how far
> we read last time?
> -Chris
>
Lionel Landwerlin Jan. 17, 2019, 11:43 a.m. UTC | #9
On 16/01/2019 18:04, Lionel Landwerlin wrote:
> On 16/01/2019 16:31, Chris Wilson wrote:
>> Quoting Lionel Landwerlin (2019-01-16 16:25:26)
>>> On 16/01/2019 16:05, Chris Wilson wrote:
>>>> Quoting Lionel Landwerlin (2019-01-16 15:58:00)
>>>>> On 16/01/2019 15:52, Chris Wilson wrote:
>>>>>> Quoting Lionel Landwerlin (2019-01-16 15:36:20)
>>>>>>> @@ -1877,6 +1883,21 @@ struct drm_i915_private {
>>>>>>>                            wait_queue_head_t poll_wq;
>>>>>>>                            bool pollin;
>>>>>>>     +                       /**
>>>>>>> +                        * Atomic counter incremented by the 
>>>>>>> interrupt
>>>>>>> +                        * handling code for each OA half full 
>>>>>>> interrupt
>>>>>>> +                        * received.
>>>>>>> +                        */
>>>>>>> +                       atomic64_t half_full_count;
>>>>>>> +
>>>>>>> +                       /**
>>>>>>> +                        * Copy of the atomic half_full_count 
>>>>>>> that was last
>>>>>>> +                        * processed in the i915-perf driver. If 
>>>>>>> both counters
>>>>>>> +                        * differ, there is data available to 
>>>>>>> read in the OA
>>>>>>> +                        * buffer.
>>>>>>> +                        */
>>>>>>> +                       u64 half_full_count_last;
>>>>>> Eh? But why a relatively expensive atomic64. You only need one 
>>>>>> bit, and
>>>>>> reading the tail pointer from WB memory should just be cheap. You 
>>>>>> should
>>>>>> be able to sample the perf ringbuffer pointers very cheaply... 
>>>>>> What am I
>>>>>> missing?
>>>>>> -Chris
>>>>>>
>>>>> Fair comment.
>>>>>
>>>>> The thing is with this series there are 2 mechanism that notify 
>>>>> the poll_wq.
>>>>>
>>>>> One is the hrtimer that kicks in at regular interval and polls the
>>>>> register with the workaround.
>>>>>
>>>>> The other is the interrupt which doesn't read the registers and 
>>>>> workaround.
>>>> What's the complication with the workaround?
>>>
>>> It's a bit more than just looking at registers, we actually have to 
>>> look
>>> at the content of the buffer to figure out what landed in memory.
>>>
>>> The register values are not synchronized with the memory writes...
>> I don't want to look at registers at all for polling, and you shouldn't
>> need to since communication is via a ringbuf.
>>> There is a comment in the code (i915_perf_poll_locked) about not
>>> checking the register after each wakeup because that may be a very 
>>> hot path.
>>>
>>> The atomic64 sounded like a lesser evil.
>> I'm clearly not understanding something here...
>>
>> Does the hardware not do:
>>     update ringbuf data;
>>     wmb() (or post to global observation point in their parlance)
>>     update ringbuf tail
>
>
> As far as I understand, the OA unit :
>
>     sends its memory write requests to the memory controller
>
>     immediately updates the ringbuf tail, without waiting for the 
> previous requests to complete
>

By experimentation, I haven't seen a delta between what is available 
in memory and what the OA tail register indicates larger than 768 bytes, 
which is about 3 OA reports at the largest size.
There is probably a maximum number of write requests the OA unit can 
queue before blocking.
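
So any notification path still has to bound what it trusts from the
tail register. A minimal sketch of such a margin check, assuming a
power-of-two buffer and taking the 768 bytes observed above as the
worst case (names are illustrative, not the actual i915 code):

#define OA_TAIL_MARGIN 768u

static u32 oa_safe_tail(u32 hw_tail, u32 head, u32 buf_size)
{
	/* buf_size must be a power of two for the masking below. */
	u32 available = (hw_tail - head) & (buf_size - 1);

	/* Up to OA_TAIL_MARGIN bytes behind the tail register may not
	 * have landed in memory yet; don't expose them to the reader. */
	if (available <= OA_TAIL_MARGIN)
		return head;

	return (head + available - OA_TAIL_MARGIN) & (buf_size - 1);
}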

So again maybe you would prefer a 2-stage mechanism:

OA interrupt -----> head/tail pointer worker  -----> wake up userspace

                     hrtimer head/tail pointer --|


>
>
>>
>> Then we just need to sample the ringbuf tail and compare against how far
>> we read last time?
>> -Chris
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d535df8f7d0a..850cf35e6163 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1385,6 +1385,12 @@  struct i915_perf_stream {
 	 * buffer should be checked for available data.
 	 */
 	u64 poll_oa_period;
+
+	/**
+	 * @oa_interrupt_monitor: Whether the stream will be notified by OA
+	 * interrupts.
+	 */
+	bool oa_interrupt_monitor;
 };
 
 /**
@@ -1877,6 +1883,21 @@  struct drm_i915_private {
 			wait_queue_head_t poll_wq;
 			bool pollin;
 
+			/**
+			 * Atomic counter incremented by the interrupt
+			 * handling code for each OA half full interrupt
+			 * received.
+			 */
+			atomic64_t half_full_count;
+
+			/**
+			 * Copy of the atomic half_full_count that was last
+			 * processed in the i915-perf driver. If both counters
+			 * differ, there is data available to read in the OA
+			 * buffer.
+			 */
+			u64 half_full_count_last;
+
 			/**
 			 * For rate limiting any notifications of spurious
 			 * invalid OA reports
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 94187e68d39a..7996048565a7 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1214,6 +1214,12 @@  static void notify_ring(struct intel_engine_cs *engine)
 	trace_intel_engine_notify(engine, wait);
 }
 
+static void notify_perfmon_buffer_half_full(struct drm_i915_private *i915)
+{
+	atomic64_inc(&i915->perf.oa.half_full_count);
+	wake_up_all(&i915->perf.oa.poll_wq);
+}
+
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
 			struct intel_rps_ei *ei)
 {
@@ -1478,6 +1484,9 @@  static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
 		DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir);
 
+	if (gt_iir & GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT)
+		notify_perfmon_buffer_half_full(dev_priv);
+
 	if (gt_iir & GT_PARITY_ERROR(dev_priv))
 		ivybridge_parity_error_irq_handler(dev_priv, gt_iir);
 }
@@ -1499,6 +1508,12 @@  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 		tasklet_hi_schedule(&engine->execlists.tasklet);
 }
 
+static void gen8_perfmon_handler(struct drm_i915_private *i915, u32 iir)
+{
+	if (iir & GEN8_GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT)
+		notify_perfmon_buffer_half_full(i915);
+}
+
 static void gen8_gt_irq_ack(struct drm_i915_private *i915,
 			    u32 master_ctl, u32 gt_iir[4])
 {
@@ -1508,6 +1523,7 @@  static void gen8_gt_irq_ack(struct drm_i915_private *i915,
 		      GEN8_GT_BCS_IRQ | \
 		      GEN8_GT_VCS1_IRQ | \
 		      GEN8_GT_VCS2_IRQ | \
+		      GEN8_GT_WDBOX_OACS_IRQ | \
 		      GEN8_GT_VECS_IRQ | \
 		      GEN8_GT_PM_IRQ | \
 		      GEN8_GT_GUC_IRQ)
@@ -1530,7 +1546,7 @@  static void gen8_gt_irq_ack(struct drm_i915_private *i915,
 			raw_reg_write(regs, GEN8_GT_IIR(2), gt_iir[2]);
 	}
 
-	if (master_ctl & GEN8_GT_VECS_IRQ) {
+	if (master_ctl & (GEN8_GT_VECS_IRQ | GEN8_GT_WDBOX_OACS_IRQ)) {
 		gt_iir[3] = raw_reg_read(regs, GEN8_GT_IIR(3));
 		if (likely(gt_iir[3]))
 			raw_reg_write(regs, GEN8_GT_IIR(3), gt_iir[3]);
@@ -1554,9 +1570,11 @@  static void gen8_gt_irq_handler(struct drm_i915_private *i915,
 				    gt_iir[1] >> GEN8_VCS2_IRQ_SHIFT);
 	}
 
-	if (master_ctl & GEN8_GT_VECS_IRQ) {
+	if (master_ctl & (GEN8_GT_VECS_IRQ | GEN8_GT_WDBOX_OACS_IRQ)) {
 		gen8_cs_irq_handler(i915->engine[VECS],
 				    gt_iir[3] >> GEN8_VECS_IRQ_SHIFT);
+		gen8_perfmon_handler(i915,
+				     gt_iir[3] >> GEN8_WD_IRQ_SHIFT);
 	}
 
 	if (master_ctl & (GEN8_GT_PM_IRQ | GEN8_GT_GUC_IRQ)) {
@@ -3010,6 +3028,8 @@  gen11_other_irq_handler(struct drm_i915_private * const i915,
 {
 	if (instance == OTHER_GTPM_INSTANCE)
 		return gen6_rps_irq_handler(i915, iir);
+	if (instance == OTHER_WDOAPERF_INSTANCE)
+		return gen8_perfmon_handler(i915, iir);
 
 	WARN_ONCE(1, "unhandled other interrupt instance=0x%x, iir=0x%x\n",
 		  instance, iir);
@@ -4041,6 +4061,10 @@  static void gen5_gt_irq_postinstall(struct drm_device *dev)
 		gt_irqs |= GT_BLT_USER_INTERRUPT | GT_BSD_USER_INTERRUPT;
 	}
 
+	/* We only expose the i915/perf interface on HSW+. */
+	if (IS_HASWELL(dev_priv))
+		gt_irqs |= GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT;
+
 	GEN3_IRQ_INIT(GT, dev_priv->gt_irq_mask, gt_irqs);
 
 	if (INTEL_GEN(dev_priv) >= 6) {
@@ -4170,7 +4194,8 @@  static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT,
 		0,
 		GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
-			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT
+			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
+			GEN8_GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT << GEN8_WD_IRQ_SHIFT
 		};
 
 	dev_priv->pm_ier = 0x0;
@@ -4289,12 +4314,12 @@  static void gen11_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 
 	/*
 	 * RPS interrupts will get enabled/disabled on demand when RPS itself
-	 * is enabled/disabled.
+	 * is enabled/disabled, just enable the OA interrupt for now.
 	 */
-	dev_priv->pm_ier = 0x0;
+	dev_priv->pm_ier = GEN8_GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT;
 	dev_priv->pm_imr = ~dev_priv->pm_ier;
-	I915_WRITE(GEN11_GPM_WGBOXPERF_INTR_ENABLE, 0);
-	I915_WRITE(GEN11_GPM_WGBOXPERF_INTR_MASK,  ~0);
+	I915_WRITE(GEN11_GPM_WGBOXPERF_INTR_ENABLE, dev_priv->pm_ier);
+	I915_WRITE(GEN11_GPM_WGBOXPERF_INTR_MASK,  dev_priv->pm_imr);
 }
 
 static void icp_irq_postinstall(struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 9ad45ad31b43..82c4e0c08859 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -337,6 +337,7 @@  static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
  * @oa_period_exponent: The OA unit sampling period is derived from this
  * @poll_oa_period: The period at which the CPU will check for OA data
  * availability
+ * @oa_interrupt_monitor: Whether we should monitor the OA interrupt.
  *
  * As read_properties_unlocked() enumerates and validates the properties given
  * to open a stream of metrics the configuration is built up in the structure
@@ -354,6 +355,7 @@  struct perf_open_properties {
 	bool oa_periodic;
 	int oa_period_exponent;
 	u64 poll_oa_period;
+	bool oa_interrupt_monitor;
 };
 
 static void free_oa_config(struct drm_i915_private *dev_priv,
@@ -1844,6 +1846,13 @@  static void gen7_oa_enable(struct i915_perf_stream *stream)
 	 */
 	gen7_init_oa_buffer(dev_priv);
 
+	if (stream->oa_interrupt_monitor) {
+		spin_lock_irq(&dev_priv->irq_lock);
+		gen5_enable_gt_irq(dev_priv,
+				   GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT);
+		spin_unlock_irq(&dev_priv->irq_lock);
+	}
+
 	I915_WRITE(GEN7_OACONTROL,
 		   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
 		   (period_exponent <<
@@ -1870,6 +1879,9 @@  static void gen8_oa_enable(struct i915_perf_stream *stream)
 	 */
 	gen8_init_oa_buffer(dev_priv);
 
+	if (stream->oa_interrupt_monitor)
+		I915_WRITE(GEN8_OA_IMR, ~GEN8_OA_IMR_MASK_INTR);
+
 	/*
 	 * Note: we don't rely on the hardware to perform single context
 	 * filtering and instead filter on the cpu based on the context-id
@@ -1895,6 +1907,10 @@  static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 
 	dev_priv->perf.oa.pollin = false;
 
+	dev_priv->perf.oa.half_full_count_last = 0;
+	atomic64_set(&dev_priv->perf.oa.half_full_count,
+		     dev_priv->perf.oa.half_full_count_last);
+
 	dev_priv->perf.oa.ops.oa_enable(stream);
 
 	if (dev_priv->perf.oa.periodic && stream->poll_oa_period)
@@ -1907,6 +1923,13 @@  static void gen7_oa_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
+	if (stream->oa_interrupt_monitor) {
+		spin_lock_irq(&dev_priv->irq_lock);
+		gen5_disable_gt_irq(dev_priv,
+				    GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT);
+		spin_unlock_irq(&dev_priv->irq_lock);
+	}
+
 	I915_WRITE(GEN7_OACONTROL, 0);
 	if (intel_wait_for_register(dev_priv,
 				    GEN7_OACONTROL, GEN7_OACONTROL_ENABLE, 0,
@@ -1918,6 +1941,8 @@  static void gen8_oa_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
+	I915_WRITE(GEN8_OA_IMR, 0xffffffff);
+
 	I915_WRITE(GEN8_OACONTROL, 0);
 	if (intel_wait_for_register(dev_priv,
 				    GEN8_OACONTROL, GEN8_OA_COUNTER_ENABLE, 0,
@@ -2593,6 +2618,7 @@  i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
 	stream->dev_priv = dev_priv;
 	stream->ctx = specific_ctx;
 	stream->poll_oa_period = props->poll_oa_period;
+	stream->oa_interrupt_monitor = props->oa_interrupt_monitor;
 
 	ret = i915_oa_stream_init(stream, param, props);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index fad5a9e8b44d..02f1d3bb85ab 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -229,6 +229,7 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define MAX_ENGINE_CLASS	4
 
 #define OTHER_GTPM_INSTANCE	1
+#define OTHER_WDOAPERF_INSTANCE	2
 #define MAX_ENGINE_INSTANCE    3
 
 /* PCI config space */
@@ -641,6 +642,9 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OABUFFER_SIZE_8M    (6 << 3)
 #define OABUFFER_SIZE_16M   (7 << 3)
 
+#define GEN8_OA_IMR _MMIO(0x2b20)
+#define  GEN8_OA_IMR_MASK_INTR (1 << 28)
+
 /*
  * Flexible, Aggregate EU Counter Registers.
  * Note: these aren't contiguous
@@ -2892,7 +2896,9 @@  enum i915_power_well_id {
 #define GT_BLT_USER_INTERRUPT			(1 << 22)
 #define GT_BSD_CS_ERROR_INTERRUPT		(1 << 15)
 #define GT_BSD_USER_INTERRUPT			(1 << 12)
+#define GEN8_GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT (1 << 12) /* bdw+ */
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
+#define GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT   (1 <<  9) /* ivb+ but only used on hsw+ */
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
@@ -7184,6 +7190,7 @@  enum {
 #define  GEN8_DE_PIPE_B_IRQ		(1 << 17)
 #define  GEN8_DE_PIPE_A_IRQ		(1 << 16)
 #define  GEN8_DE_PIPE_IRQ(pipe)		(1 << (16 + (pipe)))
+#define  GEN8_GT_WDBOX_OACS_IRQ         (1 << 7)
 #define  GEN8_GT_VECS_IRQ		(1 << 6)
 #define  GEN8_GT_GUC_IRQ		(1 << 5)
 #define  GEN8_GT_PM_IRQ			(1 << 4)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 26b7274a2d43..a208fb2382ad 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2222,6 +2222,8 @@  int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
 
 	if (HAS_L3_DPF(dev_priv))
 		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
+	if (IS_HASWELL(dev_priv))
+		engine->irq_keep_mask |= GT_PERFMON_BUFFER_HALF_FULL_INTERRUPT;
 
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;