diff mbox

[v7,06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

Message ID 20161024231934.2243-7-robert@sixbynine.org (mailing list archive)
State New, archived
Headers show

Commit Message

Robert Bragg Oct. 24, 2016, 11:19 p.m. UTC
Gen graphics hardware can be set up to periodically write snapshots of
performance counters into a circular buffer via its Observation
Architecture and this patch exposes that capability to userspace via the
i915 perf interface.

v2:
   Make sure to initialize ->specific_ctx_id when opening, without
   relying on _pin_notify hook, in case ctx already pinned.
v3:
   Revert back to pinning ctx upfront when opening stream, removing
   need to hook in to pinning and to update OACONTROL on the fly.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Robert Bragg <robert@sixbynine.org>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>

fix enable hsw
---
 drivers/gpu/drm/i915/i915_drv.h  |   65 ++-
 drivers/gpu/drm/i915/i915_perf.c | 1000 +++++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_reg.h  |  338 +++++++++++++
 include/uapi/drm/i915_drm.h      |   70 ++-
 4 files changed, 1444 insertions(+), 29 deletions(-)

Comments

Matthew Auld Oct. 25, 2016, 9:35 p.m. UTC | #1
On 25 October 2016 at 00:19, Robert Bragg <robert@sixbynine.org> wrote:
> Gen graphics hardware can be set up to periodically write snapshots of
> performance counters into a circular buffer via its Observation
> Architecture and this patch exposes that capability to userspace via the
> i915 perf interface.
>
> v2:
>    Make sure to initialize ->specific_ctx_id when opening, without
>    relying on _pin_notify hook, in case ctx already pinned.
> v3:
>    Revert back to pinning ctx upfront when opening stream, removing
>    need to hook in to pinning and to update OACONTROL on the fly.
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Robert Bragg <robert@sixbynine.org>
> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
>
> fix enable hsw
Random bit of cruft?

> ---
>  drivers/gpu/drm/i915/i915_drv.h  |   65 ++-
>  drivers/gpu/drm/i915/i915_perf.c | 1000 +++++++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_reg.h  |  338 +++++++++++++
>  include/uapi/drm/i915_drm.h      |   70 ++-
>  4 files changed, 1444 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3448d05..ea24814 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1764,6 +1764,11 @@ struct intel_wm_config {
>         bool sprites_scaled;
>  };
>
> +struct i915_oa_format {
> +       u32 format;
> +       int size;
> +};
> +
>  struct i915_oa_reg {
>         i915_reg_t addr;
>         u32 value;
> @@ -1784,11 +1789,6 @@ struct i915_perf_stream_ops {
>          */
>         void (*disable)(struct i915_perf_stream *stream);
>
> -       /* Return: true if any i915 perf records are ready to read()
> -        * for this stream.
> -        */
> -       bool (*can_read)(struct i915_perf_stream *stream);
> -
>         /* Call poll_wait, passing a wait queue that will be woken
>          * once there is something ready to read() for the stream
>          */
> @@ -1798,9 +1798,7 @@ struct i915_perf_stream_ops {
>
>         /* For handling a blocking read, wait until there is something
>          * to ready to read() for the stream. E.g. wait on the same
> -        * wait queue that would be passed to poll_wait() until
> -        * ->can_read() returns true (if its safe to call ->can_read()
> -        * without the i915 perf lock held).
> +        * wait queue that would be passed to poll_wait().
>          */
>         int (*wait_unlocked)(struct i915_perf_stream *stream);
>
> @@ -1840,11 +1838,28 @@ struct i915_perf_stream {
>         struct list_head link;
>
>         u32 sample_flags;
> +       int sample_size;
>
>         struct i915_gem_context *ctx;
>         bool enabled;
>
> -       struct i915_perf_stream_ops *ops;
> +       const struct i915_perf_stream_ops *ops;
> +};
> +
> +struct i915_oa_ops {
> +       void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
> +       int (*enable_metric_set)(struct drm_i915_private *dev_priv);
> +       void (*disable_metric_set)(struct drm_i915_private *dev_priv);
> +       void (*oa_enable)(struct drm_i915_private *dev_priv);
> +       void (*oa_disable)(struct drm_i915_private *dev_priv);
> +       void (*update_oacontrol)(struct drm_i915_private *dev_priv);
> +       void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
> +                                       u32 ctx_id);
> +       int (*read)(struct i915_perf_stream *stream,
> +                   char __user *buf,
> +                   size_t count,
> +                   size_t *offset);
> +       bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
>  };
>
>  struct drm_i915_private {
> @@ -2149,16 +2164,46 @@ struct drm_i915_private {
>
>         struct {
>                 bool initialized;
> +
>                 struct mutex lock;
>                 struct list_head streams;
>
> +               spinlock_t hook_lock;
> +
>                 struct {
> -                       u32 metrics_set;
> +                       struct i915_perf_stream *exclusive_stream;
> +
> +                       u32 specific_ctx_id;
Can we just get rid of this? Now that the vma remains pinned, we can
simply get the GGTT address at the time of configuring the OACONTROL
register.

> +
> +                       struct hrtimer poll_check_timer;
> +                       wait_queue_head_t poll_wq;
> +                       atomic_t pollin;
> +
> +                       bool periodic;
> +                       int period_exponent;
> +                       int timestamp_frequency;
> +
> +                       int tail_margin;
> +
> +                       int metrics_set;
>
>                         const struct i915_oa_reg *mux_regs;
>                         int mux_regs_len;
>                         const struct i915_oa_reg *b_counter_regs;
>                         int b_counter_regs_len;
> +
> +                       struct {
> +                               struct i915_vma *vma;
> +                               u8 *vaddr;
> +                               int format;
> +                               int format_size;
> +                       } oa_buffer;
> +
> +                       u32 gen7_latched_oastatus1;
> +
> +                       struct i915_oa_ops ops;
> +                       const struct i915_oa_format *oa_formats;
> +                       int n_builtin_sets;
>                 } oa;
>         } perf;
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 4d51586..d7a4899 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -25,16 +25,867 @@
>   */
>
>  #include <linux/anon_inodes.h>
> +#include <linux/sizes.h>
>
>  #include "i915_drv.h"
> +#include "i915_oa_hsw.h"
> +
> +/* HW requires this to be a power of two, between 128k and 16M, though driver
> + * is currently generally designed assuming the largest 16M size is used such
> + * that the overflow cases are unlikely in normal operation.
> + */
> +#define OA_BUFFER_SIZE         SZ_16M
> +
> +#define OA_TAKEN(tail, head)   ((tail - head) & (OA_BUFFER_SIZE - 1))
> +
> +/* There's a HW race condition between OA unit tail pointer register updates and
> + * writes to memory whereby the tail pointer can sometimes get ahead of what's
> + * been written out to the OA buffer so far.
> + *
> + * Although this can be observed explicitly by checking for a zeroed report-id
> + * field in tail reports, it seems preferable to account for this earlier e.g.
> + * as part of the _oa_buffer_is_empty checks to minimize -EAGAIN polling cycles
> + * in this situation.
> + *
> + * To give time for the most recent reports to land before they may be copied to
> + * userspace, the driver operates as if the tail pointer effectively lags behind
> + * the HW tail pointer by 'tail_margin' bytes. The margin in bytes is calculated
> + * based on this constant in nanoseconds, the current OA sampling exponent
> + * and current report size.
> + *
> + * There is also a fallback check while reading to simply skip over reports with
> + * a zeroed report-id.
> + */
> +#define OA_TAIL_MARGIN_NSEC    100000ULL
> +
> +/* frequency for checking whether the OA unit has written new reports to the
> + * circular OA buffer...
> + */
> +#define POLL_FREQUENCY 200
> +#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
> +
> +/* The maximum exponent the hardware accepts is 63 (essentially it selects one
> + * of the 64bit timestamp bits to trigger reports from) but there's currently
> + * no known use case for sampling as infrequently as once per 47 thousand years.
> + *
> + * Since the timestamps included in OA reports are only 32bits it seems
> + * reasonable to limit the OA exponent where it's still possible to account for
> + * overflow in OA report timestamps.
> + */
> +#define OA_EXPONENT_MAX 31
> +
> +#define INVALID_CTX_ID 0xffffffff
We shouldn't need this anymore.

> +
> +
> +/* XXX: beware if future OA HW adds new report formats that the current
> + * code assumes all reports have a power-of-two size and ~(size - 1) can
> + * be used as a mask to align the OA tail pointer.
> + */
> +static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
> +       [I915_OA_FORMAT_A13]        = { 0, 64 },
> +       [I915_OA_FORMAT_A29]        = { 1, 128 },
> +       [I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
> +       /* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
> +       [I915_OA_FORMAT_B4_C8]      = { 4, 64 },
> +       [I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
> +       [I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
> +       [I915_OA_FORMAT_C4_B8]      = { 7, 64 },
> +};
> +
> +#define SAMPLE_OA_REPORT      (1<<0)
>
>  struct perf_open_properties {
>         u32 sample_flags;
>
>         u64 single_context:1;
>         u64 ctx_handle;
> +
> +       /* OA sampling state */
> +       int metrics_set;
> +       int oa_format;
> +       bool oa_periodic;
> +       int oa_period_exponent;
>  };
>
> +/* NB: This is either called via fops or the poll check hrtimer (atomic ctx)
> + *
> + * It's safe to read OA config state here unlocked, assuming that this is only
> + * called while the stream is enabled, while the global OA configuration can't
> + * be modified.
> + *
> + * Note: we don't lock around the head/tail reads even though there's the slim
> + * possibility of read() fop errors forcing a re-init of the OA buffer
> + * pointers.  A race here could result in a false positive !empty status which
> + * is acceptable.
> + */
> +static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
> +{
> +       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
> +       u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
> +       u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
> +       u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
> +       u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> +
> +       return OA_TAKEN(tail, head) <
> +               dev_priv->perf.oa.tail_margin + report_size;
> +}
> +
> +/**
> + * Appends a status record to a userspace read() buffer.
> + */
> +static int append_oa_status(struct i915_perf_stream *stream,
> +                           char __user *buf,
> +                           size_t count,
> +                           size_t *offset,
> +                           enum drm_i915_perf_record_type type)
> +{
> +       struct drm_i915_perf_record_header header = { type, 0, sizeof(header) };
> +
> +       if ((count - *offset) < header.size)
> +               return -ENOSPC;
> +
> +       if (copy_to_user(buf + *offset, &header, sizeof(header)))
> +               return -EFAULT;
> +
> +       (*offset) += header.size;
> +
> +       return 0;
> +}
> +
> +/**
> + * Copies single OA report into userspace read() buffer.
> + */
> +static int append_oa_sample(struct i915_perf_stream *stream,
> +                           char __user *buf,
> +                           size_t count,
> +                           size_t *offset,
> +                           const u8 *report)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
> +       struct drm_i915_perf_record_header header;
> +       u32 sample_flags = stream->sample_flags;
> +
> +       header.type = DRM_I915_PERF_RECORD_SAMPLE;
> +       header.pad = 0;
> +       header.size = stream->sample_size;
> +
> +       if ((count - *offset) < header.size)
> +               return -ENOSPC;
> +
> +       buf += *offset;
> +       if (copy_to_user(buf, &header, sizeof(header)))
> +               return -EFAULT;
> +       buf += sizeof(header);
> +
> +       if (sample_flags & SAMPLE_OA_REPORT) {
> +               if (copy_to_user(buf, report, report_size))
> +                       return -EFAULT;
> +       }
> +
> +       (*offset) += header.size;
> +
> +       return 0;
> +}
> +
> +/**
> + * Copies all buffered OA reports into userspace read() buffer.
> + * @head_ptr: (inout): the head pointer before and after appending
> + *
> + * Returns 0 on success, negative error code on failure.
> + *
> + * Notably any error condition resulting in a short read (-ENOSPC or
> + * -EFAULT) will be returned even though one or more records may
> + * have been successfully copied. In this case it's up to the caller
> + * to decide if the error should be squashed before returning to
> + * userspace.
> + */
This kernel-doc comment could do with a spring clean: it is missing the
"gen7_append_oa_reports - ..." name line and only documents @head_ptr.

> +static int gen7_append_oa_reports(struct i915_perf_stream *stream,
> +                                 char __user *buf,
> +                                 size_t count,
> +                                 size_t *offset,
> +                                 u32 *head_ptr,
> +                                 u32 tail)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
> +       u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
> +       int tail_margin = dev_priv->perf.oa.tail_margin;
> +       u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
> +       u32 mask = (OA_BUFFER_SIZE - 1);
> +       u32 head;
> +       u32 taken;
> +       int ret = 0;
> +
> +       BUG_ON(!stream->enabled);
> +
> +       head = *head_ptr - gtt_offset;
> +       tail -= gtt_offset;
> +
> +       /* The OA unit is expected to wrap the tail pointer according to the OA
> +        * buffer size and since we should never write a misaligned head
> +        * pointer we don't expect to read one back either...
> +        */
> +       if (tail > OA_BUFFER_SIZE || head > OA_BUFFER_SIZE ||
> +           head % report_size) {
> +               DRM_ERROR("Inconsistent OA buffer pointer (head = %u, tail = %u): force restart",
> +                         head, tail);
> +               dev_priv->perf.oa.ops.oa_disable(dev_priv);
> +               dev_priv->perf.oa.ops.oa_enable(dev_priv);
> +               *head_ptr = I915_READ(GEN7_OASTATUS2) &
> +                       GEN7_OASTATUS2_HEAD_MASK;
> +               return -EIO;
> +       }
> +
> +
> +       /* The tail pointer increases in 64 byte increments, not in report_size
> +        * steps...
> +        */
> +       tail &= ~(report_size - 1);
> +
> +       /* Move the tail pointer back by the current tail_margin to account for
> +        * the possibility that the latest reports may not have really landed
> +        * in memory yet...
> +        */
> +
> +       if (OA_TAKEN(tail, head) < report_size + tail_margin)
> +               return -EAGAIN;
> +
> +       tail -= tail_margin;
> +       tail &= mask;
> +
> +       for (/* none */;
> +            (taken = OA_TAKEN(tail, head));
> +            head = (head + report_size) & mask) {
> +               u8 *report = oa_buf_base + head;
> +               u32 *report32 = (void *)report;
> +
> +               /* All the report sizes factor neatly into the buffer
> +                * size so we never expect to see a report split
> +                * between the beginning and end of the buffer.
> +                *
> +                * Given the initial alignment check a misalignment
> +                * here would imply a driver bug that would result
> +                * in an overrun.
> +                */
> +               BUG_ON((OA_BUFFER_SIZE - head) < report_size);
> +
> +               /* The report-ID field for periodic samples includes
> +                * some undocumented flags related to what triggered
> +                * the report and is never expected to be zero so we
> +                * can check that the report isn't invalid before
> +                * copying it to userspace...
> +                */
> +               if (report32[0] == 0) {
> +                       DRM_ERROR("Skipping spurious, invalid OA report\n");
> +                       continue;
> +               }
> +
> +               ret = append_oa_sample(stream, buf, count, offset, report);
> +               if (ret)
> +                       break;
> +
> +               /* The above report-id field sanity check is based on
> +                * the assumption that the OA buffer is initially
> +                * zeroed and we reset the field after copying so the
> +                * check is still meaningful once old reports start
> +                * being overwritten.
> +                */
> +               report32[0] = 0;
> +       }
> +
> +       *head_ptr = gtt_offset + head;
> +
> +       return ret;
> +}
> +
> +static int gen7_oa_read(struct i915_perf_stream *stream,
> +                       char __user *buf,
> +                       size_t count,
> +                       size_t *offset)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
> +       u32 oastatus2;
> +       u32 oastatus1;
> +       u32 head;
> +       u32 tail;
> +       int ret;
> +
> +       BUG_ON(!dev_priv->perf.oa.oa_buffer.vaddr);
> +
> +       oastatus2 = I915_READ(GEN7_OASTATUS2);
> +       oastatus1 = I915_READ(GEN7_OASTATUS1);
> +
> +       head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
> +       tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> +
> +       /* XXX: On Haswell we don't have a safe way to clear oastatus1
> +        * bits while the OA unit is enabled (while the tail pointer
> +        * may be updated asynchronously) so we ignore status bits
> +        * that have already been reported to userspace.
> +        */
> +       oastatus1 &= ~dev_priv->perf.oa.gen7_latched_oastatus1;
> +
> +       /* We treat OABUFFER_OVERFLOW as a significant error:
> +        *
> +        * - The status can be interpreted to mean that the buffer is
> +        *   currently full (with a higher precedence than OA_TAKEN()
> +        *   which will start to report a near-empty buffer after an
> +        *   overflow) but it's awkward that we can't clear the status
> +        *   on Haswell, so without a reset we won't be able to catch
> +        *   the state again.
> +        *
> +        * - Since it also implies the HW has started overwriting old
> +        *   reports it may also affect our sanity checks for invalid
> +        *   reports when copying to userspace that assume new reports
> +        *   are being written to cleared memory.
> +        *
> +        * - In the future we may want to introduce a flight recorder
> +        *   mode where the driver will automatically maintain a safe
> +        *   guard band between head/tail, avoiding this overflow
> +        *   condition, but we avoid the added driver complexity for
> +        *   now.
> +        */
> +       if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
> +               ret = append_oa_status(stream, buf, count, offset,
> +                                      DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
> +               if (ret)
> +                       return ret;
> +
> +               DRM_ERROR("OA buffer overflow: force restart");
> +
> +               dev_priv->perf.oa.ops.oa_disable(dev_priv);
> +               dev_priv->perf.oa.ops.oa_enable(dev_priv);
> +
> +               oastatus2 = I915_READ(GEN7_OASTATUS2);
> +               oastatus1 = I915_READ(GEN7_OASTATUS1);
> +
> +               head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
> +               tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> +       }
> +
> +       if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
> +               ret = append_oa_status(stream, buf, count, offset,
> +                                      DRM_I915_PERF_RECORD_OA_REPORT_LOST);
> +               if (ret)
> +                       return ret;
> +               dev_priv->perf.oa.gen7_latched_oastatus1 |=
> +                       GEN7_OASTATUS1_REPORT_LOST;
> +       }
> +
> +       ret = gen7_append_oa_reports(stream, buf, count, offset,
> +                                    &head, tail);
> +
> +       /* All the report sizes are a power of two and the
> +        * head should always be incremented by some multiple
> +        * of the report size.
> +        *
> +        * A warning here, but notably if we later read back a
> +        * misaligned pointer we will treat that as a bug since
> +        * it could lead to a buffer overrun.
> +        */
> +       WARN_ONCE(head & (report_size - 1),
> +                 "i915: Writing misaligned OA head pointer");
> +
> +       /* Note: we update the head pointer here even if an error
> +        * was returned since the error may represent a short read
> +        * where some reports were successfully copied.
> +        */
> +       I915_WRITE(GEN7_OASTATUS2,
> +                  ((head & GEN7_OASTATUS2_HEAD_MASK) |
> +                   OA_MEM_SELECT_GGTT));
> +
> +       return ret;
> +}
> +
> +static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       /* We would wait indefinitely if periodic sampling is not enabled */
> +       if (!dev_priv->perf.oa.periodic)
> +               return -EIO;
> +
> +       /* Note: the oa_buffer_is_empty() condition is ok to run unlocked as it
> +        * just performs mmio reads of the OA buffer head + tail pointers and
> +        * it's assumed we're handling some operation that implies the stream
> +        * can't be destroyed until completion (such as a read()) that ensures
> +        * the device + OA buffer can't disappear
> +        */
> +       return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
> +                                       !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv));
> +}
> +
> +static void i915_oa_poll_wait(struct i915_perf_stream *stream,
> +                             struct file *file,
> +                             poll_table *wait)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
> +}
> +
> +static int i915_oa_read(struct i915_perf_stream *stream,
> +                       char __user *buf,
> +                       size_t count,
> +                       size_t *offset)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
> +}
> +
> +static int claim_specific_ctx(struct i915_perf_stream *stream)
> +{
How about pin_oa_specific_ctx, or something? Also, would it not make more
sense to operate on the context, not the stream?

> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       struct i915_vma *vma;
> +       int ret;
> +
> +       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> +       if (ret)
> +               return ret;
> +
> +       /* So that we don't have to worry about updating the context ID
> +        * in OACONTROL on the fly we make sure to pin the context
> +        * upfront for the lifetime of the stream...
> +        */
> +       vma = stream->ctx->engine[RCS].state;
> +       ret = i915_vma_pin(vma, 0, stream->ctx->ggtt_alignment,
> +                          PIN_GLOBAL | PIN_HIGH);
> +       if (ret)
> +               return ret;
> +
> +       dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
> +
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
> +
> +       return 0;
> +}
> +
> +static void release_specific_ctx(struct i915_perf_stream *stream)
Likewise here, unpin_oa_specific_ctx. Just a thought though so feel
free to ignore.

> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +       i915_vma_unpin(stream->ctx->engine[RCS].state);
> +       dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
> +
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
> +}
> +
> +static void
> +free_oa_buffer(struct drm_i915_private *i915)
> +{
> +       mutex_lock(&i915->drm.struct_mutex);
> +
> +       i915_gem_object_unpin_map(i915->perf.oa.oa_buffer.vma->obj);
> +       i915_vma_unpin(i915->perf.oa.oa_buffer.vma);
> +       i915_gem_object_put(i915->perf.oa.oa_buffer.vma->obj);
> +
> +       i915->perf.oa.oa_buffer.vma = NULL;
> +       i915->perf.oa.oa_buffer.vaddr = NULL;
> +
> +       mutex_unlock(&i915->drm.struct_mutex);
> +}
> +
> +static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
> +
> +       dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
> +
> +       free_oa_buffer(dev_priv);
> +
> +       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +       intel_runtime_pm_put(dev_priv);
> +
> +       if (stream->ctx)
> +               release_specific_ctx(stream);
> +
> +       dev_priv->perf.oa.exclusive_stream = NULL;
> +}
> +
> +static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> +       u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
> +
> +       /* Pre-DevBDW: OABUFFER must be set with counters off,
> +        * before OASTATUS1, but after OASTATUS2
> +        */
> +       I915_WRITE(GEN7_OASTATUS2, gtt_offset | OA_MEM_SELECT_GGTT); /* head */
> +       I915_WRITE(GEN7_OABUFFER, gtt_offset);
> +       I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
> +
> +       /* On Haswell we have to track which OASTATUS1 flags we've
> +        * already seen since they can't be cleared while periodic
> +        * sampling is enabled.
> +        */
> +       dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
> +
> +       /* NB: although the OA buffer will initially be allocated
> +        * zeroed via shmfs (and so this memset is redundant when
> +        * first allocating), we may re-init the OA buffer, either
> +        * when re-enabling a stream or in error/reset paths.
> +        *
> +        * The reason we clear the buffer for each re-init is for the
> +        * sanity check in gen7_append_oa_reports() that looks at the
> +        * report-id field to make sure it's non-zero which relies on
> +        * the assumption that new reports are being written to zeroed
> +        * memory...
> +        */
> +       memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
> +
> +       /* Maybe make ->pollin per-stream state if we support multiple
> +        * concurrent streams in the future. */
> +       atomic_set(&dev_priv->perf.oa.pollin, false);
> +}
> +
> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> +       struct drm_i915_gem_object *bo;
> +       struct i915_vma *vma;
> +       int ret;
> +
> +       BUG_ON(dev_priv->perf.oa.oa_buffer.vma);
> +
> +       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> +       if (ret)
> +               return ret;
> +
> +       BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
> +       BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
> +
> +       bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> +       if (IS_ERR(bo)) {
> +               DRM_ERROR("Failed to allocate OA buffer\n");
> +               ret = PTR_ERR(bo);
> +               goto unlock;
> +       }
> +
> +       ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> +       if (ret)
> +               goto err_unref;
> +
> +       /* PreHSW required 512K alignment, HSW requires 16M */
> +       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
> +       if (IS_ERR(vma)) {
> +               ret = PTR_ERR(vma);
> +               goto err_unref;
> +       }
> +       dev_priv->perf.oa.oa_buffer.vma = vma;
> +
> +       dev_priv->perf.oa.oa_buffer.vaddr =
> +               i915_gem_object_pin_map(bo, I915_MAP_WB);
> +       if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
> +               ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
> +               goto err_unpin;
> +       }
> +
> +       dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
> +
> +       DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> +                        i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
> +                        dev_priv->perf.oa.oa_buffer.vaddr);
> +
> +       goto unlock;
> +
> +err_unpin:
> +       __i915_vma_unpin(vma);
> +
> +err_unref:
> +       i915_gem_object_put(bo);
> +
> +       dev_priv->perf.oa.oa_buffer.vaddr = NULL;
> +       dev_priv->perf.oa.oa_buffer.vma = NULL;
> +
> +unlock:
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
> +       return ret;
> +}
> +
> +static void config_oa_regs(struct drm_i915_private *dev_priv,
> +                          const struct i915_oa_reg *regs,
> +                          int n_regs)
> +{
> +       int i;
> +
> +       for (i = 0; i < n_regs; i++) {
> +               const struct i915_oa_reg *reg = regs + i;
> +
> +               I915_WRITE(reg->addr, reg->value);
> +       }
> +}
> +
> +static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
> +{
> +       int ret = i915_oa_select_metric_set_hsw(dev_priv);
> +
> +       if (ret)
> +               return ret;
> +
> +       I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) |
> +                                     GT_NOA_ENABLE));
> +
> +       /* PRM:
> +        *
> +        * OA unit is using “crclk” for its functionality. When trunk
> +        * level clock gating takes place, OA clock would be gated,
> +        * unable to count the events from non-render clock domain.
> +        * Render clock gating must be disabled when OA is enabled to
> +        * count the events from non-render domain. Unit level clock
> +        * gating for RCS should also be disabled.
> +        */
> +       I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
> +                                   ~GEN7_DOP_CLOCK_GATE_ENABLE));
> +       I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
> +                                 GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> +
> +       config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
> +                      dev_priv->perf.oa.mux_regs_len);
> +
> +       /* It apparently takes a fairly long time for a new MUX
> +        * configuration to be applied after these register writes.
> +        * This delay duration was derived empirically based on the
> +        * render_basic config but hopefully it covers the maximum
> +        * configuration latency.
> +        *
> +        * As a fallback, the checks in _append_oa_reports() to skip
> +        * invalid OA reports do also seem to work to discard reports
> +        * generated before this config has completed - albeit not
> +        * silently.
> +        *
> +        * Unfortunately this is essentially a magic number, since we
> +        * don't currently know of a reliable mechanism for predicting
> +        * how long the MUX config will take to apply and besides
> +        * seeing invalid reports we don't know of a reliable way to
> +        * explicitly check that the MUX config has landed.
> +        *
> +        * It's even possible we've mischaracterized the underlying
> +        * problem - it just seems like the simplest explanation why
> +        * a delay at this location would mitigate any invalid reports.
> +        */
> +       usleep_range(15000, 20000);
> +
> +       config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
> +                      dev_priv->perf.oa.b_counter_regs_len);
> +
> +       return 0;
> +}
> +
> +static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
> +{
> +       I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
> +                                 ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> +       I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
> +                                   GEN7_DOP_CLOCK_GATE_ENABLE));
> +
> +       I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
> +                                     ~GT_NOA_ENABLE));
> +}
> +
> +static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
> +{
> +       assert_spin_locked(&dev_priv->perf.hook_lock);
> +
> +       if (dev_priv->perf.oa.exclusive_stream->enabled) {
> +               struct i915_gem_context *ctx =
> +                       dev_priv->perf.oa.exclusive_stream->ctx;
> +               u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
> +
> +               bool periodic = dev_priv->perf.oa.periodic;
> +               u32 period_exponent = dev_priv->perf.oa.period_exponent;
> +               u32 report_format = dev_priv->perf.oa.oa_buffer.format;
> +
> +               I915_WRITE(GEN7_OACONTROL,
> +                          (ctx_id & GEN7_OACONTROL_CTX_MASK) |
> +                          (period_exponent <<
> +                           GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> +                          (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
> +                          (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
> +                          (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> +                          GEN7_OACONTROL_ENABLE);
> +       } else
> +               I915_WRITE(GEN7_OACONTROL, 0);
> +}
> +
> +static void gen7_oa_enable(struct drm_i915_private *dev_priv)
> +{
> +       unsigned long flags;
> +
> +       /* Reset buf pointers so we don't forward reports from before now.
> +        *
> +        * Think carefully if considering trying to avoid this, since it
> +        * also ensures status flags and the buffer itself are cleared
> +        * in error paths, and we have checks for invalid reports based
> +        * on the assumption that certain fields are written to zeroed
> +        * memory which this helps maintain.
> +        */
> +       gen7_init_oa_buffer(dev_priv);
> +
> +       spin_lock_irqsave(&dev_priv->perf.hook_lock, flags);
> +       gen7_update_oacontrol_locked(dev_priv);
> +       spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
> +}
> +
> +static void i915_oa_stream_enable(struct i915_perf_stream *stream)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       dev_priv->perf.oa.ops.oa_enable(dev_priv);
> +
> +       if (dev_priv->perf.oa.periodic)
> +               hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
> +                             ns_to_ktime(POLL_PERIOD),
> +                             HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static void gen7_oa_disable(struct drm_i915_private *dev_priv)
> +{
> +       I915_WRITE(GEN7_OACONTROL, 0);
> +}
> +
> +static void i915_oa_stream_disable(struct i915_perf_stream *stream)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> +       dev_priv->perf.oa.ops.oa_disable(dev_priv);
> +
> +       if (dev_priv->perf.oa.periodic)
> +               hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
> +}
> +
> +static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
> +{
> +       return 1000000000ULL * (2ULL << exponent) /
> +               dev_priv->perf.oa.timestamp_frequency;
> +}
> +
> +static const struct i915_perf_stream_ops i915_oa_stream_ops = {
> +       .destroy = i915_oa_stream_destroy,
> +       .enable = i915_oa_stream_enable,
> +       .disable = i915_oa_stream_disable,
> +       .wait_unlocked = i915_oa_wait_unlocked,
> +       .poll_wait = i915_oa_poll_wait,
> +       .read = i915_oa_read,
> +};
> +
> +static int i915_oa_stream_init(struct i915_perf_stream *stream,
> +                              struct drm_i915_perf_open_param *param,
> +                              struct perf_open_properties *props)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       int format_size;
> +       int ret;
> +
> +       if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
> +               DRM_ERROR("Only OA report sampling supported\n");
> +               return -EINVAL;
> +       }
> +
> +       if (!dev_priv->perf.oa.ops.init_oa_buffer) {
> +               DRM_ERROR("OA unit not supported\n");
> +               return -ENODEV;
> +       }
> +
> +       /* To avoid the complexity of having to accurately filter
> +        * counter reports and marshal to the appropriate client
> +        * we currently only allow exclusive access
> +        */
> +       if (dev_priv->perf.oa.exclusive_stream) {
> +               DRM_ERROR("OA unit already in use\n");
> +               return -EBUSY;
> +       }
> +
> +       if (!props->metrics_set) {
> +               DRM_ERROR("OA metric set not specified\n");
> +               return -EINVAL;
> +       }
> +
> +       if (!props->oa_format) {
> +               DRM_ERROR("OA report format not specified\n");
> +               return -EINVAL;
> +       }
> +
> +       stream->sample_size = sizeof(struct drm_i915_perf_record_header);
> +
> +       format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
> +
> +       stream->sample_flags |= SAMPLE_OA_REPORT;
> +       stream->sample_size += format_size;
> +
> +       dev_priv->perf.oa.oa_buffer.format_size = format_size;
> +       BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
> +
> +       dev_priv->perf.oa.oa_buffer.format =
> +               dev_priv->perf.oa.oa_formats[props->oa_format].format;
> +
> +       dev_priv->perf.oa.metrics_set = props->metrics_set;
> +
> +       dev_priv->perf.oa.periodic = props->oa_periodic;
> +       if (dev_priv->perf.oa.periodic) {
> +               u64 period_ns = oa_exponent_to_ns(dev_priv,
> +                                                 props->oa_period_exponent);
> +
> +               dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
> +
> +               /* See comment for OA_TAIL_MARGIN_NSEC for details
> +                * about this tail_margin...
> +                */
> +               dev_priv->perf.oa.tail_margin =
> +                       ((OA_TAIL_MARGIN_NSEC / period_ns) + 1) * format_size;
> +       }
> +
> +       if (stream->ctx) {
> +               ret = claim_specific_ctx(stream);
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       ret = alloc_oa_buffer(dev_priv);
> +       if (ret)
> +               goto err_oa_buf_alloc;
> +
> +       /* PRM - observability performance counters:
> +        *
> +        *   OACONTROL, performance counter enable, note:
> +        *
> +        *   "When this bit is set, in order to have coherent counts,
> +        *   RC6 power state and trunk clock gating must be disabled.
> +        *   This can be achieved by programming MMIO registers as
> +        *   0xA094=0 and 0xA090[31]=1"
> +        *
> +        *   In our case we are expecting that taking pm + FORCEWAKE
> +        *   references will effectively disable RC6.
> +        */
> +       intel_runtime_pm_get(dev_priv);
> +       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> +
> +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
> +       if (ret)
> +               goto err_enable;
> +
> +       stream->ops = &i915_oa_stream_ops;
> +
> +       dev_priv->perf.oa.exclusive_stream = stream;
> +
> +       return 0;
> +
> +err_enable:
> +       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +       intel_runtime_pm_put(dev_priv);
> +       free_oa_buffer(dev_priv);
> +
> +err_oa_buf_alloc:
> +       if (stream->ctx)
> +               release_specific_ctx(stream);
> +
> +       return ret;
> +}
> +
>  static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
>                                      struct file *file,
>                                      char __user *buf,
> @@ -78,8 +929,20 @@ static ssize_t i915_perf_read(struct file *file,
>         struct drm_i915_private *dev_priv = stream->dev_priv;
>         ssize_t ret;
>
> +       /* To ensure it's handled consistently we simply treat all reads of a
> +        * disabled stream as an error. In particular it might otherwise lead
> +        * to a deadlock for blocking file descriptors...
> +        */
> +       if (!stream->enabled)
> +               return -EIO;
> +
>         if (!(file->f_flags & O_NONBLOCK)) {
> -               /* Allow false positives from stream->ops->wait_unlocked.
> +               /* There's the small chance of false positives from
> +                * stream->ops->wait_unlocked.
> +                *
> +                * E.g. with single context filtering since we only wait until
> +                * oabuffer has >= 1 report we don't immediately know whether
> +                * any reports really belong to the current context
>                  */
>                 do {
>                         ret = stream->ops->wait_unlocked(stream);
> @@ -97,21 +960,50 @@ static ssize_t i915_perf_read(struct file *file,
>                 mutex_unlock(&dev_priv->perf.lock);
>         }
>
> +       if (ret >= 0) {
> +               /* Maybe make ->pollin per-stream state if we support multiple
> +                * concurrent streams in the future. */
> +               atomic_set(&dev_priv->perf.oa.pollin, false);
> +       }
> +
>         return ret;
>  }
>
> -static unsigned int i915_perf_poll_locked(struct i915_perf_stream *stream,
> +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
> +{
> +       struct drm_i915_private *dev_priv =
> +               container_of(hrtimer, typeof(*dev_priv),
> +                            perf.oa.poll_check_timer);
> +
> +       if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv)) {
> +               atomic_set(&dev_priv->perf.oa.pollin, true);
> +               wake_up(&dev_priv->perf.oa.poll_wq);
> +       }
> +
> +       hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
> +
> +       return HRTIMER_RESTART;
> +}
> +
> +static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
> +                                         struct i915_perf_stream *stream,
>                                           struct file *file,
>                                           poll_table *wait)
>  {
> -       unsigned int streams = 0;
> +       unsigned int events = 0;
>
>         stream->ops->poll_wait(stream, file, wait);
>
> -       if (stream->ops->can_read(stream))
> -               streams |= POLLIN;
> +       /* Note: we don't explicitly check whether there's something to read
> +        * here since this path may be very hot depending on what else
> +        * userspace is polling, or on the timeout in use. We rely solely on
> +        * the hrtimer/oa_poll_check_timer_cb to notify us when there are
> +        * samples to read.
> +        */
> +       if (atomic_read(&dev_priv->perf.oa.pollin))
> +               events |= POLLIN;
>
> -       return streams;
> +       return events;
>  }
>
>  static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
> @@ -121,7 +1013,7 @@ static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
>         int ret;
>
>         mutex_lock(&dev_priv->perf.lock);
> -       ret = i915_perf_poll_locked(stream, file, wait);
> +       ret = i915_perf_poll_locked(dev_priv, stream, file, wait);
>         mutex_unlock(&dev_priv->perf.lock);
>
>         return ret;
> @@ -285,18 +1177,18 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
>                 goto err_ctx;
>         }
>
> -       stream->sample_flags = props->sample_flags;
>         stream->dev_priv = dev_priv;
>         stream->ctx = specific_ctx;
>
> -       /*
> -        * TODO: support sampling something
> -        *
> -        * For now this is as far as we can go.
> +       ret = i915_oa_stream_init(stream, param, props);
> +       if (ret)
> +               goto err_alloc;
> +
> +       /* we avoid simply assigning stream->sample_flags = props->sample_flags
> +        * to have _stream_init check the combination of sample flags more
> +        * thoroughly, but still this is the expected result at this point.
>          */
> -       DRM_ERROR("Unsupported i915 perf stream configuration\n");
> -       ret = -EINVAL;
> -       goto err_alloc;
> +       BUG_ON(stream->sample_flags != props->sample_flags);
>
>         list_add(&stream->link, &dev_priv->perf.streams);
>
> @@ -376,6 +1268,56 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
>                         props->single_context = 1;
>                         props->ctx_handle = value;
>                         break;
> +               case DRM_I915_PERF_PROP_SAMPLE_OA:
> +                       props->sample_flags |= SAMPLE_OA_REPORT;
> +                       break;
> +               case DRM_I915_PERF_PROP_OA_METRICS_SET:
> +                       if (value == 0 ||
> +                           value > dev_priv->perf.oa.n_builtin_sets) {
> +                               DRM_ERROR("Unknown OA metric set ID");
> +                               return -EINVAL;
> +                       }
> +                       props->metrics_set = value;
> +                       break;
> +               case DRM_I915_PERF_PROP_OA_FORMAT:
> +                       if (value == 0 || value >= I915_OA_FORMAT_MAX) {
> +                               DRM_ERROR("Invalid OA report format\n");
> +                               return -EINVAL;
> +                       }
> +                       if (!dev_priv->perf.oa.oa_formats[value].size) {
> +                               DRM_ERROR("Invalid OA report format\n");
> +                               return -EINVAL;
> +                       }
> +                       props->oa_format = value;
> +                       break;
> +               case DRM_I915_PERF_PROP_OA_EXPONENT:
> +                       if (value > OA_EXPONENT_MAX) {
> +                               DRM_ERROR("OA timer exponent too high (> %u)\n",
> +                                         OA_EXPONENT_MAX);
> +                               return -EINVAL;
> +                       }
> +
> +                       /* NB: The exponent represents a period as follows:
> +                        *
> +                        *   80ns * 2^(period_exponent + 1)
> +                        *
> +                        * Theoretically we can program the OA unit to sample
> +                        * every 160ns but don't allow that by default unless
> +                        * root.
> +                        *
> +                        * Referring to perf's
> +                        * kernel.perf_event_max_sample_rate for a precedent
> +                        * (100000 by default); with an OA exponent of 6 we get
> +                        * a period of 10.24 microseconds - just under 100000Hz
> +                        */
> +                       if (value < 6 && !capable(CAP_SYS_ADMIN)) {
> +                               DRM_ERROR("Sampling period too high without root privileges\n");
Print the minimum sampling period here? Could be useful especially
when in a later patch we make this configurable.

> +                               return -EACCES;
> +                       }
> +
> +                       props->oa_periodic = true;
> +                       props->oa_period_exponent = value;
> +                       break;
>                 default:
>                         MISSING_CASE(id);
>                         DRM_ERROR("Unknown i915 perf property ID");
> @@ -426,8 +1368,33 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>
>  void i915_perf_init(struct drm_i915_private *dev_priv)
>  {
> +       if (!IS_HASWELL(dev_priv))
> +               return;
> +
> +       hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
> +                    CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
> +       init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
> +
>         INIT_LIST_HEAD(&dev_priv->perf.streams);
>         mutex_init(&dev_priv->perf.lock);
> +       spin_lock_init(&dev_priv->perf.hook_lock);
> +
> +       dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
> +       dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
> +       dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
> +       dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
> +       dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
> +       dev_priv->perf.oa.ops.read = gen7_oa_read;
> +       dev_priv->perf.oa.ops.oa_buffer_is_empty =
> +               gen7_oa_buffer_is_empty_fop_unlocked;
> +
> +       dev_priv->perf.oa.timestamp_frequency = 12500000;
> +
> +       dev_priv->perf.oa.oa_formats = hsw_oa_formats;
> +
> +       dev_priv->perf.oa.n_builtin_sets =
> +               i915_oa_n_builtin_metric_sets_hsw;
>
>         dev_priv->perf.initialized = true;
>  }
> @@ -437,7 +1404,6 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
>         if (!dev_priv->perf.initialized)
>                 return;
>
> -       /* Currently nothing to clean up */
> -
> +       memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
>         dev_priv->perf.initialized = false;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 070d3297..2557b3f 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -616,6 +616,343 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>  #define HSW_CS_GPR_UDW(n)               _MMIO(0x2600 + (n) * 8 + 4)
>
>  #define GEN7_OACONTROL _MMIO(0x2360)
> +#define  GEN7_OACONTROL_CTX_MASK           0xFFFFF000
> +#define  GEN7_OACONTROL_TIMER_PERIOD_MASK   0x3F
> +#define  GEN7_OACONTROL_TIMER_PERIOD_SHIFT  6
> +#define  GEN7_OACONTROL_TIMER_ENABLE       (1<<5)
> +#define  GEN7_OACONTROL_FORMAT_A13         (0<<2)
> +#define  GEN7_OACONTROL_FORMAT_A29         (1<<2)
> +#define  GEN7_OACONTROL_FORMAT_A13_B8_C8    (2<<2)
> +#define  GEN7_OACONTROL_FORMAT_A29_B8_C8    (3<<2)
> +#define  GEN7_OACONTROL_FORMAT_B4_C8       (4<<2)
> +#define  GEN7_OACONTROL_FORMAT_A45_B8_C8    (5<<2)
> +#define  GEN7_OACONTROL_FORMAT_B4_C8_A16    (6<<2)
> +#define  GEN7_OACONTROL_FORMAT_C4_B8       (7<<2)
> +#define  GEN7_OACONTROL_FORMAT_SHIFT       2
> +#define  GEN7_OACONTROL_PER_CTX_ENABLE     (1<<1)
> +#define  GEN7_OACONTROL_ENABLE             (1<<0)
> +
> +#define GEN8_OACTXID _MMIO(0x2364)
> +
> +#define GEN8_OACONTROL _MMIO(0x2B00)
> +#define  GEN8_OA_REPORT_FORMAT_A12         (0<<2)
> +#define  GEN8_OA_REPORT_FORMAT_A12_B8_C8    (2<<2)
> +#define  GEN8_OA_REPORT_FORMAT_A36_B8_C8    (5<<2)
> +#define  GEN8_OA_REPORT_FORMAT_C4_B8       (7<<2)
> +#define  GEN8_OA_REPORT_FORMAT_SHIFT       2
> +#define  GEN8_OA_SPECIFIC_CONTEXT_ENABLE    (1<<1)
> +#define  GEN8_OA_COUNTER_ENABLE             (1<<0)
> +
> +#define GEN8_OACTXCONTROL _MMIO(0x2360)
> +#define  GEN8_OA_TIMER_PERIOD_MASK         0x3F
> +#define  GEN8_OA_TIMER_PERIOD_SHIFT        2
> +#define  GEN8_OA_TIMER_ENABLE              (1<<1)
> +#define  GEN8_OA_COUNTER_RESUME                    (1<<0)
> +
> +#define GEN7_OABUFFER _MMIO(0x23B0) /* R/W */
> +#define  GEN7_OABUFFER_OVERRUN_DISABLE     (1<<3)
> +#define  GEN7_OABUFFER_EDGE_TRIGGER        (1<<2)
> +#define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
> +#define  GEN7_OABUFFER_RESUME              (1<<0)
> +
> +#define GEN8_OABUFFER _MMIO(0x2b14)
> +
> +#define GEN7_OASTATUS1 _MMIO(0x2364)
> +#define  GEN7_OASTATUS1_TAIL_MASK          0xffffffc0
> +#define  GEN7_OASTATUS1_COUNTER_OVERFLOW    (1<<2)
> +#define  GEN7_OASTATUS1_OABUFFER_OVERFLOW   (1<<1)
> +#define  GEN7_OASTATUS1_REPORT_LOST        (1<<0)
> +
> +#define GEN7_OASTATUS2 _MMIO(0x2368)
> +#define GEN7_OASTATUS2_HEAD_MASK    0xffffffc0
> +
> +#define GEN8_OASTATUS _MMIO(0x2b08)
> +#define  GEN8_OASTATUS_OVERRUN_STATUS      (1<<3)
> +#define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1<<2)
> +#define  GEN8_OASTATUS_OABUFFER_OVERFLOW    (1<<1)
> +#define  GEN8_OASTATUS_REPORT_LOST         (1<<0)
> +
> +#define GEN8_OAHEADPTR _MMIO(0x2B0C)
> +#define GEN8_OATAILPTR _MMIO(0x2B10)
> +
> +#define OABUFFER_SIZE_128K  (0<<3)
> +#define OABUFFER_SIZE_256K  (1<<3)
> +#define OABUFFER_SIZE_512K  (2<<3)
> +#define OABUFFER_SIZE_1M    (3<<3)
> +#define OABUFFER_SIZE_2M    (4<<3)
> +#define OABUFFER_SIZE_4M    (5<<3)
> +#define OABUFFER_SIZE_8M    (6<<3)
> +#define OABUFFER_SIZE_16M   (7<<3)
> +
> +#define OA_MEM_SELECT_GGTT  (1<<0)
> +
> +#define EU_PERF_CNTL0      _MMIO(0xe458)
> +
> +#define GDT_CHICKEN_BITS    _MMIO(0x9840)
> +#define GT_NOA_ENABLE      0x00000080
> +
> +/*
> + * OA Boolean state
> + */
> +
> +#define OAREPORTTRIG1 _MMIO(0x2740)
> +#define OAREPORTTRIG1_THRESHOLD_MASK 0xffff
> +#define OAREPORTTRIG1_EDGE_LEVEL_TRIGER_SELECT_MASK 0xffff0000 /* 0=level */
> +
> +#define OAREPORTTRIG2 _MMIO(0x2744)
> +#define OAREPORTTRIG2_INVERT_A_0  (1<<0)
> +#define OAREPORTTRIG2_INVERT_A_1  (1<<1)
> +#define OAREPORTTRIG2_INVERT_A_2  (1<<2)
> +#define OAREPORTTRIG2_INVERT_A_3  (1<<3)
> +#define OAREPORTTRIG2_INVERT_A_4  (1<<4)
> +#define OAREPORTTRIG2_INVERT_A_5  (1<<5)
> +#define OAREPORTTRIG2_INVERT_A_6  (1<<6)
> +#define OAREPORTTRIG2_INVERT_A_7  (1<<7)
> +#define OAREPORTTRIG2_INVERT_A_8  (1<<8)
> +#define OAREPORTTRIG2_INVERT_A_9  (1<<9)
> +#define OAREPORTTRIG2_INVERT_A_10 (1<<10)
> +#define OAREPORTTRIG2_INVERT_A_11 (1<<11)
> +#define OAREPORTTRIG2_INVERT_A_12 (1<<12)
> +#define OAREPORTTRIG2_INVERT_A_13 (1<<13)
> +#define OAREPORTTRIG2_INVERT_A_14 (1<<14)
> +#define OAREPORTTRIG2_INVERT_A_15 (1<<15)
> +#define OAREPORTTRIG2_INVERT_B_0  (1<<16)
> +#define OAREPORTTRIG2_INVERT_B_1  (1<<17)
> +#define OAREPORTTRIG2_INVERT_B_2  (1<<18)
> +#define OAREPORTTRIG2_INVERT_B_3  (1<<19)
> +#define OAREPORTTRIG2_INVERT_C_0  (1<<20)
> +#define OAREPORTTRIG2_INVERT_C_1  (1<<21)
> +#define OAREPORTTRIG2_INVERT_D_0  (1<<22)
> +#define OAREPORTTRIG2_THRESHOLD_ENABLE     (1<<23)
> +#define OAREPORTTRIG2_REPORT_TRIGGER_ENABLE (1<<31)
> +
> +#define OAREPORTTRIG3 _MMIO(0x2748)
> +#define OAREPORTTRIG3_NOA_SELECT_MASK      0xf
> +#define OAREPORTTRIG3_NOA_SELECT_8_SHIFT    0
> +#define OAREPORTTRIG3_NOA_SELECT_9_SHIFT    4
> +#define OAREPORTTRIG3_NOA_SELECT_10_SHIFT   8
> +#define OAREPORTTRIG3_NOA_SELECT_11_SHIFT   12
> +#define OAREPORTTRIG3_NOA_SELECT_12_SHIFT   16
> +#define OAREPORTTRIG3_NOA_SELECT_13_SHIFT   20
> +#define OAREPORTTRIG3_NOA_SELECT_14_SHIFT   24
> +#define OAREPORTTRIG3_NOA_SELECT_15_SHIFT   28
> +
> +#define OAREPORTTRIG4 _MMIO(0x274c)
> +#define OAREPORTTRIG4_NOA_SELECT_MASK      0xf
> +#define OAREPORTTRIG4_NOA_SELECT_0_SHIFT    0
> +#define OAREPORTTRIG4_NOA_SELECT_1_SHIFT    4
> +#define OAREPORTTRIG4_NOA_SELECT_2_SHIFT    8
> +#define OAREPORTTRIG4_NOA_SELECT_3_SHIFT    12
> +#define OAREPORTTRIG4_NOA_SELECT_4_SHIFT    16
> +#define OAREPORTTRIG4_NOA_SELECT_5_SHIFT    20
> +#define OAREPORTTRIG4_NOA_SELECT_6_SHIFT    24
> +#define OAREPORTTRIG4_NOA_SELECT_7_SHIFT    28
> +
> +#define OAREPORTTRIG5 _MMIO(0x2750)
> +#define OAREPORTTRIG5_THRESHOLD_MASK 0xffff
> +#define OAREPORTTRIG5_EDGE_LEVEL_TRIGER_SELECT_MASK 0xffff0000 /* 0=level */
> +
> +#define OAREPORTTRIG6 _MMIO(0x2754)
> +#define OAREPORTTRIG6_INVERT_A_0  (1<<0)
> +#define OAREPORTTRIG6_INVERT_A_1  (1<<1)
> +#define OAREPORTTRIG6_INVERT_A_2  (1<<2)
> +#define OAREPORTTRIG6_INVERT_A_3  (1<<3)
> +#define OAREPORTTRIG6_INVERT_A_4  (1<<4)
> +#define OAREPORTTRIG6_INVERT_A_5  (1<<5)
> +#define OAREPORTTRIG6_INVERT_A_6  (1<<6)
> +#define OAREPORTTRIG6_INVERT_A_7  (1<<7)
> +#define OAREPORTTRIG6_INVERT_A_8  (1<<8)
> +#define OAREPORTTRIG6_INVERT_A_9  (1<<9)
> +#define OAREPORTTRIG6_INVERT_A_10 (1<<10)
> +#define OAREPORTTRIG6_INVERT_A_11 (1<<11)
> +#define OAREPORTTRIG6_INVERT_A_12 (1<<12)
> +#define OAREPORTTRIG6_INVERT_A_13 (1<<13)
> +#define OAREPORTTRIG6_INVERT_A_14 (1<<14)
> +#define OAREPORTTRIG6_INVERT_A_15 (1<<15)
> +#define OAREPORTTRIG6_INVERT_B_0  (1<<16)
> +#define OAREPORTTRIG6_INVERT_B_1  (1<<17)
> +#define OAREPORTTRIG6_INVERT_B_2  (1<<18)
> +#define OAREPORTTRIG6_INVERT_B_3  (1<<19)
> +#define OAREPORTTRIG6_INVERT_C_0  (1<<20)
> +#define OAREPORTTRIG6_INVERT_C_1  (1<<21)
> +#define OAREPORTTRIG6_INVERT_D_0  (1<<22)
> +#define OAREPORTTRIG6_THRESHOLD_ENABLE     (1<<23)
> +#define OAREPORTTRIG6_REPORT_TRIGGER_ENABLE (1<<31)
> +
> +#define OAREPORTTRIG7 _MMIO(0x2758)
> +#define OAREPORTTRIG7_NOA_SELECT_MASK      0xf
> +#define OAREPORTTRIG7_NOA_SELECT_8_SHIFT    0
> +#define OAREPORTTRIG7_NOA_SELECT_9_SHIFT    4
> +#define OAREPORTTRIG7_NOA_SELECT_10_SHIFT   8
> +#define OAREPORTTRIG7_NOA_SELECT_11_SHIFT   12
> +#define OAREPORTTRIG7_NOA_SELECT_12_SHIFT   16
> +#define OAREPORTTRIG7_NOA_SELECT_13_SHIFT   20
> +#define OAREPORTTRIG7_NOA_SELECT_14_SHIFT   24
> +#define OAREPORTTRIG7_NOA_SELECT_15_SHIFT   28
> +
> +#define OAREPORTTRIG8 _MMIO(0x275c)
> +#define OAREPORTTRIG8_NOA_SELECT_MASK      0xf
> +#define OAREPORTTRIG8_NOA_SELECT_0_SHIFT    0
> +#define OAREPORTTRIG8_NOA_SELECT_1_SHIFT    4
> +#define OAREPORTTRIG8_NOA_SELECT_2_SHIFT    8
> +#define OAREPORTTRIG8_NOA_SELECT_3_SHIFT    12
> +#define OAREPORTTRIG8_NOA_SELECT_4_SHIFT    16
> +#define OAREPORTTRIG8_NOA_SELECT_5_SHIFT    20
> +#define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
> +#define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
> +
> +#define OASTARTTRIG1 _MMIO(0x2710)
> +#define OASTARTTRIG1_THRESHOLD_COUNT_MASK_MBZ 0xffff0000
> +#define OASTARTTRIG1_THRESHOLD_MASK          0xffff
> +
> +#define OASTARTTRIG2 _MMIO(0x2714)
> +#define OASTARTTRIG2_INVERT_A_0 (1<<0)
> +#define OASTARTTRIG2_INVERT_A_1 (1<<1)
> +#define OASTARTTRIG2_INVERT_A_2 (1<<2)
> +#define OASTARTTRIG2_INVERT_A_3 (1<<3)
> +#define OASTARTTRIG2_INVERT_A_4 (1<<4)
> +#define OASTARTTRIG2_INVERT_A_5 (1<<5)
> +#define OASTARTTRIG2_INVERT_A_6 (1<<6)
> +#define OASTARTTRIG2_INVERT_A_7 (1<<7)
> +#define OASTARTTRIG2_INVERT_A_8 (1<<8)
> +#define OASTARTTRIG2_INVERT_A_9 (1<<9)
> +#define OASTARTTRIG2_INVERT_A_10 (1<<10)
> +#define OASTARTTRIG2_INVERT_A_11 (1<<11)
> +#define OASTARTTRIG2_INVERT_A_12 (1<<12)
> +#define OASTARTTRIG2_INVERT_A_13 (1<<13)
> +#define OASTARTTRIG2_INVERT_A_14 (1<<14)
> +#define OASTARTTRIG2_INVERT_A_15 (1<<15)
> +#define OASTARTTRIG2_INVERT_B_0 (1<<16)
> +#define OASTARTTRIG2_INVERT_B_1 (1<<17)
> +#define OASTARTTRIG2_INVERT_B_2 (1<<18)
> +#define OASTARTTRIG2_INVERT_B_3 (1<<19)
> +#define OASTARTTRIG2_INVERT_C_0 (1<<20)
> +#define OASTARTTRIG2_INVERT_C_1 (1<<21)
> +#define OASTARTTRIG2_INVERT_D_0 (1<<22)
> +#define OASTARTTRIG2_THRESHOLD_ENABLE      (1<<23)
> +#define OASTARTTRIG2_START_TRIG_FLAG_MBZ    (1<<24)
> +#define OASTARTTRIG2_EVENT_SELECT_0  (1<<28)
> +#define OASTARTTRIG2_EVENT_SELECT_1  (1<<29)
> +#define OASTARTTRIG2_EVENT_SELECT_2  (1<<30)
> +#define OASTARTTRIG2_EVENT_SELECT_3  (1<<31)
> +
> +#define OASTARTTRIG3 _MMIO(0x2718)
> +#define OASTARTTRIG3_NOA_SELECT_MASK      0xf
> +#define OASTARTTRIG3_NOA_SELECT_8_SHIFT    0
> +#define OASTARTTRIG3_NOA_SELECT_9_SHIFT    4
> +#define OASTARTTRIG3_NOA_SELECT_10_SHIFT   8
> +#define OASTARTTRIG3_NOA_SELECT_11_SHIFT   12
> +#define OASTARTTRIG3_NOA_SELECT_12_SHIFT   16
> +#define OASTARTTRIG3_NOA_SELECT_13_SHIFT   20
> +#define OASTARTTRIG3_NOA_SELECT_14_SHIFT   24
> +#define OASTARTTRIG3_NOA_SELECT_15_SHIFT   28
> +
> +#define OASTARTTRIG4 _MMIO(0x271c)
> +#define OASTARTTRIG4_NOA_SELECT_MASK       0xf
> +#define OASTARTTRIG4_NOA_SELECT_0_SHIFT    0
> +#define OASTARTTRIG4_NOA_SELECT_1_SHIFT    4
> +#define OASTARTTRIG4_NOA_SELECT_2_SHIFT    8
> +#define OASTARTTRIG4_NOA_SELECT_3_SHIFT    12
> +#define OASTARTTRIG4_NOA_SELECT_4_SHIFT    16
> +#define OASTARTTRIG4_NOA_SELECT_5_SHIFT    20
> +#define OASTARTTRIG4_NOA_SELECT_6_SHIFT    24
> +#define OASTARTTRIG4_NOA_SELECT_7_SHIFT    28
> +
> +#define OASTARTTRIG5 _MMIO(0x2720)
> +#define OASTARTTRIG5_THRESHOLD_COUNT_MASK_MBZ 0xffff0000
> +#define OASTARTTRIG5_THRESHOLD_MASK          0xffff
> +
> +#define OASTARTTRIG6 _MMIO(0x2724)
> +#define OASTARTTRIG6_INVERT_A_0 (1<<0)
> +#define OASTARTTRIG6_INVERT_A_1 (1<<1)
> +#define OASTARTTRIG6_INVERT_A_2 (1<<2)
> +#define OASTARTTRIG6_INVERT_A_3 (1<<3)
> +#define OASTARTTRIG6_INVERT_A_4 (1<<4)
> +#define OASTARTTRIG6_INVERT_A_5 (1<<5)
> +#define OASTARTTRIG6_INVERT_A_6 (1<<6)
> +#define OASTARTTRIG6_INVERT_A_7 (1<<7)
> +#define OASTARTTRIG6_INVERT_A_8 (1<<8)
> +#define OASTARTTRIG6_INVERT_A_9 (1<<9)
> +#define OASTARTTRIG6_INVERT_A_10 (1<<10)
> +#define OASTARTTRIG6_INVERT_A_11 (1<<11)
> +#define OASTARTTRIG6_INVERT_A_12 (1<<12)
> +#define OASTARTTRIG6_INVERT_A_13 (1<<13)
> +#define OASTARTTRIG6_INVERT_A_14 (1<<14)
> +#define OASTARTTRIG6_INVERT_A_15 (1<<15)
> +#define OASTARTTRIG6_INVERT_B_0 (1<<16)
> +#define OASTARTTRIG6_INVERT_B_1 (1<<17)
> +#define OASTARTTRIG6_INVERT_B_2 (1<<18)
> +#define OASTARTTRIG6_INVERT_B_3 (1<<19)
> +#define OASTARTTRIG6_INVERT_C_0 (1<<20)
> +#define OASTARTTRIG6_INVERT_C_1 (1<<21)
> +#define OASTARTTRIG6_INVERT_D_0 (1<<22)
> +#define OASTARTTRIG6_THRESHOLD_ENABLE      (1<<23)
> +#define OASTARTTRIG6_START_TRIG_FLAG_MBZ    (1<<24)
> +#define OASTARTTRIG6_EVENT_SELECT_4  (1<<28)
> +#define OASTARTTRIG6_EVENT_SELECT_5  (1<<29)
> +#define OASTARTTRIG6_EVENT_SELECT_6  (1<<30)
> +#define OASTARTTRIG6_EVENT_SELECT_7  (1<<31)
> +
> +#define OASTARTTRIG7 _MMIO(0x2728)
> +#define OASTARTTRIG7_NOA_SELECT_MASK      0xf
> +#define OASTARTTRIG7_NOA_SELECT_8_SHIFT    0
> +#define OASTARTTRIG7_NOA_SELECT_9_SHIFT    4
> +#define OASTARTTRIG7_NOA_SELECT_10_SHIFT   8
> +#define OASTARTTRIG7_NOA_SELECT_11_SHIFT   12
> +#define OASTARTTRIG7_NOA_SELECT_12_SHIFT   16
> +#define OASTARTTRIG7_NOA_SELECT_13_SHIFT   20
> +#define OASTARTTRIG7_NOA_SELECT_14_SHIFT   24
> +#define OASTARTTRIG7_NOA_SELECT_15_SHIFT   28
> +
> +#define OASTARTTRIG8 _MMIO(0x272c)
> +#define OASTARTTRIG8_NOA_SELECT_MASK      0xf
> +#define OASTARTTRIG8_NOA_SELECT_0_SHIFT    0
> +#define OASTARTTRIG8_NOA_SELECT_1_SHIFT    4
> +#define OASTARTTRIG8_NOA_SELECT_2_SHIFT    8
> +#define OASTARTTRIG8_NOA_SELECT_3_SHIFT    12
> +#define OASTARTTRIG8_NOA_SELECT_4_SHIFT    16
> +#define OASTARTTRIG8_NOA_SELECT_5_SHIFT    20
> +#define OASTARTTRIG8_NOA_SELECT_6_SHIFT    24
> +#define OASTARTTRIG8_NOA_SELECT_7_SHIFT    28
> +
> +/* CECX_0 */
> +#define OACEC_COMPARE_LESS_OR_EQUAL    6
> +#define OACEC_COMPARE_NOT_EQUAL                5
> +#define OACEC_COMPARE_LESS_THAN                4
> +#define OACEC_COMPARE_GREATER_OR_EQUAL 3
> +#define OACEC_COMPARE_EQUAL            2
> +#define OACEC_COMPARE_GREATER_THAN     1
> +#define OACEC_COMPARE_ANY_EQUAL                0
> +
> +#define OACEC_COMPARE_VALUE_MASK    0xffff
> +#define OACEC_COMPARE_VALUE_SHIFT   3
> +
> +#define OACEC_SELECT_NOA       (0<<19)
> +#define OACEC_SELECT_PREV      (1<<19)
> +#define OACEC_SELECT_BOOLEAN   (2<<19)
> +
> +/* CECX_1 */
> +#define OACEC_MASK_MASK                    0xffff
> +#define OACEC_CONSIDERATIONS_MASK   0xffff
> +#define OACEC_CONSIDERATIONS_SHIFT  16
> +
> +#define OACEC0_0 _MMIO(0x2770)
> +#define OACEC0_1 _MMIO(0x2774)
> +#define OACEC1_0 _MMIO(0x2778)
> +#define OACEC1_1 _MMIO(0x277c)
> +#define OACEC2_0 _MMIO(0x2780)
> +#define OACEC2_1 _MMIO(0x2784)
> +#define OACEC3_0 _MMIO(0x2788)
> +#define OACEC3_1 _MMIO(0x278c)
> +#define OACEC4_0 _MMIO(0x2790)
> +#define OACEC4_1 _MMIO(0x2794)
> +#define OACEC5_0 _MMIO(0x2798)
> +#define OACEC5_1 _MMIO(0x279c)
> +#define OACEC6_0 _MMIO(0x27a0)
> +#define OACEC6_1 _MMIO(0x27a4)
> +#define OACEC7_0 _MMIO(0x27a8)
> +#define OACEC7_1 _MMIO(0x27ac)
> +
>
>  #define _GEN7_PIPEA_DE_LOAD_SL 0x70068
>  #define _GEN7_PIPEB_DE_LOAD_SL 0x71068
> @@ -6982,6 +7319,7 @@ enum {
>  # define GEN6_RCCUNIT_CLOCK_GATE_DISABLE               (1 << 11)
>
>  #define GEN6_UCGCTL3                           _MMIO(0x9408)
> +# define GEN6_OACSUNIT_CLOCK_GATE_DISABLE              (1 << 20)
>
>  #define GEN7_UCGCTL4                           _MMIO(0x940c)
>  #define  GEN7_L3BANK2X_CLOCK_GATE_DISABLE      (1<<25)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 98cd493..bf3b8e2 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1224,6 +1224,18 @@ struct drm_i915_gem_context_param {
>         __u64 value;
>  };
>
> +enum drm_i915_oa_format {
> +       I915_OA_FORMAT_A13 = 1,
> +       I915_OA_FORMAT_A29,
> +       I915_OA_FORMAT_A13_B8_C8,
> +       I915_OA_FORMAT_B4_C8,
> +       I915_OA_FORMAT_A45_B8_C8,
> +       I915_OA_FORMAT_B4_C8_A16,
> +       I915_OA_FORMAT_C4_B8,
> +
> +       I915_OA_FORMAT_MAX          /* non-ABI */
> +};
> +
>  enum drm_i915_perf_property_id {
>         /**
>          * Open the stream for a specific context handle (as used with
> @@ -1232,6 +1244,32 @@ enum drm_i915_perf_property_id {
>          */
>         DRM_I915_PERF_PROP_CTX_HANDLE = 1,
>
> +       /**
> +        * A value of 1 requests the inclusion of raw OA unit reports as
> +        * part of stream samples.
> +        */
> +       DRM_I915_PERF_PROP_SAMPLE_OA,
> +
> +       /**
> +        * The value specifies which set of OA unit metrics should be
> +        * be configured, defining the contents of any OA unit reports.
> +        */
> +       DRM_I915_PERF_PROP_OA_METRICS_SET,
> +
> +       /**
> +        * The value specifies the size and layout of OA unit reports.
> +        */
> +       DRM_I915_PERF_PROP_OA_FORMAT,
> +
> +       /**
> +        * Specifying this property implicitly requests periodic OA unit
> +        * sampling and (at least on Haswell) the sampling frequency is derived
> +        * from this exponent as follows:
> +        *
> +        *   80ns * 2^(period_exponent + 1)
> +        */
> +       DRM_I915_PERF_PROP_OA_EXPONENT,
> +
>         DRM_I915_PERF_PROP_MAX /* non-ABI */
>  };
>
> @@ -1251,7 +1289,22 @@ struct drm_i915_perf_open_param {
>         __u64 __user properties_ptr;
>  };
>
> +/**
> + * Enable data capture for a stream that was either opened in a disabled state
> + * via I915_PERF_FLAG_DIABLED or was later disabled via I915_PERF_IOCTL_DISABLE.
Typo: this should be I915_PERF_FLAG_DISABLED.

> + *
> + * It is intended to be cheaper to disable and enable a stream than it may be
> + * to close and re-open a stream with the same configuration.
> + *
> + * It's undefined whether any pending data for the stream will be lost.
> + */
>  #define I915_PERF_IOCTL_ENABLE _IO('i', 0x0)
> +
> +/**
> + * Disable data capture for a stream.
> + *
> + * It is an error to try and read a stream that is disabled.
> + */
>  #define I915_PERF_IOCTL_DISABLE        _IO('i', 0x1)
>
>  /**
> @@ -1275,17 +1328,30 @@ enum drm_i915_perf_record_type {
>          * every sample.
>          *
>          * The order of these sample properties given by userspace has no
> -        * affect on the ordering of data within a sample. The order will be
> +        * affect on the ordering of data within a sample. The order is
>          * documented here.
>          *
>          * struct {
>          *     struct drm_i915_perf_record_header header;
>          *
> -        *     TODO: itemize extensible sample data here
> +        *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
>          * };
>          */
>         DRM_I915_PERF_RECORD_SAMPLE = 1,
>
> +       /*
> +        * Indicates that one or more OA reports were not written by the
> +        * hardware. This can happen for example if an MI_REPORT_PERF_COUNT
> +        * command collides with periodic sampling - which would be more likely
> +        * at higher sampling frequencies.
> +        */
> +       DRM_I915_PERF_RECORD_OA_REPORT_LOST = 2,
> +
> +       /**
> +        * An error occurred that resulted in all pending OA reports being lost.
> +        */
> +       DRM_I915_PERF_RECORD_OA_BUFFER_LOST = 3,
> +
>         DRM_I915_PERF_RECORD_MAX /* non-ABI */
>  };
>
> --
> 2.10.1
>

Otherwise looks good.
Chris Wilson Oct. 25, 2016, 11:05 p.m. UTC | #2
On Tue, Oct 25, 2016 at 12:19:29AM +0100, Robert Bragg wrote:
> +static int claim_specific_ctx(struct i915_perf_stream *stream)
> +{
> +	struct drm_i915_private *dev_priv = stream->dev_priv;
> +	struct i915_vma *vma;
> +	int ret;
> +
> +	ret = i915_mutex_lock_interruptible(&dev_priv->drm);

Looking forward to the day these don't need struct_mutex.

> +	if (ret)
> +		return ret;
> +
> +	/* So that we don't have to worry about updating the context ID
> +	 * in OACONTOL on the fly we make sure to pin the context
> +	 * upfront for the lifetime of the stream...
> +	 */
> +	vma = stream->ctx->engine[RCS].state;

There's a caveat here that suggests I had better wrap up this into its
own function. (We need to flush dirty cachelines to memory on first
binding of the context.)

> +	ret = i915_vma_pin(vma, 0, stream->ctx->ggtt_alignment,
> +			   PIN_GLOBAL | PIN_HIGH);
> +	if (ret)
> +		return ret;

Oops. This error path returns with struct_mutex still held.

> +
> +	dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +
> +	return 0;
> +}


> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> +	struct drm_i915_gem_object *bo;
> +	struct i915_vma *vma;
> +	int ret;
> +
> +	BUG_ON(dev_priv->perf.oa.oa_buffer.vma);
> +
> +	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> +	if (ret)
> +		return ret;
> +
> +	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
> +	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
> +
> +	bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> +	if (IS_ERR(bo)) {
> +		DRM_ERROR("Failed to allocate OA buffer\n");
> +		ret = PTR_ERR(bo);
> +		goto unlock;
> +	}
> +
> +	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> +	if (ret)
> +		goto err_unref;
> +
> +	/* PreHSW required 512K alignment, HSW requires 16M */
> +	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);

Does this need mappable aperture space for OA? You aren't accessing it
via the aperture, but is the hw limited to it?

> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto err_unref;
> +	}
> +	dev_priv->perf.oa.oa_buffer.vma = vma;
> +
> +	dev_priv->perf.oa.oa_buffer.vaddr =
> +		i915_gem_object_pin_map(bo, I915_MAP_WB);
> +	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
> +		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
> +		goto err_unpin;
> +	}
> +
> +	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
> +
> +	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> +			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
> +			 dev_priv->perf.oa.oa_buffer.vaddr);
> +
> +	goto unlock;
> +
> +err_unpin:
> +	__i915_vma_unpin(vma);
> +
> +err_unref:
> +	i915_gem_object_put(bo);
> +
> +	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
> +	dev_priv->perf.oa.oa_buffer.vma = NULL;
> +
> +unlock:
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	return ret;
> +}


> +	if (ret >= 0) {
> +		/* Maybe make ->pollin per-stream state if we support multiple
> +		 * concurrent streams in the future. */
> +		atomic_set(&dev_priv->perf.oa.pollin, false);
> +	}
> +
>  	return ret;
>  }
>  
> -static unsigned int i915_perf_poll_locked(struct i915_perf_stream *stream,
> +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(hrtimer, typeof(*dev_priv),
> +			     perf.oa.poll_check_timer);
> +
> +	if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv)) {
> +		atomic_set(&dev_priv->perf.oa.pollin, true);
> +		wake_up(&dev_priv->perf.oa.poll_wq);
> +	}
> +
> +	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
> +
> +	return HRTIMER_RESTART;
> +}
> +
> +static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
> +					  struct i915_perf_stream *stream,
>  					  struct file *file,
>  					  poll_table *wait)
>  {
> -	unsigned int streams = 0;
> +	unsigned int events = 0;
>  
>  	stream->ops->poll_wait(stream, file, wait);
>  
> -	if (stream->ops->can_read(stream))
> -		streams |= POLLIN;
> +	/* Note: we don't explicitly check whether there's something to read
> +	 * here since this path may be very hot depending on what else
> +	 * userspace is polling, or on the timeout in use. We rely solely on
> +	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
> +	 * samples to read.
> +	 */
> +	if (atomic_read(&dev_priv->perf.oa.pollin))
> +		events |= POLLIN;

The atomic_set() and atomic_read() are superfluous, they don't even
impose any memory barriers. The required barrier here is from wake_up().

You can just use dev_priv->perf.oa.pollin = true; or use WRITE_ONCE() /
READ_ONCE() if you want to clearly show that it is outside of the lock
and that the barriers are imposed elsewhere.


> @@ -285,18 +1177,18 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
> +	/* we avoid simply assigning stream->sample_flags = props->sample_flags
> +	 * to have _stream_init check the combination of sample flags more
> +	 * thoroughly, but still this is the expected result at this point.
>  	 */
> -	DRM_ERROR("Unsupported i915 perf stream configuration\n");
> -	ret = -EINVAL;
> -	goto err_alloc;
> +	BUG_ON(stream->sample_flags != props->sample_flags);

	if (WARN_ON(...)) {
		ret = -ENODEV;
		goto err_alloc;
	}

just to avoid checkpatch complaining.
-Chris
Chris Wilson Oct. 25, 2016, 11:12 p.m. UTC | #3
On Wed, Oct 26, 2016 at 12:05:44AM +0100, Chris Wilson wrote:
> On Tue, Oct 25, 2016 at 12:19:29AM +0100, Robert Bragg wrote:
> > +	/* So that we don't have to worry about updating the context ID
> > +	 * in OACONTOL on the fly we make sure to pin the context
> > +	 * upfront for the lifetime of the stream...
> > +	 */
> > +	vma = stream->ctx->engine[RCS].state;
> 
> There's a caveat here that suggests I had better wrap up this into its
> own function. (We need to flush dirty cachelines to memory on first
> binding of the context.)

Not that that actually affects hsw.
-Chris
Robert Bragg Oct. 25, 2016, 11:51 p.m. UTC | #4
On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld <
matthew.william.auld@gmail.com> wrote:

> On 25 October 2016 at 00:19, Robert Bragg <robert@sixbynine.org> wrote:



>
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h
> > index 3448d05..ea24814 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
>
> >
> >  struct drm_i915_private {
> > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> >
> >         struct {
> >                 bool initialized;
> > +
> >                 struct mutex lock;
> >                 struct list_head streams;
> >
> > +               spinlock_t hook_lock;
> > +
> >                 struct {
> > -                       u32 metrics_set;
> > +                       struct i915_perf_stream *exclusive_stream;
> > +
> > +                       u32 specific_ctx_id;
> Can we just get rid of this, now that the vma remains pinned we can
> simply get the ggtt address at the time of configuring the OA_CONTROL
> register ?
>

I considered that, but would ideally prefer to keep it considering the
gen8+ patches to come. For gen8+ (with execlists) the context ID isn't a
gtt offset.


>
> > +
> > +                       struct hrtimer poll_check_timer;
> > +                       wait_queue_head_t poll_wq;
> > +                       atomic_t pollin;
> > +
>
>

> > +/* The maximum exponent the hardware accepts is 63 (essentially it
> selects one
> > + * of the 64bit timestamp bits to trigger reports from) but there's
> currently
> > + * no known use case for sampling as infrequently as once per 47
> thousand years.
> > + *
> > + * Since the timestamps included in OA reports are only 32bits it seems
> > + * reasonable to limit the OA exponent where it's still possible to
> account for
> > + * overflow in OA report timestamps.
> > + */
> > +#define OA_EXPONENT_MAX 31
> > +
> > +#define INVALID_CTX_ID 0xffffffff
> We shouldn't need this anymore.
>

yeah I removed it and then added it back, just for the sake of explicitly
setting the specific_ctx_id to an invalid ID when closing the exclusive
stream - though resetting the value isn't strictly necessary.

also maybe your comment is assuming specific_ctx_id can be removed, while
I'd prefer to keep it.


> > +
> > +static int claim_specific_ctx(struct i915_perf_stream *stream)
> > +{
> pin_oa_specific_ctx, or something? Also would it not make more sense
> to operate on the context, not the stream.
>

Yeah, I avoided a name like that mainly because it's also initializing
specific_ctx_id, which seemed to me like it would become an unexpected side
effect with that more specific name.

The other consideration is that in my gen8+ patches the pinning code is
conditional depending on whether execlists are enabled, while the function
still initializes specific_ctx_id.

Certainly not attached to the names though.

Chris has some feedback with the code, so maybe that will affect this too.


> > +       struct drm_i915_private *dev_priv = stream->dev_priv;
> > +       struct i915_vma *vma;
> > +       int ret;
> > +
> > +       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> > +       if (ret)
> > +               return ret;
> > +
> > +       /* So that we don't have to worry about updating the context ID
> > +        * in OACONTOL on the fly we make sure to pin the context
> > +        * upfront for the lifetime of the stream...
> > +        */
> > +       vma = stream->ctx->engine[RCS].state;
> > +       ret = i915_vma_pin(vma, 0, stream->ctx->ggtt_alignment,
> > +                          PIN_GLOBAL | PIN_HIGH);
> > +       if (ret)
> > +               return ret;
> > +
> > +       dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
> > +
> > +       mutex_unlock(&dev_priv->drm.struct_mutex);
> > +
> > +       return 0;
> > +}
>


I'll also follow up on the other notes; thanks!

- Robert
Chris Wilson Oct. 26, 2016, 8:54 a.m. UTC | #5
On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
>    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
>    <[1]matthew.william.auld@gmail.com> wrote:
> 
>      On 25 October 2016 at 00:19, Robert Bragg <[2]robert@sixbynine.org>
>      wrote:
> 
>     
> 
>      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
>      b/drivers/gpu/drm/i915/i915_drv.h
>      > index 3448d05..ea24814 100644
>      > --- a/drivers/gpu/drm/i915/i915_drv.h
>      > +++ b/drivers/gpu/drm/i915/i915_drv.h
>      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> 
>      >
>      >  struct drm_i915_private {
>      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
>      >
>      >         struct {
>      >                 bool initialized;
>      > +
>      >                 struct mutex lock;
>      >                 struct list_head streams;
>      >
>      > +               spinlock_t hook_lock;
>      > +
>      >                 struct {
>      > -                       u32 metrics_set;
>      > +                       struct i915_perf_stream *exclusive_stream;
>      > +
>      > +                       u32 specific_ctx_id;
>      Can we just get rid of this, now that the vma remains pinned we can
>      simply get the ggtt address at the time of configuring the OA_CONTROL
>      register ?
> 
>    I considered that, but would ideally prefer to keep it considering the
>    gen8+ patches to come. For gen8+ (with execlists) the context ID isn't a
>    gtt offset.

In terms of symmetry, keeping the vma you pinned and unpinning the same
later makes its ownership much clearer. (And I do want the owner of each
pin to be clear, for when we start enabling debug to catch the VMA
leaks.)
-Chris
Matthew Auld Oct. 26, 2016, 10:08 a.m. UTC | #6
On 26 October 2016 at 00:51, Robert Bragg <robert@sixbynine.org> wrote:
>
>
> On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> <matthew.william.auld@gmail.com> wrote:
>>
>> On 25 October 2016 at 00:19, Robert Bragg <robert@sixbynine.org> wrote:
>
>
>>
>>
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> > b/drivers/gpu/drm/i915/i915_drv.h
>> > index 3448d05..ea24814 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
>>
>> >
>> >  struct drm_i915_private {
>> > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
>> >
>> >         struct {
>> >                 bool initialized;
>> > +
>> >                 struct mutex lock;
>> >                 struct list_head streams;
>> >
>> > +               spinlock_t hook_lock;
>> > +
>> >                 struct {
>> > -                       u32 metrics_set;
>> > +                       struct i915_perf_stream *exclusive_stream;
>> > +
>> > +                       u32 specific_ctx_id;
>> Can we just get rid of this, now that the vma remains pinned we can
>> simply get the ggtt address at the time of configuring the OA_CONTROL
>> register ?
>
>
> I considered that, but would ideally prefer to keep it considering the gen8+
> patches to come. For gen8+ (with execlists) the context ID isn't a gtt
> offset.
>
>>
>>
>> > +
>> > +                       struct hrtimer poll_check_timer;
>> > +                       wait_queue_head_t poll_wq;
>> > +                       atomic_t pollin;
>> > +
>>
>
>>
>> > +/* The maximum exponent the hardware accepts is 63 (essentially it
>> > selects one
>> > + * of the 64bit timestamp bits to trigger reports from) but there's
>> > currently
>> > + * no known use case for sampling as infrequently as once per 47
>> > thousand years.
>> > + *
>> > + * Since the timestamps included in OA reports are only 32bits it seems
>> > + * reasonable to limit the OA exponent where it's still possible to
>> > account for
>> > + * overflow in OA report timestamps.
>> > + */
>> > +#define OA_EXPONENT_MAX 31
>> > +
>> > +#define INVALID_CTX_ID 0xffffffff
>> We shouldn't need this anymore.
>
>
> yeah I removed it and then added it back, just for the sake of explicitly
> setting the specific_ctx_id to an invalid ID when closing the exclusive
> stream - though resetting the value isn't strictly necessary.
Can we not make the specific_ctx_id per-stream, the gem context
already is, then we don't need to be concerned with resetting it ?
Robert Bragg Oct. 26, 2016, 3:03 p.m. UTC | #7
On 26 Oct 2016 11:08 a.m., "Matthew Auld" <matthew.william.auld@gmail.com>
wrote:
>
> On 26 October 2016 at 00:51, Robert Bragg <robert@sixbynine.org> wrote:
> >
> >
> > On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > <matthew.william.auld@gmail.com> wrote:
> >>
> >> On 25 October 2016 at 00:19, Robert Bragg <robert@sixbynine.org> wrote:
> >
> >
> >>
> >>
> >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> >> > b/drivers/gpu/drm/i915/i915_drv.h
> >> > index 3448d05..ea24814 100644
> >> > --- a/drivers/gpu/drm/i915/i915_drv.h
> >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> >>
> >> >
> >> >  struct drm_i915_private {
> >> > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> >> >
> >> >         struct {
> >> >                 bool initialized;
> >> > +
> >> >                 struct mutex lock;
> >> >                 struct list_head streams;
> >> >
> >> > +               spinlock_t hook_lock;
> >> > +
> >> >                 struct {
> >> > -                       u32 metrics_set;
> >> > +                       struct i915_perf_stream *exclusive_stream;
> >> > +
> >> > +                       u32 specific_ctx_id;
> >> Can we just get rid of this, now that the vma remains pinned we can
> >> simply get the ggtt address at the time of configuring the OA_CONTROL
> >> register ?
> >
> >
> > I considered that, but would ideally prefer to keep it considering the
gen8+
> > patches to come. For gen8+ (with execlists) the context ID isn't a gtt
> > offset.
> >
> >>
> >>
> >> > +
> >> > +                       struct hrtimer poll_check_timer;
> >> > +                       wait_queue_head_t poll_wq;
> >> > +                       atomic_t pollin;
> >> > +
> >>
> >
> >>
> >> > +/* The maximum exponent the hardware accepts is 63 (essentially it
> >> > selects one
> >> > + * of the 64bit timestamp bits to trigger reports from) but there's
> >> > currently
> >> > + * no known use case for sampling as infrequently as once per 47
> >> > thousand years.
> >> > + *
> >> > + * Since the timestamps included in OA reports are only 32bits it
seems
> >> > + * reasonable to limit the OA exponent where it's still possible to
> >> > account for
> >> > + * overflow in OA report timestamps.
> >> > + */
> >> > +#define OA_EXPONENT_MAX 31
> >> > +
> >> > +#define INVALID_CTX_ID 0xffffffff
> >> We shouldn't need this anymore.
> >
> >
> > yeah I removed it and then added it back, just for the sake of
explicitly
> > setting the specific_ctx_id to an invalid ID when closing the exclusive
> > stream - though resetting the value isn't strictly necessary.
> Can we not make the specific_ctx_id per-stream, the gem context
> already is, then we don't need to be concerned with resetting it ?

Hmm, I'm not sure about that, conceptually to me it's global OA unit state.

Currently the driver only supports a single exclusive stream, while Sourab
later relaxes that to a per-engine stream and that could be relaxed further
with non-oa metric stream types.

With multiple streams we'll still only be able to program a single ctx
id in OACONTROL.

Conceptually to me, other stream types could be associated with different
contexts (if they don't depend on the OA unit) so to me stream->ctx isn't
necessarily OA unit state.

It probably could be played around with, but right now we don't track OA
specific state in the stream. For the ID it's just semantics to say it's OA
state, and we could consider that it's maybe generally useful to track the
ID, even for future non-oa streams. That might mean potentially redundantly
pinning state for the sake of tracking the ID for streams that don't end up
needing it.
Robert Bragg Oct. 26, 2016, 3:17 p.m. UTC | #8
On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk> wrote:
>
> On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> >    <[1]matthew.william.auld@gmail.com> wrote:
> >
> >      On 25 October 2016 at 00:19, Robert Bragg <[2]robert@sixbynine.org>
> >      wrote:
> >
> >
> >
> >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> >      b/drivers/gpu/drm/i915/i915_drv.h
> >      > index 3448d05..ea24814 100644
> >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> >
> >      >
> >      >  struct drm_i915_private {
> >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> >      >
> >      >         struct {
> >      >                 bool initialized;
> >      > +
> >      >                 struct mutex lock;
> >      >                 struct list_head streams;
> >      >
> >      > +               spinlock_t hook_lock;
> >      > +
> >      >                 struct {
> >      > -                       u32 metrics_set;
> >      > +                       struct i915_perf_stream
*exclusive_stream;
> >      > +
> >      > +                       u32 specific_ctx_id;
> >      Can we just get rid of this, now that the vma remains pinned we can
> >      simply get the ggtt address at the time of configuring the
OA_CONTROL
> >      register ?
> >
> >    I considered that, but would ideally prefer to keep it considering
the
> >    gen8+ patches to come. For gen8+ (with execlists) the context ID
isn't a
> >    gtt offset.
>
> In terms of symmetry, keeping the vma you pinned and unpinning the same
> later makes its ownership much clearer. (And I do want the owner of each
> pin to be clear, for when we start enabling debug to catch the VMA
> leaks.)

Keeping our own pointer to the pinned vma could be a clarification.

Considering Matt's comments too, I'm thinking I'll put the pinning and
specific_ctx_id initialization together with setting stream->ctx, keeping
the state together under the stream. It's going to potentially mean
redundantly pinning the ctx for the sake of the ID in the future for
streams that don't really need it, but I think it's probably not worth
worrying about that.

- Robert

> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
Ville Syrjala Oct. 26, 2016, 3:37 p.m. UTC | #9
On Wed, Oct 26, 2016 at 04:17:45PM +0100, Robert Bragg wrote:
> On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk> wrote:
> >
> > On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> > >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > >    <[1]matthew.william.auld@gmail.com> wrote:
> > >
> > >      On 25 October 2016 at 00:19, Robert Bragg <[2]robert@sixbynine.org>
> > >      wrote:
> > >
> > >
> > >
> > >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > >      b/drivers/gpu/drm/i915/i915_drv.h
> > >      > index 3448d05..ea24814 100644
> > >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> > >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > >
> > >      >
> > >      >  struct drm_i915_private {
> > >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > >      >
> > >      >         struct {
> > >      >                 bool initialized;
> > >      > +
> > >      >                 struct mutex lock;
> > >      >                 struct list_head streams;
> > >      >
> > >      > +               spinlock_t hook_lock;
> > >      > +
> > >      >                 struct {
> > >      > -                       u32 metrics_set;
> > >      > +                       struct i915_perf_stream
> *exclusive_stream;

OT:
What kind of MUA are you using that mangles quoted mails like this? I've
not seen it on intel-gfx before. mesa-dev seems rife with it, but as I
rarely read that in any great detail I've managed to ignore it there.
Anyways, it makes it especially hard to navigate long mails since mutt's
'S' (skip quoted text) no longer works correctly.

> > >      > +
> > >      > +                       u32 specific_ctx_id;
> > >      Can we just get rid of this, now that the vma remains pinned we can
> > >      simply get the ggtt address at the time of configuring the
> OA_CONTROL
> > >      register ?
> > >
> > >    I considered that, but would ideally prefer to keep it considering
> the
> > >    gen8+ patches to come. For gen8+ (with execlists) the context ID
> isn't a
> > >    gtt offset.
> >
> > In terms of symmetry, keeping the vma you pinned and unpinning the same
> > later makes its ownership much clearer. (And I do want the owner of each
> > pin to be clear, for when we start enabling debug to catch the VMA
> > leaks.)
> 
> Keeping our own pointer to the pinned vma could be a clarification.
> 
> Considering Matt's comments too, I'm thinking I'll put the pinning and
> specific_ctx_id initialization together with setting stream->ctx, keeping
> the state together under the stream. It's going to potentially mean
> redundantly pinning the ctx for the sake of the ID in the future for
> streams that don't really need it, but I think it's probably not worth
> worrying about that.
> 
> - Robert
> 
> > -Chris
> >
> > --
> > Chris Wilson, Intel Open Source Technology Centre

> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Robert Bragg Oct. 26, 2016, 4:42 p.m. UTC | #10
On Wed, Oct 26, 2016 at 4:37 PM, Ville Syrjälä <
ville.syrjala@linux.intel.com> wrote:

> On Wed, Oct 26, 2016 at 04:17:45PM +0100, Robert Bragg wrote:
> > On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk>
> wrote:
> > >
> > > On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> > > >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > > >    <[1]matthew.william.auld@gmail.com> wrote:
> > > >
> > > >      On 25 October 2016 at 00:19, Robert Bragg <[2]
> robert@sixbynine.org>
> > > >      wrote:
> > > >
> > > >
> > > >
> > > >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > > >      b/drivers/gpu/drm/i915/i915_drv.h
> > > >      > index 3448d05..ea24814 100644
> > > >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > > >
> > > >      >
> > > >      >  struct drm_i915_private {
> > > >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > > >      >
> > > >      >         struct {
> > > >      >                 bool initialized;
> > > >      > +
> > > >      >                 struct mutex lock;
> > > >      >                 struct list_head streams;
> > > >      >
> > > >      > +               spinlock_t hook_lock;
> > > >      > +
> > > >      >                 struct {
> > > >      > -                       u32 metrics_set;
> > > >      > +                       struct i915_perf_stream
> > *exclusive_stream;
>
> OT:
> What kind of MUA are you using that mangles quoted mails like this? I've
> not seen it on intel-gfx before. mesa-dev seems rife with it, but as I
> rarely read that in any great detail I've managed to ignore it there.
> Anyways, it makes it espesially hard to navigate long mails since mutt's
> 'S' (skip quoted text) no longer works correctly.
>

Not sure I want to say, and get booted out the door :-)

I've heard that gmail has an annoying habit of forcibly wrapping plain text
emails like this, and a lot of people have complained that there's no way
to disable that 'feature' :-/

I used to use Mutt, but I don't think I could really bear to go back to it
any more. Last time I was using it I found myself spending too much time
patching it to try and make it work how I'd like, but can't say I got much
enjoyment from that process.

I've tried most MUA options available, and can't say any of them make me
very happy - I think these days it's just not something developers are very
interested in working on.

I'm a sell out and just use Gmail... sorry. I can't really see myself
changing, though I do wish Google weren't so pedantic about forcing
wrapping without any option to change that behaviour. I suspect you
wouldn't be happy with me sending html emails, which has been Google's
default response to this complaint afaik.

Maybe it's gmail users causing trouble on the Mesa list too.

- Robert

P.S please don't think lesser of me due to my misguided MUA choices.



>
> > > >      > +
> > > >      > +                       u32 specific_ctx_id;
> > > >      Can we just get rid of this, now that the vma remains pinned we
> can
> > > >      simply get the ggtt address at the time of configuring the
> > OA_CONTROL
> > > >      register ?
> > > >
> > > >    I considered that, but would ideally prefer to keep it considering
> > the
> > > >    gen8+ patches to come. For gen8+ (with execlists) the context ID
> > isn't a
> > > >    gtt offset.
> > >
> > > In terms of symmetry, keeping the vma you pinned and unpinning the same
> > > later makes its ownership much clearer. (And I do want the owner of
> each
> > > pin to be clear, for when we start enabling debug to catch the VMA
> > > leaks.)
> >
> > Keeping our own pointer to the pinned vma could be a clarification.
> >
> > Considering Matt's comments too, I'm thinking I'll put the pinning and
> > specific_ctx_id initialization together with setting stream->ctx, keeping
> > the state together under the stream. It's going to potentially mean
> > redundantly pinning the ctx for the sake of the ID in the future for
> > streams that don't really need it, but I think it's probably not worth
> > worrying about that.
> >
> > - Robert
> >
> > > -Chris
> > >
> > > --
> > > Chris Wilson, Intel Open Source Technology Centre
>
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
>
> --
> Ville Syrjälä
> Intel OTC
>
Daniel Vetter Oct. 26, 2016, 4:52 p.m. UTC | #11
On Wed, Oct 26, 2016 at 05:42:23PM +0100, Robert Bragg wrote:
> On Wed, Oct 26, 2016 at 4:37 PM, Ville Syrjälä <
> ville.syrjala@linux.intel.com> wrote:
> 
> > On Wed, Oct 26, 2016 at 04:17:45PM +0100, Robert Bragg wrote:
> > > On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk>
> > wrote:
> > > >
> > > > On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> > > > >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > > > >    <[1]matthew.william.auld@gmail.com> wrote:
> > > > >
> > > > >      On 25 October 2016 at 00:19, Robert Bragg <[2]
> > robert@sixbynine.org>
> > > > >      wrote:
> > > > >
> > > > >
> > > > >
> > > > >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > > > >      b/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > index 3448d05..ea24814 100644
> > > > >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > > > >
> > > > >      >
> > > > >      >  struct drm_i915_private {
> > > > >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > > > >      >
> > > > >      >         struct {
> > > > >      >                 bool initialized;
> > > > >      > +
> > > > >      >                 struct mutex lock;
> > > > >      >                 struct list_head streams;
> > > > >      >
> > > > >      > +               spinlock_t hook_lock;
> > > > >      > +
> > > > >      >                 struct {
> > > > >      > -                       u32 metrics_set;
> > > > >      > +                       struct i915_perf_stream
> > > *exclusive_stream;
> >
> > OT:
> > What kind of MUA are you using that mangles quoted mails like this? I've
> > not seen it on intel-gfx before. mesa-dev seems rife with it, but as I
> > rarely read that in any great detail I've managed to ignore it there.
> > Anyways, it makes it espesially hard to navigate long mails since mutt's
> > 'S' (skip quoted text) no longer works correctly.
> >
> 
> Not sure I want to say, and get booted out the door :-)
> 
> I've heard that gmail has an annoying habit of forcibly wrapping plain text
> emails like this, and a lot of people have complained that there's no way
> to disable that 'feature' :-/
> 
> I used to use Mutt, but I don't think I could really bare to go back to it
> any more. Last time I was using it I found myself spending too much time
> patching it to try and make it work how I'd like, but can't say I got much
> enjoyment from that process.
> 
> I've tried most MUA options available, and can't say any of them make me
> very happy - I think these days it's just not something developers are very
> interesting in working on.
> 
> I'm a sell out and just use Gmail... sorry. I can't really see myself
> changing, though I do wish Google weren't so pedantic about forcing
> wrapping without any option to change that behaviour. I suspect you
> wouldn't be happy with me sending html emails, which has been Google's
> default response to this complaint afik.
> 
> Maybe it's gmail users causing trouble on the Mesa list too.
> 
> - Robert
> 
> P.S please don't think lesser of me due to my misguided MUA choices.

I use a mix of mutt+gmail web interface, since each has their upsides.
Haven't yet seen badly misquoted stuff, I think it mostly seems to work
for me. And there's lots of kernel folks who use gmail too afaik.
-Daniel

> 
> 
> 
> >
> > > > >      > +
> > > > >      > +                       u32 specific_ctx_id;
> > > > >      Can we just get rid of this, now that the vma remains pinned we
> > can
> > > > >      simply get the ggtt address at the time of configuring the
> > > OA_CONTROL
> > > > >      register ?
> > > > >
> > > > >    I considered that, but would ideally prefer to keep it considering
> > > the
> > > > >    gen8+ patches to come. For gen8+ (with execlists) the context ID
> > > isn't a
> > > > >    gtt offset.
> > > >
> > > > In terms of symmetry, keeping the vma you pinned and unpinning the same
> > > > later makes its ownership much clearer. (And I do want the owner of
> > each
> > > > pin to be clear, for when we start enabling debug to catch the VMA
> > > > leaks.)
> > >
> > > Keeping our own pointer to the pinned vma could be a clarification.
> > >
> > > Considering Matt's comments too, I'm thinking I'll put the pinning and
> > > specific_ctx_id initialization together with setting stream->ctx, keeping
> > > the state together under the stream. It's going to potentially mean
> > > redundantly pinning the ctx for the sake of the ID in the future for
> > > streams that don't really need it, but I think it's probably not worth
> > > worrying about that.
> > >
> > > - Robert
> > >
> > > > -Chris
> > > >
> > > > --
> > > > Chris Wilson, Intel Open Source Technology Centre
> >
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> >
> >
> > --
> > Ville Syrjälä
> > Intel OTC
> >

> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ville Syrjala Oct. 26, 2016, 4:54 p.m. UTC | #12
On Wed, Oct 26, 2016 at 05:42:23PM +0100, Robert Bragg wrote:
> On Wed, Oct 26, 2016 at 4:37 PM, Ville Syrjälä <
> ville.syrjala@linux.intel.com> wrote:
> 
> > On Wed, Oct 26, 2016 at 04:17:45PM +0100, Robert Bragg wrote:
> > > On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk>
> > wrote:
> > > >
> > > > On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> > > > >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > > > >    <[1]matthew.william.auld@gmail.com> wrote:
> > > > >
> > > > >      On 25 October 2016 at 00:19, Robert Bragg <[2]
> > robert@sixbynine.org>
> > > > >      wrote:
> > > > >
> > > > >
> > > > >
> > > > >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > > > >      b/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > index 3448d05..ea24814 100644
> > > > >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > > >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > > > >
> > > > >      >
> > > > >      >  struct drm_i915_private {
> > > > >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > > > >      >
> > > > >      >         struct {
> > > > >      >                 bool initialized;
> > > > >      > +
> > > > >      >                 struct mutex lock;
> > > > >      >                 struct list_head streams;
> > > > >      >
> > > > >      > +               spinlock_t hook_lock;
> > > > >      > +
> > > > >      >                 struct {
> > > > >      > -                       u32 metrics_set;
> > > > >      > +                       struct i915_perf_stream
> > > *exclusive_stream;
> >
> > OT:
> > What kind of MUA are you using that mangles quoted mails like this? I've
> > not seen it on intel-gfx before. mesa-dev seems rife with it, but as I
> > rarely read that in any great detail I've managed to ignore it there.
> > Anyways, it makes it espesially hard to navigate long mails since mutt's
> > 'S' (skip quoted text) no longer works correctly.
> >
> 
> Not sure I want to say, and get booted out the door :-)
> 
> I've heard that gmail has an annoying habit of forcibly wrapping plain text
> emails like this, and a lot of people have complained that there's no way
> to disable that 'feature' :-/
> 
> I used to use Mutt, but I don't think I could really bare to go back to it
> any more. Last time I was using it I found myself spending too much time
> patching it to try and make it work how I'd like, but can't say I got much
> enjoyment from that process.

Isn't gmail just a pile of client side javascript or something? Maybe
you'd enjoy patching that one more? ;)

> 
> I've tried most MUA options available, and can't say any of them make me
> very happy - I think these days it's just not something developers are very
> interesting in working on.
> 
> I'm a sell out and just use Gmail... sorry. I can't really see myself
> changing, though I do wish Google weren't so pedantic about forcing
> wrapping without any option to change that behaviour. I suspect you
> wouldn't be happy with me sending html emails, which has been Google's
> default response to this complaint afik.
> 
> Maybe it's gmail users causing trouble on the Mesa list too.
> 
> - Robert
> 
> P.S please don't think lesser of me due to my misguided MUA choices.

I think I'll just reserve the right to ignore any mail with bad quoting.
Robert Bragg Oct. 26, 2016, 6:53 p.m. UTC | #13
On 26 Oct 2016 5:54 p.m., "Ville Syrjälä" <ville.syrjala@linux.intel.com>
wrote:
>
> On Wed, Oct 26, 2016 at 05:42:23PM +0100, Robert Bragg wrote:
> > On Wed, Oct 26, 2016 at 4:37 PM, Ville Syrjälä <
> > ville.syrjala@linux.intel.com> wrote:
> >
> > > On Wed, Oct 26, 2016 at 04:17:45PM +0100, Robert Bragg wrote:
> > > > On 26 Oct 2016 9:54 a.m., "Chris Wilson" <chris@chris-wilson.co.uk>
> > > wrote:
> > > > >
> > > > > On Wed, Oct 26, 2016 at 12:51:58AM +0100, Robert Bragg wrote:
> > > > > >    On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > > > > >    <[1]matthew.william.auld@gmail.com> wrote:
> > > > > >
> > > > > >      On 25 October 2016 at 00:19, Robert Bragg <[2]
> > > robert@sixbynine.org>
> > > > > >      wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > >      > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > > > > >      b/drivers/gpu/drm/i915/i915_drv.h
> > > > > >      > index 3448d05..ea24814 100644
> > > > > >      > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > > > >      > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > > > >      > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > > > > >
> > > > > >      >
> > > > > >      >  struct drm_i915_private {
> > > > > >      > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > > > > >      >
> > > > > >      >         struct {
> > > > > >      >                 bool initialized;
> > > > > >      > +
> > > > > >      >                 struct mutex lock;
> > > > > >      >                 struct list_head streams;
> > > > > >      >
> > > > > >      > +               spinlock_t hook_lock;
> > > > > >      > +
> > > > > >      >                 struct {
> > > > > >      > -                       u32 metrics_set;
> > > > > >      > +                       struct i915_perf_stream
> > > > *exclusive_stream;
> > >
> > > OT:
> > > What kind of MUA are you using that mangles quoted mails like this?
I've
> > > not seen it on intel-gfx before. mesa-dev seems rife with it, but as I
> > > rarely read that in any great detail I've managed to ignore it there.
> > > Anyways, it makes it espesially hard to navigate long mails since
mutt's
> > > 'S' (skip quoted text) no longer works correctly.
> > >
> >
> > Not sure I want to say, and get booted out the door :-)
> >
> > I've heard that gmail has an annoying habit of forcibly wrapping plain
text
> > emails like this, and a lot of people have complained that there's no
way
> > to disable that 'feature' :-/
> >
> > I used to use Mutt, but I don't think I could really bare to go back to
it
> > any more. Last time I was using it I found myself spending too much time
> > patching it to try and make it work how I'd like, but can't say I got
much
> > enjoyment from that process.
>
> Isn't gmail just a pile of client side javascript or something? Maybe
> you'd enjoy patching that one more? ;)
>
> >
> > I've tried most MUA options available, and can't say any of them make me
> > very happy - I think these days it's just not something developers are
very
> > interesting in working on.
> >
> > I'm a sell out and just use Gmail... sorry. I can't really see myself
> > changing, though I do wish Google weren't so pedantic about forcing
> > wrapping without any option to change that behaviour. I suspect you
> > wouldn't be happy with me sending html emails, which has been Google's
> > default response to this complaint afik.
> >
> > Maybe it's gmail users causing trouble on the Mesa list too.
> >
> > - Robert
> >
> > P.S please don't think lesser of me due to my misguided MUA choices.
>
> I think I'll just reserve the right to ignore any mail with bad quoting.

Okay, fwiw, at least my patches sent out via git send-email should be fine,
so maybe just ignore my replies to feedback - which I promise not to
exploit to achieve 'consensus' through silence.

- Robert

--
Sent from Gmail on Android, in a spare moment at a VR for Immersive Theatre
meet up.

>
> --
> Ville Syrjälä
> Intel OTC
Robert Bragg Oct. 26, 2016, 9:53 p.m. UTC | #14
On Wed, Oct 26, 2016 at 4:03 PM, Robert Bragg <robert.bragg@gmail.com>
wrote:

> On 26 Oct 2016 11:08 a.m., "Matthew Auld" <matthew.william.auld@gmail.com>
> wrote:
> >
> > On 26 October 2016 at 00:51, Robert Bragg <robert@sixbynine.org> wrote:
> > >
> > >
> > > On Tue, Oct 25, 2016 at 10:35 PM, Matthew Auld
> > > <matthew.william.auld@gmail.com> wrote:
> > >>
> > >> On 25 October 2016 at 00:19, Robert Bragg <robert@sixbynine.org>
> wrote:
> > >
> > >
> > >>
> > >>
> > >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > >> > b/drivers/gpu/drm/i915/i915_drv.h
> > >> > index 3448d05..ea24814 100644
> > >> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > >> > @@ -1764,6 +1764,11 @@ struct intel_wm_config {
> > >>
> > >> >
> > >> >  struct drm_i915_private {
> > >> > @@ -2149,16 +2164,46 @@ struct drm_i915_private {
> > >> >
> > >> >         struct {
> > >> >                 bool initialized;
> > >> > +
> > >> >                 struct mutex lock;
> > >> >                 struct list_head streams;
> > >> >
> > >> > +               spinlock_t hook_lock;
> > >> > +
> > >> >                 struct {
> > >> > -                       u32 metrics_set;
> > >> > +                       struct i915_perf_stream *exclusive_stream;
> > >> > +
> > >> > +                       u32 specific_ctx_id;
> > >> Can we just get rid of this, now that the vma remains pinned we can
> > >> simply get the ggtt address at the time of configuring the OA_CONTROL
> > >> register ?
> > >
> > >
> > > I considered that, but would ideally prefer to keep it considering the
> gen8+
> > > patches to come. For gen8+ (with execlists) the context ID isn't a gtt
> > > offset.
> > >
> > >>
> > >>
> > >> > +
> > >> > +                       struct hrtimer poll_check_timer;
> > >> > +                       wait_queue_head_t poll_wq;
> > >> > +                       atomic_t pollin;
> > >> > +
> > >>
> > >
> > >>
> > >> > +/* The maximum exponent the hardware accepts is 63 (essentially it
> > >> > selects one
> > >> > + * of the 64bit timestamp bits to trigger reports from) but there's
> > >> > currently
> > >> > + * no known use case for sampling as infrequently as once per 47
> > >> > thousand years.
> > >> > + *
> > >> > + * Since the timestamps included in OA reports are only 32bits it
> seems
> > >> > + * reasonable to limit the OA exponent where it's still possible to
> > >> > account for
> > >> > + * overflow in OA report timestamps.
> > >> > + */
> > >> > +#define OA_EXPONENT_MAX 31
> > >> > +
> > >> > +#define INVALID_CTX_ID 0xffffffff
> > >> We shouldn't need this anymore.
> > >
> > >
> > > yeah I removed it and then added it back, just for the sake of
> explicitly
> > > setting the specific_ctx_id to an invalid ID when closing the exclusive
> > > stream - though resetting the value isn't strictly necessary.
> > Can we not make the specific_ctx_id per-stream, the gem context
> > already is, then we don't need to be concerned with resetting it ?
>
> Hmm, I'm not sure about that, conceptually to me it's global OA unit state.
>
> Currently the driver only supports a single exclusive stream, while Sourab
> later relaxes that to a per-engine stream and that could be relaxed further
> with non-oa metric stream types.
>
> With multiple streams we'll still only be able to program a single ctx
> id in oacontrol.
>
> Conceptually to me, other stream types could be associated with different
> contexts (if they don't depend on the OA unit) so to me stream->ctx isn't
> necessarily OA unit state.
>
> It probably could be played around with, but right now we don't track OA
> specific state in the stream. For the ID it's just semantics to say it's OA
> state, and we could consider that it's maybe generally useful to track the
> ID, even for future non-oa streams. That might mean potentially redundantly
> pinning state for the sake of tracking the ID for streams that don't end up
> needing it.
>

I started to try out moving the specific_ctx_id and vma pointer (new) to
the stream, and also looked at initializing them together with the
stream->ctx reference, but I'm not really happy with how it's looking.

The specific_ctx_id and pinning are only for the render context, since the
OA unit is only well integrated with the render engine, which makes me more
inclined to consider them OA stream specific, not something we want/need
for all streams (considering that Sourab enables multiple streams in his
series).

Btw, for reference, my patches for gen8+ can also end up making use of the
INVALID_CTX_ID define (when overwriting the undefined ctx_id field in HW
reports when the report's ctx-id is flagged as invalid by the OA unit.) so
we maybe don't want to worry too much about removing the need for it here.

- Robert
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3448d05..ea24814 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1764,6 +1764,11 @@  struct intel_wm_config {
 	bool sprites_scaled;
 };
 
+struct i915_oa_format {
+	u32 format;
+	int size;
+};
+
 struct i915_oa_reg {
 	i915_reg_t addr;
 	u32 value;
@@ -1784,11 +1789,6 @@  struct i915_perf_stream_ops {
 	 */
 	void (*disable)(struct i915_perf_stream *stream);
 
-	/* Return: true if any i915 perf records are ready to read()
-	 * for this stream.
-	 */
-	bool (*can_read)(struct i915_perf_stream *stream);
-
 	/* Call poll_wait, passing a wait queue that will be woken
 	 * once there is something ready to read() for the stream
 	 */
@@ -1798,9 +1798,7 @@  struct i915_perf_stream_ops {
 
 	/* For handling a blocking read, wait until there is something
 	 * to ready to read() for the stream. E.g. wait on the same
-	 * wait queue that would be passed to poll_wait() until
-	 * ->can_read() returns true (if its safe to call ->can_read()
-	 * without the i915 perf lock held).
+	 * wait queue that would be passed to poll_wait().
 	 */
 	int (*wait_unlocked)(struct i915_perf_stream *stream);
 
@@ -1840,11 +1838,28 @@  struct i915_perf_stream {
 	struct list_head link;
 
 	u32 sample_flags;
+	int sample_size;
 
 	struct i915_gem_context *ctx;
 	bool enabled;
 
-	struct i915_perf_stream_ops *ops;
+	const struct i915_perf_stream_ops *ops;
+};
+
+struct i915_oa_ops {
+	void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
+	int (*enable_metric_set)(struct drm_i915_private *dev_priv);
+	void (*disable_metric_set)(struct drm_i915_private *dev_priv);
+	void (*oa_enable)(struct drm_i915_private *dev_priv);
+	void (*oa_disable)(struct drm_i915_private *dev_priv);
+	void (*update_oacontrol)(struct drm_i915_private *dev_priv);
+	void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
+					u32 ctx_id);
+	int (*read)(struct i915_perf_stream *stream,
+		    char __user *buf,
+		    size_t count,
+		    size_t *offset);
+	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
 };
 
 struct drm_i915_private {
@@ -2149,16 +2164,46 @@  struct drm_i915_private {
 
 	struct {
 		bool initialized;
+
 		struct mutex lock;
 		struct list_head streams;
 
+		spinlock_t hook_lock;
+
 		struct {
-			u32 metrics_set;
+			struct i915_perf_stream *exclusive_stream;
+
+			u32 specific_ctx_id;
+
+			struct hrtimer poll_check_timer;
+			wait_queue_head_t poll_wq;
+			atomic_t pollin;
+
+			bool periodic;
+			int period_exponent;
+			int timestamp_frequency;
+
+			int tail_margin;
+
+			int metrics_set;
 
 			const struct i915_oa_reg *mux_regs;
 			int mux_regs_len;
 			const struct i915_oa_reg *b_counter_regs;
 			int b_counter_regs_len;
+
+			struct {
+				struct i915_vma *vma;
+				u8 *vaddr;
+				int format;
+				int format_size;
+			} oa_buffer;
+
+			u32 gen7_latched_oastatus1;
+
+			struct i915_oa_ops ops;
+			const struct i915_oa_format *oa_formats;
+			int n_builtin_sets;
 		} oa;
 	} perf;
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4d51586..d7a4899 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -25,16 +25,867 @@ 
  */
 
 #include <linux/anon_inodes.h>
+#include <linux/sizes.h>
 
 #include "i915_drv.h"
+#include "i915_oa_hsw.h"
+
+/* HW requires this to be a power of two, between 128k and 16M, though driver
+ * is currently generally designed assuming the largest 16M size is used such
+ * that the overflow cases are unlikely in normal operation.
+ */
+#define OA_BUFFER_SIZE		SZ_16M
+
+#define OA_TAKEN(tail, head)	((tail - head) & (OA_BUFFER_SIZE - 1))
+
+/* There's a HW race condition between OA unit tail pointer register updates and
+ * writes to memory whereby the tail pointer can sometimes get ahead of what's
+ * been written out to the OA buffer so far.
+ *
+ * Although this can be observed explicitly by checking for a zeroed report-id
+ * field in tail reports, it seems preferable to account for this earlier e.g.
+ * as part of the _oa_buffer_is_empty checks to minimize -EAGAIN polling cycles
+ * in this situation.
+ *
+ * To give time for the most recent reports to land before they may be copied to
+ * userspace, the driver operates as if the tail pointer effectively lags behind
+ * the HW tail pointer by 'tail_margin' bytes. The margin in bytes is calculated
+ * based on this constant in nanoseconds, the current OA sampling exponent
+ * and current report size.
+ *
+ * There is also a fallback check while reading to simply skip over reports with
+ * a zeroed report-id.
+ */
+#define OA_TAIL_MARGIN_NSEC	100000ULL
+
+/* frequency for checking whether the OA unit has written new reports to the
+ * circular OA buffer...
+ */
+#define POLL_FREQUENCY 200
+#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
+
+/* The maximum exponent the hardware accepts is 63 (essentially it selects one
+ * of the 64bit timestamp bits to trigger reports from) but there's currently
+ * no known use case for sampling as infrequently as once per 47 thousand years.
+ *
+ * Since the timestamps included in OA reports are only 32bits it seems
+ * reasonable to limit the OA exponent where it's still possible to account for
+ * overflow in OA report timestamps.
+ */
+#define OA_EXPONENT_MAX 31
+
+#define INVALID_CTX_ID 0xffffffff
+
+
+/* XXX: beware if future OA HW adds new report formats that the current
+ * code assumes all reports have a power-of-two size and ~(size - 1) can
+ * be used as a mask to align the OA tail pointer.
+ */
+static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A13]	    = { 0, 64 },
+	[I915_OA_FORMAT_A29]	    = { 1, 128 },
+	[I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
+	/* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
+	[I915_OA_FORMAT_B4_C8]	    = { 4, 64 },
+	[I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
+	[I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
+	[I915_OA_FORMAT_C4_B8]	    = { 7, 64 },
+};
+
+#define SAMPLE_OA_REPORT      (1<<0)
 
 struct perf_open_properties {
 	u32 sample_flags;
 
 	u64 single_context:1;
 	u64 ctx_handle;
+
+	/* OA sampling state */
+	int metrics_set;
+	int oa_format;
+	bool oa_periodic;
+	int oa_period_exponent;
 };
 
+/* NB: This is either called via fops or the poll check hrtimer (atomic ctx).
+ * Returns true when fewer than tail_margin + one report's worth of bytes lie
+ * between the HW head and tail pointers, i.e. nothing is safe to read yet.
+ *
+ * It's safe to read OA config state here unlocked, assuming that this is only
+ * called while the stream is enabled, while the global OA configuration can't
+ * be modified. We also don't lock around the head/tail reads even though
+ * there's the slim possibility of read() fop errors forcing a re-init of the
+ * OA buffer pointers: such a race could give a false positive !empty status
+ * which is acceptable.
+ */
+static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
+{
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
+	u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
+	u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+	u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+
+	return OA_TAKEN(tail, head) <
+		dev_priv->perf.oa.tail_margin + report_size;
+}
+
+/**
+ * append_oa_status - Appends a header-only status record to a read() buffer.
+ */
+static int append_oa_status(struct i915_perf_stream *stream,
+			    char __user *buf,
+			    size_t count,
+			    size_t *offset,
+			    enum drm_i915_perf_record_type type)
+{
+	struct drm_i915_perf_record_header header = { type, 0, sizeof(header) };
+
+	if ((count - *offset) < header.size)
+		return -ENOSPC;
+
+	if (copy_to_user(buf + *offset, &header, sizeof(header)))
+		return -EFAULT;
+
+	(*offset) += header.size;
+
+	return 0;
+}
+
+/**
+ * append_oa_sample - Copies a single OA report into userspace read() buffer.
+ */
+static int append_oa_sample(struct i915_perf_stream *stream,
+			    char __user *buf,
+			    size_t count,
+			    size_t *offset,
+			    const u8 *report)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	struct drm_i915_perf_record_header header;
+	u32 sample_flags = stream->sample_flags;
+
+	header.type = DRM_I915_PERF_RECORD_SAMPLE;
+	header.pad = 0;
+	header.size = stream->sample_size;
+
+	if ((count - *offset) < header.size)
+		return -ENOSPC;
+
+	buf += *offset;
+	if (copy_to_user(buf, &header, sizeof(header)))
+		return -EFAULT;
+	buf += sizeof(header);
+
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		if (copy_to_user(buf, report, report_size))
+			return -EFAULT;
+	}
+
+	(*offset) += header.size;
+
+	return 0;
+}
+
+/**
+ * gen7_append_oa_reports - Copies buffered OA reports to a read() buffer.
+ * @head_ptr: (inout): the head pointer before and after appending
+ *
+ * Returns 0 on success, negative error code on failure: -EIO after a forced
+ * OA restart, -EAGAIN when fewer than report_size + tail_margin bytes wait.
+ * Notably any error condition resulting in a short read (-ENOSPC or
+ * -EFAULT) will be returned even though one or more records may
+ * have been successfully copied. In this case it's up to the caller
+ * to decide if the error should be squashed before returning to
+ * userspace.
+ */
+static int gen7_append_oa_reports(struct i915_perf_stream *stream,
+				  char __user *buf,
+				  size_t count,
+				  size_t *offset,
+				  u32 *head_ptr,
+				  u32 tail)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
+	int tail_margin = dev_priv->perf.oa.tail_margin;
+	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u32 head;
+	u32 taken;
+	int ret = 0;
+
+	BUG_ON(!stream->enabled);
+
+	head = *head_ptr - gtt_offset;
+	tail -= gtt_offset;
+
+	/* The OA unit is expected to wrap the tail pointer according to the OA
+	 * buffer size and since we should never write a misaligned head
+	 * pointer we don't expect to read one back either...
+	 */
+	if (tail > OA_BUFFER_SIZE || head > OA_BUFFER_SIZE ||
+	    head % report_size) {
+		DRM_ERROR("Inconsistent OA buffer pointer (head = %u, tail = %u): force restart\n",
+			  head, tail);
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
+		*head_ptr = I915_READ(GEN7_OASTATUS2) &
+			GEN7_OASTATUS2_HEAD_MASK;
+		return -EIO;
+	}
+
+
+	/* The tail pointer increases in 64 byte increments, not in report_size
+	 * steps...
+	 */
+	tail &= ~(report_size - 1);
+
+	/* Move the tail pointer back by the current tail_margin to account for
+	 * the possibility that the latest reports may not have really landed
+	 * in memory yet...
+	 */
+
+	if (OA_TAKEN(tail, head) < report_size + tail_margin)
+		return -EAGAIN;
+
+	tail -= tail_margin;
+	tail &= mask;
+
+	for (/* none */;
+	     (taken = OA_TAKEN(tail, head));
+	     head = (head + report_size) & mask) {
+		u8 *report = oa_buf_base + head;
+		u32 *report32 = (void *)report;
+
+		/* All the report sizes factor neatly into the buffer
+		 * size so we never expect to see a report split
+		 * between the beginning and end of the buffer.
+		 *
+		 * Given the initial alignment check a misalignment
+		 * here would imply a driver bug that would result
+		 * in an overrun.
+		 */
+		BUG_ON((OA_BUFFER_SIZE - head) < report_size);
+
+		/* The report-ID field for periodic samples includes
+		 * some undocumented flags related to what triggered
+		 * the report and is never expected to be zero so we
+		 * can check that the report isn't invalid before
+		 * copying it to userspace...
+		 */
+		if (report32[0] == 0) {
+			DRM_ERROR("Skipping spurious, invalid OA report\n");
+			continue;
+		}
+
+		ret = append_oa_sample(stream, buf, count, offset, report);
+		if (ret)
+			break;
+
+		/* The above report-id field sanity check is based on
+		 * the assumption that the OA buffer is initially
+		 * zeroed and we reset the field after copying so the
+		 * check is still meaningful once old reports start
+		 * being overwritten.
+		 */
+		report32[0] = 0;
+	}
+
+	*head_ptr = gtt_offset + head;
+
+	return ret;
+}
+
+static int gen7_oa_read(struct i915_perf_stream *stream,
+			char __user *buf,
+			size_t count,
+			size_t *offset)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 oastatus2;
+	u32 oastatus1;
+	u32 head;
+	u32 tail;
+	int ret;
+
+	BUG_ON(!dev_priv->perf.oa.oa_buffer.vaddr);
+
+	oastatus2 = I915_READ(GEN7_OASTATUS2);
+	oastatus1 = I915_READ(GEN7_OASTATUS1);
+
+	head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+	tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+
+	/* XXX: On Haswell we don't have a safe way to clear oastatus1
+	 * bits while the OA unit is enabled (while the tail pointer
+	 * may be updated asynchronously) so we ignore status bits
+	 * that have already been reported to userspace.
+	 */
+	oastatus1 &= ~dev_priv->perf.oa.gen7_latched_oastatus1;
+
+	/* We treat OABUFFER_OVERFLOW as a significant error:
+	 *
+	 * - The status can be interpreted to mean that the buffer is
+	 *   currently full (with a higher precedence than OA_TAKEN()
+	 *   which will start to report a near-empty buffer after an
+	 *   overflow) but it's awkward that we can't clear the status
+	 *   on Haswell, so without a reset we won't be able to catch
+	 *   the state again.
+	 *
+	 * - Since it also implies the HW has started overwriting old
+	 *   reports it may also affect our sanity checks for invalid
+	 *   reports when copying to userspace that assume new reports
+	 *   are being written to cleared memory.
+	 *
+	 * - In the future we may want to introduce a flight recorder
+	 *   mode where the driver will automatically maintain a safe
+	 *   guard band between head/tail, avoiding this overflow
+	 *   condition, but we avoid the added driver complexity for
+	 *   now.
+	 */
+	if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
+		ret = append_oa_status(stream, buf, count, offset,
+				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
+		if (ret)
+			return ret;
+
+		DRM_ERROR("OA buffer overflow: force restart\n");
+
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
+
+		oastatus2 = I915_READ(GEN7_OASTATUS2);
+		oastatus1 = I915_READ(GEN7_OASTATUS1);
+
+		head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+		tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+	}
+
+	if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
+		ret = append_oa_status(stream, buf, count, offset,
+				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
+		if (ret)
+			return ret;
+		dev_priv->perf.oa.gen7_latched_oastatus1 |=
+			GEN7_OASTATUS1_REPORT_LOST;
+	}
+
+	ret = gen7_append_oa_reports(stream, buf, count, offset,
+				     &head, tail);
+
+	/* All the report sizes are a power of two and the
+	 * head should always be incremented by some multiple
+	 * of the report size.
+	 *
+	 * A warning here, but notably if we later read back a
+	 * misaligned pointer we will treat that as a bug since
+	 * it could lead to a buffer overrun.
+	 */
+	WARN_ONCE(head & (report_size - 1),
+		  "i915: Writing misaligned OA head pointer\n");
+
+	/* Note: we update the head pointer here even if an error
+	 * was returned since the error may represent a short read
+	 * where some reports were successfully copied.
+	 */
+	I915_WRITE(GEN7_OASTATUS2,
+		   ((head & GEN7_OASTATUS2_HEAD_MASK) |
+		    OA_MEM_SELECT_GGTT));
+
+	return ret;
+}
+
+/* Sleep (interruptibly) on the OA poll wait queue until the OA buffer is
+ * no longer empty.
+ *
+ * Returns -EIO straight away when periodic sampling is disabled, otherwise
+ * the result of wait_event_interruptible().
+ */
+static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *i915 = stream->dev_priv;
+
+	/* Running the oa_buffer_is_empty() condition unlocked is ok: it
+	 * just performs mmio reads of the OA buffer head + tail pointers,
+	 * and it's assumed we're handling some operation (such as a read())
+	 * that implies the stream can't be destroyed until completion, so
+	 * the device + OA buffer can't disappear.
+	 */
+	if (i915->perf.oa.periodic)
+		return wait_event_interruptible(i915->perf.oa.poll_wq,
+						!i915->perf.oa.ops.oa_buffer_is_empty(i915));
+
+	/* We would wait indefinitely if periodic sampling is not enabled */
+	return -EIO;
+}
+
+/* Register @file/@wait on the OA poll wait queue via poll_wait(). */
+static void i915_oa_poll_wait(struct i915_perf_stream *stream,
+			      struct file *file,
+			      poll_table *wait)
+{
+	poll_wait(file, &stream->dev_priv->perf.oa.poll_wq, wait);
+}
+
+/* Stream ->read hook: hand the request straight to the platform specific
+ * OA read op and return whatever it returns.
+ */
+static int i915_oa_read(struct i915_perf_stream *stream,
+			char __user *buf,
+			size_t count,
+			size_t *offset)
+{
+	return stream->dev_priv->perf.oa.ops.read(stream, buf, count, offset);
+}
+
+/* Pin the render-engine state of the stream's context into the GGTT and
+ * record its GGTT offset in perf.oa.specific_ctx_id.
+ *
+ * Returns 0 on success, or the error from taking struct_mutex or from
+ * i915_vma_pin(). struct_mutex is released on every path once it has been
+ * taken.
+ */
+static int claim_specific_ctx(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_vma *vma;
+	int ret;
+
+	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+	if (ret)
+		return ret;
+
+	/* So that we don't have to worry about updating the context ID
+	 * in OACONTROL on the fly we make sure to pin the context
+	 * upfront for the lifetime of the stream...
+	 */
+	vma = stream->ctx->engine[RCS].state;
+	ret = i915_vma_pin(vma, 0, stream->ctx->ggtt_alignment,
+			   PIN_GLOBAL | PIN_HIGH);
+	if (ret)
+		goto unlock;	/* must not return with struct_mutex held */
+
+	dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
+
+unlock:
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+
+	return ret;
+}
+
+/* Drop the pin on the context's render-engine state and invalidate the
+ * cached context ID, both under struct_mutex.
+ */
+static void release_specific_ctx(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *i915 = stream->dev_priv;
+	struct mutex *lock = &i915->drm.struct_mutex;
+
+	mutex_lock(lock);
+	i915_vma_unpin(stream->ctx->engine[RCS].state);
+	i915->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+	mutex_unlock(lock);
+}
+
+/* Release the OA buffer under struct_mutex: unmap it, unpin its vma, drop
+ * the object reference and clear the cached vma/vaddr pointers.
+ */
+static void
+free_oa_buffer(struct drm_i915_private *i915)
+{
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *vma;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	vma = i915->perf.oa.oa_buffer.vma;
+	obj = vma->obj;
+
+	i915_gem_object_unpin_map(obj);
+	i915_vma_unpin(vma);
+	i915_gem_object_put(obj);
+
+	i915->perf.oa.oa_buffer.vma = NULL;
+	i915->perf.oa.oa_buffer.vaddr = NULL;
+
+	mutex_unlock(&i915->drm.struct_mutex);
+}
+
+/* Stream ->destroy hook for the exclusive OA stream.
+ *
+ * In order: disables the metric set, frees the OA buffer, drops the
+ * forcewake and runtime-pm references, unpins the specific context (when
+ * the stream has one) and finally gives up exclusive use of the OA unit.
+ */
+static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *i915 = stream->dev_priv;
+
+	/* Only the current exclusive stream may be destroyed here */
+	BUG_ON(stream != i915->perf.oa.exclusive_stream);
+
+	i915->perf.oa.ops.disable_metric_set(i915);
+	free_oa_buffer(i915);
+
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+	intel_runtime_pm_put(i915);
+
+	if (stream->ctx)
+		release_specific_ctx(stream);
+
+	i915->perf.oa.exclusive_stream = NULL;
+}
+
+/* (Re)initialise the Haswell OA buffer: point the head, buffer and tail
+ * registers at the start of the buffer, forget latched status flags, zero
+ * the buffer contents and clear the pollin flag.
+ */
+static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	u32 base = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
+
+	/* Pre-DevBDW: OABUFFER must be set with counters off,
+	 * before OASTATUS1, but after OASTATUS2
+	 */
+	I915_WRITE(GEN7_OASTATUS2, base | OA_MEM_SELECT_GGTT); /* head */
+	I915_WRITE(GEN7_OABUFFER, base);
+	I915_WRITE(GEN7_OASTATUS1, base | OABUFFER_SIZE_16M); /* tail */
+
+	/* OASTATUS1 flags can't be cleared on Haswell while periodic
+	 * sampling is enabled, so the flags already seen are tracked
+	 * in software instead; start from a clean slate.
+	 */
+	dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
+
+	/* The buffer starts out zeroed when first allocated via shmfs,
+	 * which makes this memset redundant in that case, but we may
+	 * also get here when re-enabling a stream or from error/reset
+	 * paths.
+	 *
+	 * gen7_append_oa_reports() sanity checks that the report-id
+	 * field is non-zero, which only works if new reports are
+	 * always written to zeroed memory; hence the clear on every
+	 * re-init.
+	 */
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
+
+	/* Maybe make ->pollin per-stream state if we support multiple
+	 * concurrent streams in the future.
+	 */
+	atomic_set(&dev_priv->perf.oa.pollin, false);
+}
+
+/* Allocate the OA buffer object, pin it into the GGTT with 16M alignment,
+ * map it for CPU access and run the platform init_oa_buffer hook.
+ *
+ * Returns 0 on success or a negative error code. If a step after object
+ * creation fails, the object reference is dropped and the cached vma/vaddr
+ * pointers are reset to NULL. struct_mutex is held for the duration and
+ * released before returning.
+ */
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	struct drm_i915_gem_object *bo;
+	struct i915_vma *vma;
+	int ret;
+
+	BUG_ON(dev_priv->perf.oa.oa_buffer.vma);
+
+	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+	if (ret)
+		return ret;
+
+	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
+	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
+
+	bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
+	if (IS_ERR(bo)) {
+		DRM_ERROR("Failed to allocate OA buffer\n");
+		ret = PTR_ERR(bo);
+		goto unlock;
+	}
+
+	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
+	if (ret)
+		goto err_unref;
+
+	/* PreHSW required 512K alignment, HSW requires 16M */
+	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err_unref;
+	}
+	dev_priv->perf.oa.oa_buffer.vma = vma;
+
+	dev_priv->perf.oa.oa_buffer.vaddr =
+		i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
+		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
+		goto err_unpin;
+	}
+
+	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
+
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
+			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
+			 dev_priv->perf.oa.oa_buffer.vaddr);
+
+	/* Success: ret is still 0 from the cache-level call above */
+	goto unlock;
+
+err_unpin:
+	__i915_vma_unpin(vma);
+
+err_unref:
+	i915_gem_object_put(bo);
+
+	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
+	dev_priv->perf.oa.oa_buffer.vma = NULL;
+
+unlock:
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+	return ret;
+}
+
+/* Write each of the first @n_regs address/value pairs in @regs to the
+ * hardware, in array order.
+ */
+static void config_oa_regs(struct drm_i915_private *dev_priv,
+			   const struct i915_oa_reg *regs,
+			   int n_regs)
+{
+	int remaining = n_regs;
+	const struct i915_oa_reg *reg = regs;
+
+	while (remaining-- > 0) {
+		I915_WRITE(reg->addr, reg->value);
+		reg++;
+	}
+}
+
+/* Program the selected Haswell metric set: select the set, enable NOA,
+ * disable the clock gating that would stall the OA unit, write the MUX
+ * configuration, wait for it to settle, then write the B counter
+ * configuration.
+ *
+ * Returns 0 on success or the error from i915_oa_select_metric_set_hsw().
+ */
+static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	u32 val;
+	int ret;
+
+	ret = i915_oa_select_metric_set_hsw(dev_priv);
+	if (ret)
+		return ret;
+
+	val = I915_READ(GDT_CHICKEN_BITS);
+	I915_WRITE(GDT_CHICKEN_BITS, val | GT_NOA_ENABLE);
+
+	/* PRM:
+	 *
+	 * OA unit is using “crclk” for its functionality. When trunk
+	 * level clock gating takes place, OA clock would be gated,
+	 * unable to count the events from non-render clock domain.
+	 * Render clock gating must be disabled when OA is enabled to
+	 * count the events from non-render domain. Unit level clock
+	 * gating for RCS should also be disabled.
+	 */
+	val = I915_READ(GEN7_MISCCPCTL);
+	I915_WRITE(GEN7_MISCCPCTL, val & ~GEN7_DOP_CLOCK_GATE_ENABLE);
+
+	val = I915_READ(GEN6_UCGCTL1);
+	I915_WRITE(GEN6_UCGCTL1, val | GEN6_CSUNIT_CLOCK_GATE_DISABLE);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+
+	/* It apparently takes a fairly long time for a new MUX
+	 * configuration to be applied after these register writes.
+	 * This delay duration was derived empirically based on the
+	 * render_basic config but hopefully it covers the maximum
+	 * configuration latency.
+	 *
+	 * As a fallback, the checks in _append_oa_reports() to skip
+	 * invalid OA reports do also seem to work to discard reports
+	 * generated before this config has completed - albeit not
+	 * silently.
+	 *
+	 * Unfortunately this is essentially a magic number, since we
+	 * don't currently know of a reliable mechanism for predicting
+	 * how long the MUX config will take to apply and besides
+	 * seeing invalid reports we don't know of a reliable way to
+	 * explicitly check that the MUX config has landed.
+	 *
+	 * It's even possible we've mischaracterized the underlying
+	 * problem - it just seems like the simplest explanation why
+	 * a delay at this location would mitigate any invalid reports.
+	 */
+	usleep_range(15000, 20000);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	return 0;
+}
+
+/* Revert the clock gating and NOA changes made by
+ * hsw_enable_metric_set(), in the opposite order.
+ */
+static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	u32 val;
+
+	val = I915_READ(GEN6_UCGCTL1);
+	I915_WRITE(GEN6_UCGCTL1, val & ~GEN6_CSUNIT_CLOCK_GATE_DISABLE);
+
+	val = I915_READ(GEN7_MISCCPCTL);
+	I915_WRITE(GEN7_MISCCPCTL, val | GEN7_DOP_CLOCK_GATE_ENABLE);
+
+	val = I915_READ(GDT_CHICKEN_BITS);
+	I915_WRITE(GDT_CHICKEN_BITS, val & ~GT_NOA_ENABLE);
+}
+
+/* Program OACONTROL from the exclusive stream's state.
+ *
+ * Caller must hold perf.hook_lock. A disabled stream results in OACONTROL
+ * being written as 0; otherwise the context ID, timer period, report
+ * format and enable bits are combined into a single write.
+ */
+static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
+{
+	struct i915_perf_stream *stream;
+	u32 ctx_id, period_exponent, report_format;
+	u32 oacontrol;
+
+	assert_spin_locked(&dev_priv->perf.hook_lock);
+
+	stream = dev_priv->perf.oa.exclusive_stream;
+	if (!stream->enabled) {
+		I915_WRITE(GEN7_OACONTROL, 0);
+		return;
+	}
+
+	ctx_id = dev_priv->perf.oa.specific_ctx_id;
+	period_exponent = dev_priv->perf.oa.period_exponent;
+	report_format = dev_priv->perf.oa.oa_buffer.format;
+
+	oacontrol = GEN7_OACONTROL_ENABLE;
+	oacontrol |= ctx_id & GEN7_OACONTROL_CTX_MASK;
+	oacontrol |= period_exponent << GEN7_OACONTROL_TIMER_PERIOD_SHIFT;
+	oacontrol |= report_format << GEN7_OACONTROL_FORMAT_SHIFT;
+
+	/* The timer only runs when periodic sampling was requested */
+	if (dev_priv->perf.oa.periodic)
+		oacontrol |= GEN7_OACONTROL_TIMER_ENABLE;
+
+	/* Per-context filtering only when the stream has a context */
+	if (stream->ctx)
+		oacontrol |= GEN7_OACONTROL_PER_CTX_ENABLE;
+
+	I915_WRITE(GEN7_OACONTROL, oacontrol);
+}
+
+/* Start OA capture on Haswell: reset the buffer state, then program
+ * OACONTROL from the current stream configuration under hook_lock.
+ */
+static void gen7_oa_enable(struct drm_i915_private *dev_priv)
+{
+	unsigned long irqflags;
+
+	/* Re-initialising the buffer first means no report from before
+	 * this point gets forwarded.
+	 *
+	 * It also clears the latched status flags and zeroes the buffer
+	 * itself in error paths; the checks for invalid reports assume
+	 * certain fields are written to zeroed memory, so think
+	 * carefully before trying to avoid this step.
+	 */
+	gen7_init_oa_buffer(dev_priv);
+
+	spin_lock_irqsave(&dev_priv->perf.hook_lock, irqflags);
+	gen7_update_oacontrol_locked(dev_priv);
+	spin_unlock_irqrestore(&dev_priv->perf.hook_lock, irqflags);
+}
+
+/* Stream ->enable hook: turn the OA unit on and, for periodic sampling,
+ * start the hrtimer that checks the buffer every POLL_PERIOD.
+ */
+static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *i915 = stream->dev_priv;
+
+	i915->perf.oa.ops.oa_enable(i915);
+
+	if (!i915->perf.oa.periodic)
+		return;
+
+	hrtimer_start(&i915->perf.oa.poll_check_timer,
+		      ns_to_ktime(POLL_PERIOD),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+/* Stop the Haswell OA unit by writing 0 to OACONTROL. */
+static void gen7_oa_disable(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN7_OACONTROL, 0);
+}
+
+/* Stream ->disable hook: turn the OA unit off and, for periodic sampling,
+ * cancel the poll-check hrtimer.
+ */
+static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *i915 = stream->dev_priv;
+
+	i915->perf.oa.ops.oa_disable(i915);
+
+	if (!i915->perf.oa.periodic)
+		return;
+
+	hrtimer_cancel(&i915->perf.oa.poll_check_timer);
+}
+
+/* Convert an OA timer exponent into a sampling period in nanoseconds.
+ *
+ * The period is 2^(exponent + 1) ticks of the OA timestamp clock, whose
+ * rate is perf.oa.timestamp_frequency. div_u64() is used instead of a
+ * plain '/' because an open-coded 64-bit division does not link on 32-bit
+ * kernel builds.
+ */
+static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
+{
+	return div_u64(1000000000ULL * (2ULL << exponent),
+		       dev_priv->perf.oa.timestamp_frequency);
+}
+
+/* Stream vfuncs installed by i915_oa_stream_init() for OA streams */
+static const struct i915_perf_stream_ops i915_oa_stream_ops = {
+	/* lifecycle */
+	.enable = i915_oa_stream_enable,
+	.disable = i915_oa_stream_disable,
+	.destroy = i915_oa_stream_destroy,
+	/* file operations */
+	.read = i915_oa_read,
+	.wait_unlocked = i915_oa_wait_unlocked,
+	.poll_wait = i915_oa_poll_wait,
+};
+
+/* Set up @stream as the exclusive OA stream.
+ *
+ * @stream: stream being opened; ->dev_priv and ->ctx are already set
+ * @param: open parameters from userspace (not referenced here)
+ * @props: validated stream properties
+ *
+ * Validates the requested configuration, records the report format and
+ * sampling period, pins the specific context (if any), allocates the OA
+ * buffer, takes runtime-pm + forcewake references and enables the metric
+ * set.
+ *
+ * Returns 0 on success. Returns -EINVAL for an unsupported or incomplete
+ * configuration, -ENODEV when no OA support is registered, -EBUSY when the
+ * OA unit is already in use, or the error from a failed setup step, in
+ * which case the steps already completed are unwound.
+ */
+static int i915_oa_stream_init(struct i915_perf_stream *stream,
+			       struct drm_i915_perf_open_param *param,
+			       struct perf_open_properties *props)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int format_size;
+	int ret;
+
+	if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+		DRM_ERROR("Only OA report sampling supported\n");
+		return -EINVAL;
+	}
+
+	if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+		DRM_ERROR("OA unit not supported\n");
+		return -ENODEV;
+	}
+
+	/* To avoid the complexity of having to accurately filter
+	 * counter reports and marshal to the appropriate client
+	 * we currently only allow exclusive access
+	 */
+	if (dev_priv->perf.oa.exclusive_stream) {
+		DRM_ERROR("OA unit already in use\n");
+		return -EBUSY;
+	}
+
+	if (!props->metrics_set) {
+		DRM_ERROR("OA metric set not specified\n");
+		return -EINVAL;
+	}
+
+	if (!props->oa_format) {
+		DRM_ERROR("OA report format not specified\n");
+		return -EINVAL;
+	}
+
+	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
+
+	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+
+	stream->sample_flags |= SAMPLE_OA_REPORT;
+	stream->sample_size += format_size;
+
+	dev_priv->perf.oa.oa_buffer.format_size = format_size;
+	BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
+
+	dev_priv->perf.oa.oa_buffer.format =
+		dev_priv->perf.oa.oa_formats[props->oa_format].format;
+
+	dev_priv->perf.oa.metrics_set = props->metrics_set;
+
+	dev_priv->perf.oa.periodic = props->oa_periodic;
+	if (dev_priv->perf.oa.periodic) {
+		u64 period_ns = oa_exponent_to_ns(dev_priv,
+						  props->oa_period_exponent);
+		/* div64_u64() rather than '/': an open-coded 64-bit by
+		 * 64-bit division does not link on 32-bit kernel builds.
+		 */
+		u64 n_reports = div64_u64(OA_TAIL_MARGIN_NSEC, period_ns);
+
+		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+
+		/* See comment for OA_TAIL_MARGIN_NSEC for details
+		 * about this tail_margin...
+		 */
+		dev_priv->perf.oa.tail_margin = (n_reports + 1) * format_size;
+	}
+
+	if (stream->ctx) {
+		ret = claim_specific_ctx(stream);
+		if (ret)
+			return ret;
+	}
+
+	ret = alloc_oa_buffer(dev_priv);
+	if (ret)
+		goto err_oa_buf_alloc;
+
+	/* PRM - observability performance counters:
+	 *
+	 *   OACONTROL, performance counter enable, note:
+	 *
+	 *   "When this bit is set, in order to have coherent counts,
+	 *   RC6 power state and trunk clock gating must be disabled.
+	 *   This can be achieved by programming MMIO registers as
+	 *   0xA094=0 and 0xA090[31]=1"
+	 *
+	 *   In our case we are expecting that taking pm + FORCEWAKE
+	 *   references will effectively disable RC6.
+	 */
+	intel_runtime_pm_get(dev_priv);
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
+	if (ret)
+		goto err_enable;
+
+	stream->ops = &i915_oa_stream_ops;
+
+	dev_priv->perf.oa.exclusive_stream = stream;
+
+	return 0;
+
+err_enable:
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+	intel_runtime_pm_put(dev_priv);
+	free_oa_buffer(dev_priv);
+
+err_oa_buf_alloc:
+	if (stream->ctx)
+		release_specific_ctx(stream);
+
+	return ret;
+}
+
 static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
 				     struct file *file,
 				     char __user *buf,
@@ -78,8 +929,20 @@  static ssize_t i915_perf_read(struct file *file,
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	ssize_t ret;
 
+	/* To ensure it's handled consistently we simply treat all reads of a
+	 * disabled stream as an error. In particular it might otherwise lead
+	 * to a deadlock for blocking file descriptors...
+	 */
+	if (!stream->enabled)
+		return -EIO;
+
 	if (!(file->f_flags & O_NONBLOCK)) {
-		/* Allow false positives from stream->ops->wait_unlocked.
+		/* There's the small chance of false positives from
+		 * stream->ops->wait_unlocked.
+		 *
+		 * E.g. with single context filtering since we only wait until
+		 * oabuffer has >= 1 report we don't immediately know whether
+		 * any reports really belong to the current context
 		 */
 		do {
 			ret = stream->ops->wait_unlocked(stream);
@@ -97,21 +960,50 @@  static ssize_t i915_perf_read(struct file *file,
 		mutex_unlock(&dev_priv->perf.lock);
 	}
 
+	if (ret >= 0) {
+		/* Maybe make ->pollin per-stream state if we support multiple
+		 * concurrent streams in the future. */
+		atomic_set(&dev_priv->perf.oa.pollin, false);
+	}
+
 	return ret;
 }
 
-static unsigned int i915_perf_poll_locked(struct i915_perf_stream *stream,
+/* hrtimer callback: flag + wake pollers once the OA buffer holds data */
+static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
+{
+	struct drm_i915_private *i915 =
+		container_of(hrtimer, typeof(*i915), perf.oa.poll_check_timer);
+	bool has_data = !i915->perf.oa.ops.oa_buffer_is_empty(i915);
+
+	if (has_data) {
+		atomic_set(&i915->perf.oa.pollin, true);
+		wake_up(&i915->perf.oa.poll_wq);
+	}
+	/* Re-arm so the buffer is checked again one POLL_PERIOD from now */
+	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
+	return HRTIMER_RESTART;
+}
+
+static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
+					  struct i915_perf_stream *stream,
 					  struct file *file,
 					  poll_table *wait)
 {
-	unsigned int streams = 0;
+	unsigned int events = 0;
 
 	stream->ops->poll_wait(stream, file, wait);
 
-	if (stream->ops->can_read(stream))
-		streams |= POLLIN;
+	/* Note: we don't explicitly check whether there's something to read
+	 * here since this path may be very hot depending on what else
+	 * userspace is polling, or on the timeout in use. We rely solely on
+	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
+	 * samples to read.
+	 */
+	if (atomic_read(&dev_priv->perf.oa.pollin))
+		events |= POLLIN;
 
-	return streams;
+	return events;
 }
 
 static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
@@ -121,7 +1013,7 @@  static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
 	int ret;
 
 	mutex_lock(&dev_priv->perf.lock);
-	ret = i915_perf_poll_locked(stream, file, wait);
+	ret = i915_perf_poll_locked(dev_priv, stream, file, wait);
 	mutex_unlock(&dev_priv->perf.lock);
 
 	return ret;
@@ -285,18 +1177,18 @@  i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
 		goto err_ctx;
 	}
 
-	stream->sample_flags = props->sample_flags;
 	stream->dev_priv = dev_priv;
 	stream->ctx = specific_ctx;
 
-	/*
-	 * TODO: support sampling something
-	 *
-	 * For now this is as far as we can go.
+	ret = i915_oa_stream_init(stream, param, props);
+	if (ret)
+		goto err_alloc;
+
+	/* we avoid simply assigning stream->sample_flags = props->sample_flags
+	 * to have _stream_init check the combination of sample flags more
+	 * thoroughly, but still this is the expected result at this point.
 	 */
-	DRM_ERROR("Unsupported i915 perf stream configuration\n");
-	ret = -EINVAL;
-	goto err_alloc;
+	BUG_ON(stream->sample_flags != props->sample_flags);
 
 	list_add(&stream->link, &dev_priv->perf.streams);
 
@@ -376,6 +1268,56 @@  static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 			props->single_context = 1;
 			props->ctx_handle = value;
 			break;
+		case DRM_I915_PERF_PROP_SAMPLE_OA:
+			props->sample_flags |= SAMPLE_OA_REPORT;
+			break;
+		case DRM_I915_PERF_PROP_OA_METRICS_SET:
+			if (value == 0 ||
+			    value > dev_priv->perf.oa.n_builtin_sets) {
+				DRM_ERROR("Unknown OA metric set ID");
+				return -EINVAL;
+			}
+			props->metrics_set = value;
+			break;
+		case DRM_I915_PERF_PROP_OA_FORMAT:
+			if (value == 0 || value >= I915_OA_FORMAT_MAX) {
+				DRM_ERROR("Invalid OA report format\n");
+				return -EINVAL;
+			}
+			if (!dev_priv->perf.oa.oa_formats[value].size) {
+				DRM_ERROR("Invalid OA report format\n");
+				return -EINVAL;
+			}
+			props->oa_format = value;
+			break;
+		case DRM_I915_PERF_PROP_OA_EXPONENT:
+			if (value > OA_EXPONENT_MAX) {
+				DRM_ERROR("OA timer exponent too high (> %u)\n",
+					  OA_EXPONENT_MAX);
+				return -EINVAL;
+			}
+
+			/* NB: The exponent represents a period as follows:
+			 *
+			 *   80ns * 2^(period_exponent + 1)
+			 *
+			 * Theoretically we can program the OA unit to sample
+			 * every 160ns but don't allow that by default unless
+			 * root.
+			 *
+			 * Referring to perf's
+			 * kernel.perf_event_max_sample_rate for a precedent
+			 * (100000 by default); with an OA exponent of 6 we get
+			 * a period of 10.240 microseconds -just under 100000Hz
+			 */
+			if (value < 6 && !capable(CAP_SYS_ADMIN)) {
+				DRM_ERROR("Sampling period too high without root privileges\n");
+				return -EACCES;
+			}
+
+			props->oa_periodic = true;
+			props->oa_period_exponent = value;
+			break;
 		default:
 			MISSING_CASE(id);
 			DRM_ERROR("Unknown i915 perf property ID");
@@ -426,8 +1368,33 @@  int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 
 void i915_perf_init(struct drm_i915_private *dev_priv)
 {
+	if (!IS_HASWELL(dev_priv)) /* only Haswell OA hooks are registered below */
+		return; /* perf.initialized is not set on this path */
+
+	hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
+	init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+
 	INIT_LIST_HEAD(&dev_priv->perf.streams);
 	mutex_init(&dev_priv->perf.lock);
+	spin_lock_init(&dev_priv->perf.hook_lock); /* held around OACONTROL updates in gen7_oa_enable() */
+
+	dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer; /* also the "OA supported" test in i915_oa_stream_init() */
+	dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
+	dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
+	dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
+	dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
+	dev_priv->perf.oa.ops.read = gen7_oa_read;
+	dev_priv->perf.oa.ops.oa_buffer_is_empty =
+		gen7_oa_buffer_is_empty_fop_unlocked;
+
+	dev_priv->perf.oa.timestamp_frequency = 12500000; /* ticks per second, i.e. 80ns per tick; divisor in oa_exponent_to_ns() */
+
+	dev_priv->perf.oa.oa_formats = hsw_oa_formats;
+
+	dev_priv->perf.oa.n_builtin_sets =
+		i915_oa_n_builtin_metric_sets_hsw; /* upper bound for DRM_I915_PERF_PROP_OA_METRICS_SET */
 
 	dev_priv->perf.initialized = true;
 }
@@ -437,7 +1404,6 @@  void i915_perf_fini(struct drm_i915_private *dev_priv)
 	if (!dev_priv->perf.initialized)
 		return;
 
-	/* Currently nothing to clean up */
-
+	memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
 	dev_priv->perf.initialized = false;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 070d3297..2557b3f 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -616,6 +616,343 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define HSW_CS_GPR_UDW(n)               _MMIO(0x2600 + (n) * 8 + 4)
 
 #define GEN7_OACONTROL _MMIO(0x2360)
+#define  GEN7_OACONTROL_CTX_MASK	    0xFFFFF000 /* masks perf.oa.specific_ctx_id before it is written */
+#define  GEN7_OACONTROL_TIMER_PERIOD_MASK   0x3F /* presumably the unshifted width of the period field - confirm */
+#define  GEN7_OACONTROL_TIMER_PERIOD_SHIFT  6 /* shift applied to perf.oa.period_exponent */
+#define  GEN7_OACONTROL_TIMER_ENABLE	    (1<<5) /* set when perf.oa.periodic is true */
+#define  GEN7_OACONTROL_FORMAT_A13	    (0<<2)
+#define  GEN7_OACONTROL_FORMAT_A29	    (1<<2)
+#define  GEN7_OACONTROL_FORMAT_A13_B8_C8    (2<<2)
+#define  GEN7_OACONTROL_FORMAT_A29_B8_C8    (3<<2)
+#define  GEN7_OACONTROL_FORMAT_B4_C8	    (4<<2)
+#define  GEN7_OACONTROL_FORMAT_A45_B8_C8    (5<<2)
+#define  GEN7_OACONTROL_FORMAT_B4_C8_A16    (6<<2)
+#define  GEN7_OACONTROL_FORMAT_C4_B8	    (7<<2)
+#define  GEN7_OACONTROL_FORMAT_SHIFT	    2 /* NOTE(review): FORMAT_* above are pre-shifted, yet gen7_update_oacontrol_locked() shifts oa_buffer.format by this - confirm that field holds the raw index */
+#define  GEN7_OACONTROL_PER_CTX_ENABLE	    (1<<1) /* set when the exclusive stream has a ctx */
+#define  GEN7_OACONTROL_ENABLE		    (1<<0) /* set whenever the stream is enabled */
+
+#define GEN8_OACTXID _MMIO(0x2364) /* same MMIO offset as GEN7_OASTATUS1 */
+
+#define GEN8_OACONTROL _MMIO(0x2B00) /* GEN8_* names are not used by the Haswell code visible in this patch chunk */
+#define  GEN8_OA_REPORT_FORMAT_A12	    (0<<2)
+#define  GEN8_OA_REPORT_FORMAT_A12_B8_C8    (2<<2)
+#define  GEN8_OA_REPORT_FORMAT_A36_B8_C8    (5<<2)
+#define  GEN8_OA_REPORT_FORMAT_C4_B8	    (7<<2)
+#define  GEN8_OA_REPORT_FORMAT_SHIFT	    2
+#define  GEN8_OA_SPECIFIC_CONTEXT_ENABLE    (1<<1)
+#define  GEN8_OA_COUNTER_ENABLE             (1<<0)
+
+#define GEN8_OACTXCONTROL _MMIO(0x2360) /* same MMIO offset as GEN7_OACONTROL */
+#define  GEN8_OA_TIMER_PERIOD_MASK	    0x3F
+#define  GEN8_OA_TIMER_PERIOD_SHIFT	    2
+#define  GEN8_OA_TIMER_ENABLE		    (1<<1)
+#define  GEN8_OA_COUNTER_RESUME		    (1<<0)
+
+#define GEN7_OABUFFER _MMIO(0x23B0) /* R/W; gen7_init_oa_buffer() writes the buffer's GGTT offset here */
+#define  GEN7_OABUFFER_OVERRUN_DISABLE	    (1<<3) /* the four flag bits here are not set by the code visible in this patch chunk */
+#define  GEN7_OABUFFER_EDGE_TRIGGER	    (1<<2)
+#define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
+#define  GEN7_OABUFFER_RESUME		    (1<<0)
+
+#define GEN8_OABUFFER _MMIO(0x2b14)
+
+#define GEN7_OASTATUS1 _MMIO(0x2364) /* tail pointer + status; written with GGTT offset | OABUFFER_SIZE_16M at init */
+#define  GEN7_OASTATUS1_TAIL_MASK	    0xffffffc0 /* extracts the tail pointer in gen7_oa_read() */
+#define  GEN7_OASTATUS1_COUNTER_OVERFLOW    (1<<2)
+#define  GEN7_OASTATUS1_OABUFFER_OVERFLOW   (1<<1) /* reported as DRM_I915_PERF_RECORD_OA_BUFFER_LOST, then OA is restarted */
+#define  GEN7_OASTATUS1_REPORT_LOST	    (1<<0) /* reported as DRM_I915_PERF_RECORD_OA_REPORT_LOST, then latched in software */
+
+#define GEN7_OASTATUS2 _MMIO(0x2368) /* head pointer; always written with OA_MEM_SELECT_GGTT set */
+#define GEN7_OASTATUS2_HEAD_MASK    0xffffffc0 /* extracts the head pointer in gen7_oa_read() */
+
+#define GEN8_OASTATUS _MMIO(0x2b08) /* GEN8_* names are not used by the Haswell code visible in this patch chunk */
+#define  GEN8_OASTATUS_OVERRUN_STATUS	    (1<<3)
+#define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1<<2)
+#define  GEN8_OASTATUS_OABUFFER_OVERFLOW    (1<<1)
+#define  GEN8_OASTATUS_REPORT_LOST	    (1<<0)
+
+#define GEN8_OAHEADPTR _MMIO(0x2B0C)
+#define GEN8_OATAILPTR _MMIO(0x2B10)
+
+#define OABUFFER_SIZE_128K  (0<<3) /* size codes 0-7, each shifted left by 3 */
+#define OABUFFER_SIZE_256K  (1<<3)
+#define OABUFFER_SIZE_512K  (2<<3)
+#define OABUFFER_SIZE_1M    (3<<3)
+#define OABUFFER_SIZE_2M    (4<<3)
+#define OABUFFER_SIZE_4M    (5<<3)
+#define OABUFFER_SIZE_8M    (6<<3)
+#define OABUFFER_SIZE_16M   (7<<3) /* the value gen7_init_oa_buffer() ORs into GEN7_OASTATUS1 */
+
+#define OA_MEM_SELECT_GGTT  (1<<0) /* OR'd into every GEN7_OASTATUS2 head pointer write */
+
+#define EU_PERF_CNTL0	    _MMIO(0xe458) /* not referenced by the code visible in this patch chunk */
+
+#define GDT_CHICKEN_BITS    _MMIO(0x9840)
+#define GT_NOA_ENABLE	    0x00000080 /* set by hsw_enable_metric_set(), cleared by hsw_disable_metric_set() */
+
+/*
+ * OA Boolean state
+ */
+
+#define OAREPORTTRIG1 _MMIO(0x2740)
+#define OAREPORTTRIG1_THRESHOLD_MASK 0xffff
+#define OAREPORTTRIG1_EDGE_LEVEL_TRIGER_SELECT_MASK 0xffff0000 /* 0=level */
+
+#define OAREPORTTRIG2 _MMIO(0x2744)
+#define OAREPORTTRIG2_INVERT_A_0  (1<<0)
+#define OAREPORTTRIG2_INVERT_A_1  (1<<1)
+#define OAREPORTTRIG2_INVERT_A_2  (1<<2)
+#define OAREPORTTRIG2_INVERT_A_3  (1<<3)
+#define OAREPORTTRIG2_INVERT_A_4  (1<<4)
+#define OAREPORTTRIG2_INVERT_A_5  (1<<5)
+#define OAREPORTTRIG2_INVERT_A_6  (1<<6)
+#define OAREPORTTRIG2_INVERT_A_7  (1<<7)
+#define OAREPORTTRIG2_INVERT_A_8  (1<<8)
+#define OAREPORTTRIG2_INVERT_A_9  (1<<9)
+#define OAREPORTTRIG2_INVERT_A_10 (1<<10)
+#define OAREPORTTRIG2_INVERT_A_11 (1<<11)
+#define OAREPORTTRIG2_INVERT_A_12 (1<<12)
+#define OAREPORTTRIG2_INVERT_A_13 (1<<13)
+#define OAREPORTTRIG2_INVERT_A_14 (1<<14)
+#define OAREPORTTRIG2_INVERT_A_15 (1<<15)
+#define OAREPORTTRIG2_INVERT_B_0  (1<<16)
+#define OAREPORTTRIG2_INVERT_B_1  (1<<17)
+#define OAREPORTTRIG2_INVERT_B_2  (1<<18)
+#define OAREPORTTRIG2_INVERT_B_3  (1<<19)
+#define OAREPORTTRIG2_INVERT_C_0  (1<<20)
+#define OAREPORTTRIG2_INVERT_C_1  (1<<21)
+#define OAREPORTTRIG2_INVERT_D_0  (1<<22)
+#define OAREPORTTRIG2_THRESHOLD_ENABLE	    (1<<23)
+#define OAREPORTTRIG2_REPORT_TRIGGER_ENABLE (1<<31)
+
+#define OAREPORTTRIG3 _MMIO(0x2748)
+#define OAREPORTTRIG3_NOA_SELECT_MASK	    0xf
+#define OAREPORTTRIG3_NOA_SELECT_8_SHIFT    0
+#define OAREPORTTRIG3_NOA_SELECT_9_SHIFT    4
+#define OAREPORTTRIG3_NOA_SELECT_10_SHIFT   8
+#define OAREPORTTRIG3_NOA_SELECT_11_SHIFT   12
+#define OAREPORTTRIG3_NOA_SELECT_12_SHIFT   16
+#define OAREPORTTRIG3_NOA_SELECT_13_SHIFT   20
+#define OAREPORTTRIG3_NOA_SELECT_14_SHIFT   24
+#define OAREPORTTRIG3_NOA_SELECT_15_SHIFT   28
+
+#define OAREPORTTRIG4 _MMIO(0x274c)
+#define OAREPORTTRIG4_NOA_SELECT_MASK	    0xf
+#define OAREPORTTRIG4_NOA_SELECT_0_SHIFT    0
+#define OAREPORTTRIG4_NOA_SELECT_1_SHIFT    4
+#define OAREPORTTRIG4_NOA_SELECT_2_SHIFT    8
+#define OAREPORTTRIG4_NOA_SELECT_3_SHIFT    12
+#define OAREPORTTRIG4_NOA_SELECT_4_SHIFT    16
+#define OAREPORTTRIG4_NOA_SELECT_5_SHIFT    20
+#define OAREPORTTRIG4_NOA_SELECT_6_SHIFT    24
+#define OAREPORTTRIG4_NOA_SELECT_7_SHIFT    28
+
+#define OAREPORTTRIG5 _MMIO(0x2750)
+#define OAREPORTTRIG5_THRESHOLD_MASK 0xffff
+#define OAREPORTTRIG5_EDGE_LEVEL_TRIGER_SELECT_MASK 0xffff0000 /* 0=level */
+
+#define OAREPORTTRIG6 _MMIO(0x2754)
+#define OAREPORTTRIG6_INVERT_A_0  (1<<0)
+#define OAREPORTTRIG6_INVERT_A_1  (1<<1)
+#define OAREPORTTRIG6_INVERT_A_2  (1<<2)
+#define OAREPORTTRIG6_INVERT_A_3  (1<<3)
+#define OAREPORTTRIG6_INVERT_A_4  (1<<4)
+#define OAREPORTTRIG6_INVERT_A_5  (1<<5)
+#define OAREPORTTRIG6_INVERT_A_6  (1<<6)
+#define OAREPORTTRIG6_INVERT_A_7  (1<<7)
+#define OAREPORTTRIG6_INVERT_A_8  (1<<8)
+#define OAREPORTTRIG6_INVERT_A_9  (1<<9)
+#define OAREPORTTRIG6_INVERT_A_10 (1<<10)
+#define OAREPORTTRIG6_INVERT_A_11 (1<<11)
+#define OAREPORTTRIG6_INVERT_A_12 (1<<12)
+#define OAREPORTTRIG6_INVERT_A_13 (1<<13)
+#define OAREPORTTRIG6_INVERT_A_14 (1<<14)
+#define OAREPORTTRIG6_INVERT_A_15 (1<<15)
+#define OAREPORTTRIG6_INVERT_B_0  (1<<16)
+#define OAREPORTTRIG6_INVERT_B_1  (1<<17)
+#define OAREPORTTRIG6_INVERT_B_2  (1<<18)
+#define OAREPORTTRIG6_INVERT_B_3  (1<<19)
+#define OAREPORTTRIG6_INVERT_C_0  (1<<20)
+#define OAREPORTTRIG6_INVERT_C_1  (1<<21)
+#define OAREPORTTRIG6_INVERT_D_0  (1<<22)
+#define OAREPORTTRIG6_THRESHOLD_ENABLE	    (1<<23)
+#define OAREPORTTRIG6_REPORT_TRIGGER_ENABLE (1<<31)
+
+#define OAREPORTTRIG7 _MMIO(0x2758)
+#define OAREPORTTRIG7_NOA_SELECT_MASK	    0xf
+#define OAREPORTTRIG7_NOA_SELECT_8_SHIFT    0
+#define OAREPORTTRIG7_NOA_SELECT_9_SHIFT    4
+#define OAREPORTTRIG7_NOA_SELECT_10_SHIFT   8
+#define OAREPORTTRIG7_NOA_SELECT_11_SHIFT   12
+#define OAREPORTTRIG7_NOA_SELECT_12_SHIFT   16
+#define OAREPORTTRIG7_NOA_SELECT_13_SHIFT   20
+#define OAREPORTTRIG7_NOA_SELECT_14_SHIFT   24
+#define OAREPORTTRIG7_NOA_SELECT_15_SHIFT   28
+
+#define OAREPORTTRIG8 _MMIO(0x275c)
+#define OAREPORTTRIG8_NOA_SELECT_MASK	    0xf
+#define OAREPORTTRIG8_NOA_SELECT_0_SHIFT    0
+#define OAREPORTTRIG8_NOA_SELECT_1_SHIFT    4
+#define OAREPORTTRIG8_NOA_SELECT_2_SHIFT    8
+#define OAREPORTTRIG8_NOA_SELECT_3_SHIFT    12
+#define OAREPORTTRIG8_NOA_SELECT_4_SHIFT    16
+#define OAREPORTTRIG8_NOA_SELECT_5_SHIFT    20
+#define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
+#define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
+
+#define OASTARTTRIG1 _MMIO(0x2710)
+#define OASTARTTRIG1_THRESHOLD_COUNT_MASK_MBZ 0xffff0000
+#define OASTARTTRIG1_THRESHOLD_MASK	      0xffff
+
+#define OASTARTTRIG2 _MMIO(0x2714)
+#define OASTARTTRIG2_INVERT_A_0 (1<<0)
+#define OASTARTTRIG2_INVERT_A_1 (1<<1)
+#define OASTARTTRIG2_INVERT_A_2 (1<<2)
+#define OASTARTTRIG2_INVERT_A_3 (1<<3)
+#define OASTARTTRIG2_INVERT_A_4 (1<<4)
+#define OASTARTTRIG2_INVERT_A_5 (1<<5)
+#define OASTARTTRIG2_INVERT_A_6 (1<<6)
+#define OASTARTTRIG2_INVERT_A_7 (1<<7)
+#define OASTARTTRIG2_INVERT_A_8 (1<<8)
+#define OASTARTTRIG2_INVERT_A_9 (1<<9)
+#define OASTARTTRIG2_INVERT_A_10 (1<<10)
+#define OASTARTTRIG2_INVERT_A_11 (1<<11)
+#define OASTARTTRIG2_INVERT_A_12 (1<<12)
+#define OASTARTTRIG2_INVERT_A_13 (1<<13)
+#define OASTARTTRIG2_INVERT_A_14 (1<<14)
+#define OASTARTTRIG2_INVERT_A_15 (1<<15)
+#define OASTARTTRIG2_INVERT_B_0 (1<<16)
+#define OASTARTTRIG2_INVERT_B_1 (1<<17)
+#define OASTARTTRIG2_INVERT_B_2 (1<<18)
+#define OASTARTTRIG2_INVERT_B_3 (1<<19)
+#define OASTARTTRIG2_INVERT_C_0 (1<<20)
+#define OASTARTTRIG2_INVERT_C_1 (1<<21)
+#define OASTARTTRIG2_INVERT_D_0 (1<<22)
+#define OASTARTTRIG2_THRESHOLD_ENABLE	    (1<<23)
+#define OASTARTTRIG2_START_TRIG_FLAG_MBZ    (1<<24)
+#define OASTARTTRIG2_EVENT_SELECT_0  (1<<28)
+#define OASTARTTRIG2_EVENT_SELECT_1  (1<<29)
+#define OASTARTTRIG2_EVENT_SELECT_2  (1<<30)
+#define OASTARTTRIG2_EVENT_SELECT_3  (1<<31)
+
+#define OASTARTTRIG3 _MMIO(0x2718)
+#define OASTARTTRIG3_NOA_SELECT_MASK	   0xf
+#define OASTARTTRIG3_NOA_SELECT_8_SHIFT    0
+#define OASTARTTRIG3_NOA_SELECT_9_SHIFT    4
+#define OASTARTTRIG3_NOA_SELECT_10_SHIFT   8
+#define OASTARTTRIG3_NOA_SELECT_11_SHIFT   12
+#define OASTARTTRIG3_NOA_SELECT_12_SHIFT   16
+#define OASTARTTRIG3_NOA_SELECT_13_SHIFT   20
+#define OASTARTTRIG3_NOA_SELECT_14_SHIFT   24
+#define OASTARTTRIG3_NOA_SELECT_15_SHIFT   28
+
+#define OASTARTTRIG4 _MMIO(0x271c)
+#define OASTARTTRIG4_NOA_SELECT_MASK	    0xf
+#define OASTARTTRIG4_NOA_SELECT_0_SHIFT    0
+#define OASTARTTRIG4_NOA_SELECT_1_SHIFT    4
+#define OASTARTTRIG4_NOA_SELECT_2_SHIFT    8
+#define OASTARTTRIG4_NOA_SELECT_3_SHIFT    12
+#define OASTARTTRIG4_NOA_SELECT_4_SHIFT    16
+#define OASTARTTRIG4_NOA_SELECT_5_SHIFT    20
+#define OASTARTTRIG4_NOA_SELECT_6_SHIFT    24
+#define OASTARTTRIG4_NOA_SELECT_7_SHIFT    28
+
+#define OASTARTTRIG5 _MMIO(0x2720)
+#define OASTARTTRIG5_THRESHOLD_COUNT_MASK_MBZ 0xffff0000
+#define OASTARTTRIG5_THRESHOLD_MASK	      0xffff
+
+#define OASTARTTRIG6 _MMIO(0x2724)
+#define OASTARTTRIG6_INVERT_A_0 (1<<0)
+#define OASTARTTRIG6_INVERT_A_1 (1<<1)
+#define OASTARTTRIG6_INVERT_A_2 (1<<2)
+#define OASTARTTRIG6_INVERT_A_3 (1<<3)
+#define OASTARTTRIG6_INVERT_A_4 (1<<4)
+#define OASTARTTRIG6_INVERT_A_5 (1<<5)
+#define OASTARTTRIG6_INVERT_A_6 (1<<6)
+#define OASTARTTRIG6_INVERT_A_7 (1<<7)
+#define OASTARTTRIG6_INVERT_A_8 (1<<8)
+#define OASTARTTRIG6_INVERT_A_9 (1<<9)
+#define OASTARTTRIG6_INVERT_A_10 (1<<10)
+#define OASTARTTRIG6_INVERT_A_11 (1<<11)
+#define OASTARTTRIG6_INVERT_A_12 (1<<12)
+#define OASTARTTRIG6_INVERT_A_13 (1<<13)
+#define OASTARTTRIG6_INVERT_A_14 (1<<14)
+#define OASTARTTRIG6_INVERT_A_15 (1<<15)
+#define OASTARTTRIG6_INVERT_B_0 (1<<16)
+#define OASTARTTRIG6_INVERT_B_1 (1<<17)
+#define OASTARTTRIG6_INVERT_B_2 (1<<18)
+#define OASTARTTRIG6_INVERT_B_3 (1<<19)
+#define OASTARTTRIG6_INVERT_C_0 (1<<20)
+#define OASTARTTRIG6_INVERT_C_1 (1<<21)
+#define OASTARTTRIG6_INVERT_D_0 (1<<22)
+#define OASTARTTRIG6_THRESHOLD_ENABLE	    (1<<23)
+#define OASTARTTRIG6_START_TRIG_FLAG_MBZ    (1<<24)
+#define OASTARTTRIG6_EVENT_SELECT_4  (1<<28)
+#define OASTARTTRIG6_EVENT_SELECT_5  (1<<29)
+#define OASTARTTRIG6_EVENT_SELECT_6  (1<<30)
+#define OASTARTTRIG6_EVENT_SELECT_7  (1<<31)
+
+#define OASTARTTRIG7 _MMIO(0x2728)
+#define OASTARTTRIG7_NOA_SELECT_MASK	   0xf
+#define OASTARTTRIG7_NOA_SELECT_8_SHIFT    0
+#define OASTARTTRIG7_NOA_SELECT_9_SHIFT    4
+#define OASTARTTRIG7_NOA_SELECT_10_SHIFT   8
+#define OASTARTTRIG7_NOA_SELECT_11_SHIFT   12
+#define OASTARTTRIG7_NOA_SELECT_12_SHIFT   16
+#define OASTARTTRIG7_NOA_SELECT_13_SHIFT   20
+#define OASTARTTRIG7_NOA_SELECT_14_SHIFT   24
+#define OASTARTTRIG7_NOA_SELECT_15_SHIFT   28
+
+#define OASTARTTRIG8 _MMIO(0x272c)
+#define OASTARTTRIG8_NOA_SELECT_MASK	   0xf
+#define OASTARTTRIG8_NOA_SELECT_0_SHIFT    0
+#define OASTARTTRIG8_NOA_SELECT_1_SHIFT    4
+#define OASTARTTRIG8_NOA_SELECT_2_SHIFT    8
+#define OASTARTTRIG8_NOA_SELECT_3_SHIFT    12
+#define OASTARTTRIG8_NOA_SELECT_4_SHIFT    16
+#define OASTARTTRIG8_NOA_SELECT_5_SHIFT    20
+#define OASTARTTRIG8_NOA_SELECT_6_SHIFT    24
+#define OASTARTTRIG8_NOA_SELECT_7_SHIFT    28
+
+/* CECX_0 */
+#define OACEC_COMPARE_LESS_OR_EQUAL	6
+#define OACEC_COMPARE_NOT_EQUAL		5
+#define OACEC_COMPARE_LESS_THAN		4
+#define OACEC_COMPARE_GREATER_OR_EQUAL	3
+#define OACEC_COMPARE_EQUAL		2
+#define OACEC_COMPARE_GREATER_THAN	1
+#define OACEC_COMPARE_ANY_EQUAL		0
+
+#define OACEC_COMPARE_VALUE_MASK    0xffff
+#define OACEC_COMPARE_VALUE_SHIFT   3
+
+#define OACEC_SELECT_NOA	(0<<19)
+#define OACEC_SELECT_PREV	(1<<19)
+#define OACEC_SELECT_BOOLEAN	(2<<19)
+
+/* CECX_1 */
+#define OACEC_MASK_MASK		    0xffff
+#define OACEC_CONSIDERATIONS_MASK   0xffff
+#define OACEC_CONSIDERATIONS_SHIFT  16
+
+#define OACEC0_0 _MMIO(0x2770)
+#define OACEC0_1 _MMIO(0x2774)
+#define OACEC1_0 _MMIO(0x2778)
+#define OACEC1_1 _MMIO(0x277c)
+#define OACEC2_0 _MMIO(0x2780)
+#define OACEC2_1 _MMIO(0x2784)
+#define OACEC3_0 _MMIO(0x2788)
+#define OACEC3_1 _MMIO(0x278c)
+#define OACEC4_0 _MMIO(0x2790)
+#define OACEC4_1 _MMIO(0x2794)
+#define OACEC5_0 _MMIO(0x2798)
+#define OACEC5_1 _MMIO(0x279c)
+#define OACEC6_0 _MMIO(0x27a0)
+#define OACEC6_1 _MMIO(0x27a4)
+#define OACEC7_0 _MMIO(0x27a8)
+#define OACEC7_1 _MMIO(0x27ac)
+
 
 #define _GEN7_PIPEA_DE_LOAD_SL	0x70068
 #define _GEN7_PIPEB_DE_LOAD_SL	0x71068
@@ -6982,6 +7319,7 @@  enum {
 # define GEN6_RCCUNIT_CLOCK_GATE_DISABLE		(1 << 11)
 
 #define GEN6_UCGCTL3				_MMIO(0x9408)
+# define GEN6_OACSUNIT_CLOCK_GATE_DISABLE		(1 << 20)
 
 #define GEN7_UCGCTL4				_MMIO(0x940c)
 #define  GEN7_L3BANK2X_CLOCK_GATE_DISABLE	(1<<25)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 98cd493..bf3b8e2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1224,6 +1224,18 @@  struct drm_i915_gem_context_param {
 	__u64 value;
 };
 
+enum drm_i915_oa_format {
+	I915_OA_FORMAT_A13 = 1,
+	I915_OA_FORMAT_A29,
+	I915_OA_FORMAT_A13_B8_C8,
+	I915_OA_FORMAT_B4_C8,
+	I915_OA_FORMAT_A45_B8_C8,
+	I915_OA_FORMAT_B4_C8_A16,
+	I915_OA_FORMAT_C4_B8,
+
+	I915_OA_FORMAT_MAX	    /* non-ABI */
+};
+
 enum drm_i915_perf_property_id {
 	/**
 	 * Open the stream for a specific context handle (as used with
@@ -1232,6 +1244,32 @@  enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_CTX_HANDLE = 1,
 
+	/**
+	 * A value of 1 requests the inclusion of raw OA unit reports as
+	 * part of stream samples.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_OA,
+
+	/**
+	 * The value specifies which set of OA unit metrics should be
+	 * configured, defining the contents of any OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_METRICS_SET,
+
+	/**
+	 * The value specifies the size and layout of OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_FORMAT,
+
+	/**
+	 * Specifying this property implicitly requests periodic OA unit
+	 * sampling and (at least on Haswell) the sampling period is derived
+	 * from this exponent as follows:
+	 *
+	 *   80ns * 2^(period_exponent + 1)
+	 */
+	DRM_I915_PERF_PROP_OA_EXPONENT,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1251,7 +1289,22 @@  struct drm_i915_perf_open_param {
 	__u64 __user properties_ptr;
 };
 
+/**
+ * Enable data capture for a stream that was either opened in a disabled state
+ * via I915_PERF_FLAG_DISABLED or was later disabled via I915_PERF_IOCTL_DISABLE.
+ *
+ * It is intended to be cheaper to disable and enable a stream than it may be
+ * to close and re-open a stream with the same configuration.
+ *
+ * It's undefined whether any pending data for the stream will be lost.
+ */
 #define I915_PERF_IOCTL_ENABLE	_IO('i', 0x0)
+
+/**
+ * Disable data capture for a stream.
+ *
+ * It is an error to try and read a stream that is disabled.
+ */
 #define I915_PERF_IOCTL_DISABLE	_IO('i', 0x1)
 
 /**
@@ -1275,17 +1328,30 @@  enum drm_i915_perf_record_type {
 	 * every sample.
 	 *
 	 * The order of these sample properties given by userspace has no
-	 * affect on the ordering of data within a sample. The order will be
+	 * effect on the ordering of data within a sample. The order is
 	 * documented here.
 	 *
 	 * struct {
 	 *     struct drm_i915_perf_record_header header;
 	 *
-	 *     TODO: itemize extensible sample data here
+	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
 	DRM_I915_PERF_RECORD_SAMPLE = 1,
 
+	/*
+	 * Indicates that one or more OA reports were not written by the
+	 * hardware. This can happen for example if an MI_REPORT_PERF_COUNT
+	 * command collides with periodic sampling - which would be more likely
+	 * at higher sampling frequencies.
+	 */
+	DRM_I915_PERF_RECORD_OA_REPORT_LOST = 2,
+
+	/**
+	 * An error occurred that resulted in all pending OA reports being lost.
+	 */
+	DRM_I915_PERF_RECORD_OA_BUFFER_LOST = 3,
+
 	DRM_I915_PERF_RECORD_MAX /* non-ABI */
 };