[09/11] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines

Message ID 1455600439-18480-10-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived

Commit Message

sourab.gupta@intel.com Feb. 16, 2016, 5:27 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch extends the i915 perf framework to handle perf sample
collection for any given GPU engine. In particular, it adds support for
collecting the timestamp sample type, which can be requested for any
engine.
Note that still only a single stream instance can be opened at any given
time, though that stream may now be opened for any GPU engine for the
collection of timestamp samples.

So this patch doesn't yet add support for opening multiple concurrent
streams, but it lays the groundwork for that support to be added
subsequently. Part of this groundwork involves having separate command
stream buffers, per engine, for holding the samples generated, and
likewise keeping per-engine state in a few other data structures.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  33 ++-
 drivers/gpu/drm/i915/i915_perf.c | 578 +++++++++++++++++++++++++++------------
 drivers/gpu/drm/i915/i915_reg.h  |   2 +
 include/uapi/drm/i915_drm.h      |   7 +
 4 files changed, 431 insertions(+), 189 deletions(-)
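
As an illustration of the intended uapi usage, here is a minimal
userspace sketch (not part of this patch) of opening a stream that
samples timestamps on a given engine. Only DRM_I915_PERF_SAMPLE_TS_PROP
is added by this patch; the ring-select property comes from earlier in
this series and the name used for it below is assumed for illustration,
as is the exact shape of the open param struct.

/*
 * Hypothetical userspace sketch: open an i915 perf stream sampling
 * timestamps on a given engine. DRM_I915_PERF_RING_ID_PROP is an
 * assumed name for the ring-select property from earlier in the series.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int open_ts_stream(int drm_fd, uint64_t ring_id)
{
	uint64_t properties[] = {
		DRM_I915_PERF_RING_ID_PROP, ring_id,	/* assumed name */
		DRM_I915_PERF_SAMPLE_TS_PROP, 1,
	};
	struct drm_i915_perf_open_param param = {
		.num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
		.properties_ptr = (uintptr_t)properties,
	};

	/* On success, returns an fd to read() sample records from */
	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
}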

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cf86228..b1c952c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1725,6 +1725,7 @@  struct i915_perf_stream {
 
 	struct list_head link;
 
+	enum intel_ring_id ring_id;
 	u32 sample_flags;
 	int sample_size;
 
@@ -1734,6 +1735,9 @@  struct i915_perf_stream {
 	/* Whether command stream based data collection is enabled */
 	bool cs_mode;
 
+	/* Whether the OA unit is in use */
+	bool using_oa;
+
 	/* Enables the collection of HW samples, either in response to
 	 * I915_PERF_IOCTL_ENABLE or implicitly called when stream is
 	 * opened without I915_PERF_FLAG_DISABLED */
@@ -1782,7 +1786,8 @@  struct i915_perf_stream {
 	 * Routine to emit the commands in the command streamer associated
 	 * with the corresponding gpu engine.
 	 */
-	void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag);
+	void (*command_stream_hook)(struct i915_perf_stream *stream,
+				struct drm_i915_gem_request *req, u32 tag);
 };
 
 struct i915_oa_ops {
@@ -1807,7 +1812,16 @@  struct i915_oa_ops {
 struct i915_perf_cs_data_node {
 	struct list_head link;
 	struct drm_i915_gem_request *request;
-	u32 offset;
+
+	/* Offsets into the GEM obj holding the data */
+	u32 start_offset;
+	u32 oa_offset;
+	u32 ts_offset;
+
+	/* buffer size corresponding to this entry */
+	u32 size;
+
+	/* Other metadata */
 	u32 ctx_id;
 	u32 pid;
 	u32 tag;
@@ -2071,14 +2085,13 @@  struct drm_i915_private {
 
 		spinlock_t hook_lock;
 
-		struct {
-			struct i915_perf_stream *exclusive_stream;
+		struct hrtimer poll_check_timer;
+		struct i915_perf_stream *exclusive_stream;
+		wait_queue_head_t poll_wq[I915_NUM_RINGS];
 
+		struct {
 			u32 specific_ctx_id;
 
-			struct hrtimer poll_check_timer;
-			wait_queue_head_t poll_wq;
-
 			bool periodic;
 			u32 period_exponent;
 
@@ -2115,10 +2128,10 @@  struct drm_i915_private {
 			struct drm_i915_gem_object *obj;
 			struct i915_vma *vma;
 			u8 *addr;
-		} command_stream_buf;
+		} command_stream_buf[I915_NUM_RINGS];
 
-		struct list_head node_list;
-		spinlock_t node_list_lock;
+		struct list_head node_list[I915_NUM_RINGS];
+		spinlock_t node_list_lock[I915_NUM_RINGS];
 	} perf;
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 141f721..1d2712d 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -51,12 +51,17 @@  static u32 i915_perf_stream_paranoid = true;
 #define GEN8_OAREPORT_REASON_GO_TRANSITION  (1<<23)
 #define GEN9_OAREPORT_REASON_CLK_RATIO      (1<<24)
 
-/* Data common to periodic and RCS based samples */
-struct oa_sample_data {
+#define OA_ADDR_ALIGN 64
+#define TS_ADDR_ALIGN 8
+#define I915_PERF_TS_SAMPLE_SIZE 8
+
+/* Data common to all samples (periodic OA / CS based OA / Timestamps) */
+struct sample_data {
 	u32 source;
 	u32 ctx_id;
 	u32 pid;
 	u32 tag;
+	u64 ts;
 	const u8 *report;
 };
 
@@ -100,6 +105,7 @@  static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 #define SAMPLE_CTX_ID		(1<<2)
 #define SAMPLE_PID		(1<<3)
 #define SAMPLE_TAG		(1<<4)
+#define SAMPLE_TS		(1<<5)
 
 struct perf_open_properties
 {
@@ -136,8 +142,9 @@  void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag)
 
 	mutex_lock(&dev_priv->perf.streams_lock);
 	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
-		if (stream->enabled && stream->command_stream_hook)
-			stream->command_stream_hook(req, tag);
+		if (stream->enabled && (stream->ring_id == ring->id) &&
+				stream->command_stream_hook)
+			stream->command_stream_hook(stream, req, tag);
 	}
 	mutex_unlock(&dev_priv->perf.streams_lock);
 }
@@ -150,16 +157,15 @@  void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag)
  * eventually, when the request associated with new entry completes.
  */
 static void release_some_perf_entries(struct drm_i915_private *dev_priv,
-					u32 target_size)
+				enum intel_ring_id id, u32 target_size)
 {
 	struct i915_perf_cs_data_node *entry, *next;
-	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u32 size = 0;
 
 	list_for_each_entry_safe
-		(entry, next, &dev_priv->perf.node_list, link) {
+		(entry, next, &dev_priv->perf.node_list[id], link) {
 
-		size += entry_size;
+		size += entry->size;
 		i915_gem_request_unreference(entry->request);
 		list_del(&entry->link);
 		kfree(entry);
@@ -175,99 +181,117 @@  static void release_some_perf_entries(struct drm_i915_private *dev_priv,
  * the buffer, it will remove the oldest entries in order to make space.
  */
 static void insert_perf_entry(struct drm_i915_private *dev_priv,
+				struct i915_perf_stream *stream,
 				struct i915_perf_cs_data_node *entry)
 {
 	struct i915_perf_cs_data_node *first_entry, *last_entry;
-	int max_offset = dev_priv->perf.command_stream_buf.obj->base.size;
-	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
-
-	spin_lock(&dev_priv->perf.node_list_lock);
-	if (list_empty(&dev_priv->perf.node_list)) {
-		entry->offset = 0;
-		list_add_tail(&entry->link, &dev_priv->perf.node_list);
-		spin_unlock(&dev_priv->perf.node_list_lock);
-		return;
+	u32 sample_flags = stream->sample_flags;
+	enum intel_ring_id id = stream->ring_id;
+	int max_offset = dev_priv->perf.command_stream_buf[id].obj->base.size;
+	u32 offset, entry_size = 0;
+	bool sample_ts = false;
+
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		entry_size += dev_priv->perf.oa.oa_buffer.format_size;
+	else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: Since the timestamp can anyway be derived from the OA
+		 * report, there's no need to capture it separately for the
+		 * RCS ring when OA data is already being captured.
+		 */
+		entry_size += I915_PERF_TS_SAMPLE_SIZE;
+		sample_ts = true;
 	}
 
-	first_entry = list_first_entry(&dev_priv->perf.node_list,
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	if (list_empty(&dev_priv->perf.node_list[id])) {
+		offset = 0;
+		goto out;
+	}
+
+	first_entry = list_first_entry(&dev_priv->perf.node_list[id],
 				       typeof(*first_entry), link);
-	last_entry = list_last_entry(&dev_priv->perf.node_list,
-				     typeof(*first_entry), link);
+	last_entry = list_last_entry(&dev_priv->perf.node_list[id],
+				     typeof(*last_entry), link);
 
-	if (last_entry->offset >= first_entry->offset) {
+	if (last_entry->start_offset >= first_entry->start_offset) {
 		/* Sufficient space available at the end of buffer? */
-		if (last_entry->offset + 2*entry_size < max_offset)
-			entry->offset = last_entry->offset + entry_size;
+		if (last_entry->start_offset + last_entry->size + entry_size
+							< max_offset)
+			offset = last_entry->start_offset + last_entry->size;
 		/*
 		 * Wraparound condition. Is sufficient space available at
 		 * beginning of buffer?
 		 */
-		else if (entry_size < first_entry->offset)
-			entry->offset = 0;
+		else if (entry_size < first_entry->start_offset)
+			offset = 0;
 		/* Insufficient space. Overwrite existing old entries */
 		else {
-			u32 target_size = entry_size - first_entry->offset;
+			u32 target_size = entry_size -
+						first_entry->start_offset;
 
-			release_some_perf_entries(dev_priv, target_size);
-			entry->offset = 0;
+			release_some_perf_entries(dev_priv, id, target_size);
+			offset = 0;
 		}
 	} else {
 		/* Sufficient space available? */
-		if (last_entry->offset + 2*entry_size < first_entry->offset)
-			entry->offset = last_entry->offset + entry_size;
+		if (last_entry->start_offset + last_entry->size + entry_size
+						< first_entry->start_offset)
+			offset = last_entry->start_offset + last_entry->size;
 		/* Insufficient space. Overwrite existing old entries */
 		else {
 			u32 target_size = entry_size -
-				(first_entry->offset - last_entry->offset -
-				entry_size);
+				(first_entry->start_offset -
+					last_entry->start_offset -
+					last_entry->size);
 
-			release_some_perf_entries(dev_priv, target_size);
-			entry->offset = last_entry->offset + entry_size;
+			release_some_perf_entries(dev_priv, id, target_size);
+			offset = last_entry->start_offset + last_entry->size;
 		}
 	}
-	list_add_tail(&entry->link, &dev_priv->perf.node_list);
-	spin_unlock(&dev_priv->perf.node_list_lock);
+
+out:
+	entry->start_offset = offset;
+	entry->size = entry_size;
+	if (stream->sample_flags & SAMPLE_OA_REPORT) {
+		entry->oa_offset = offset;
+		/* Ensure 64 byte alignment of oa_offset */
+		entry->oa_offset = ALIGN(entry->oa_offset, OA_ADDR_ALIGN);
+		offset = entry->oa_offset +
+				dev_priv->perf.oa.oa_buffer.format_size;
+	}
+	if (sample_ts) {
+		entry->ts_offset = offset;
+		/* Ensure 8 byte alignment of ts_offset */
+		entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN);
+		offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE;
+	}
+
+	list_add_tail(&entry->link, &dev_priv->perf.node_list[id]);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 }
 
-static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
-						u32 tag)
+static int i915_perf_stream_capture_oa_report(struct drm_i915_gem_request *req,
+				u32 offset)
 {
 	struct intel_engine_cs *ring = req->ring;
 	struct intel_ringbuffer *ringbuf = req->ringbuf;
-	struct intel_context *ctx = req->ctx;
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	struct i915_perf_cs_data_node *entry;
 	u32 addr = 0;
 	int ret;
 
 	/* OA counters are only supported on the render ring */
 	BUG_ON(ring->id != RCS);
 
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (entry == NULL) {
-		DRM_ERROR("alloc failed\n");
-		return;
-	}
-
 	if (i915.enable_execlists)
 		ret = intel_logical_ring_begin(req, 4);
 	else
 		ret = intel_ring_begin(req, 4);
 
-	if (ret) {
-		kfree(entry);
-		return;
-	}
-
-	entry->ctx_id = ctx->global_id;
-	entry->pid = current->pid;
-	entry->tag = tag;
-	i915_gem_request_assign(&entry->request, req);
-
-	insert_perf_entry(dev_priv, entry);
+	if (ret)
+		return ret;
 
-	addr = dev_priv->perf.command_stream_buf.vma->node.start +
-		entry->offset;
+	addr = dev_priv->perf.command_stream_buf[RCS].vma->node.start + offset;
 
 	/* addr should be 64 byte aligned */
 	BUG_ON(addr & 0x3f);
@@ -295,10 +319,154 @@  static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
 		}
 		intel_ring_advance(ring);
 	}
-	i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, req);
+	return 0;
+}
+
+static int i915_perf_stream_capture_ts_data(struct drm_i915_gem_request *req,
+						u32 offset)
+{
+	struct intel_engine_cs *ring = req->ring;
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	u32 addr = 0;
+	int ret;
+
+	if (i915.enable_execlists)
+		ret = intel_logical_ring_begin(req, 6);
+	else
+		ret = intel_ring_begin(req, 6);
+
+	if (ret)
+		return ret;
+
+	addr = dev_priv->perf.command_stream_buf[ring->id].vma->node.start +
+		offset;
+
+	if (i915.enable_execlists) {
+		if (ring->id == RCS) {
+			intel_logical_ring_emit(ringbuf,
+						GFX_OP_PIPE_CONTROL(6));
+			intel_logical_ring_emit(ringbuf,
+						PIPE_CONTROL_GLOBAL_GTT_IVB |
+						PIPE_CONTROL_TIMESTAMP_WRITE);
+			intel_logical_ring_emit(ringbuf, addr |
+						PIPE_CONTROL_GLOBAL_GTT);
+			intel_logical_ring_emit(ringbuf, 0);
+			intel_logical_ring_emit(ringbuf, 0);
+			intel_logical_ring_emit(ringbuf, 0);
+		} else {
+			uint32_t cmd;
+
+			cmd = MI_FLUSH_DW + 2; /* Gen8+ */
+
+			cmd |= MI_FLUSH_DW_OP_STAMP;
+
+			intel_logical_ring_emit(ringbuf, cmd);
+			intel_logical_ring_emit(ringbuf, addr |
+						MI_FLUSH_DW_USE_GTT);
+			intel_logical_ring_emit(ringbuf, 0);
+			intel_logical_ring_emit(ringbuf, 0);
+			intel_logical_ring_emit(ringbuf, 0);
+			intel_logical_ring_emit(ringbuf, MI_NOOP);
+		}
+		intel_logical_ring_advance(ringbuf);
+	} else {
+		if (ring->id == RCS) {
+			if (INTEL_INFO(ring->dev)->gen >= 8)
+				intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
+			else
+				intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+			intel_ring_emit(ring,
+					PIPE_CONTROL_GLOBAL_GTT_IVB |
+					PIPE_CONTROL_TIMESTAMP_WRITE);
+			intel_ring_emit(ring, addr | PIPE_CONTROL_GLOBAL_GTT);
+			intel_ring_emit(ring, 0);
+			if (INTEL_INFO(ring->dev)->gen >= 8) {
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, 0);
+			} else {
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, MI_NOOP);
+			}
+		} else {
+			uint32_t cmd;
+
+			cmd = MI_FLUSH_DW + 1;
+			if (INTEL_INFO(ring->dev)->gen >= 8)
+				cmd += 1;
+
+			cmd |= MI_FLUSH_DW_OP_STAMP;
+
+			intel_ring_emit(ring, cmd);
+			intel_ring_emit(ring, addr | MI_FLUSH_DW_USE_GTT);
+			if (INTEL_INFO(ring->dev)->gen >= 8) {
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, 0);
+			} else {
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, 0);
+				intel_ring_emit(ring, MI_NOOP);
+			}
+			intel_ring_emit(ring, MI_NOOP);
+		}
+		intel_ring_advance(ring);
+	}
+	return 0;
+}
+
+static void i915_perf_stream_cs_hook(struct i915_perf_stream *stream,
+				struct drm_i915_gem_request *req, u32 tag)
+{
+	struct intel_engine_cs *ring = req->ring;
+	struct intel_context *ctx = req->ctx;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	enum intel_ring_id id = stream->ring_id;
+	u32 sample_flags = stream->sample_flags;
+	struct i915_perf_cs_data_node *entry;
+	int ret = 0;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		DRM_ERROR("alloc failed\n");
+		return;
+	}
+
+	entry->ctx_id = ctx->global_id;
+	entry->pid = current->pid;
+	entry->tag = tag;
+	i915_gem_request_assign(&entry->request, req);
+
+	insert_perf_entry(dev_priv, stream, entry);
+
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		ret = i915_perf_stream_capture_oa_report(req, entry->oa_offset);
+		if (ret)
+			goto err;
+	} else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: Since the timestamp can anyway be derived from the OA
+		 * report, there's no need to capture it separately for the
+		 * RCS ring when OA data is already being captured.
+		 */
+		ret = i915_perf_stream_capture_ts_data(req, entry->ts_offset);
+		if (ret)
+			goto err;
+	}
+
+	i915_vma_move_to_active(dev_priv->perf.command_stream_buf[id].vma, req);
+	return;
+
+err:
+	i915_gem_request_unreference(entry->request);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	list_del(&entry->link);
+	kfree(entry);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 }
 
-static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+static int i915_perf_wait_gpu(struct drm_i915_private *dev_priv,
+				enum intel_ring_id id)
 {
 	struct i915_perf_cs_data_node *last_entry = NULL;
 	struct drm_i915_gem_request *req = NULL;
@@ -309,14 +477,14 @@  static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
 	 * implicitly wait for the prior submitted requests. The refcount
 	 * of the requests is not decremented here.
 	 */
-	spin_lock(&dev_priv->perf.node_list_lock);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
 
-	if (!list_empty(&dev_priv->perf.node_list)) {
-		last_entry = list_last_entry(&dev_priv->perf.node_list,
+	if (!list_empty(&dev_priv->perf.node_list[id])) {
+		last_entry = list_last_entry(&dev_priv->perf.node_list[id],
 			struct i915_perf_cs_data_node, link);
 		req = last_entry->request;
 	}
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 	if (!req)
 		return 0;
@@ -331,17 +499,18 @@  static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
+static void i915_perf_free_requests(struct drm_i915_private *dev_priv,
+				enum intel_ring_id id)
 {
 	struct i915_perf_cs_data_node *entry, *next;
 
 	list_for_each_entry_safe
-		(entry, next, &dev_priv->perf.node_list, link) {
+		(entry, next, &dev_priv->perf.node_list[id], link) {
 		i915_gem_request_unreference__unlocked(entry->request);
 
-		spin_lock(&dev_priv->perf.node_list_lock);
+		spin_lock(&dev_priv->perf.node_list_lock[id]);
 		list_del(&entry->link);
-		spin_unlock(&dev_priv->perf.node_list_lock);
+		spin_unlock(&dev_priv->perf.node_list_lock[id]);
 		kfree(entry);
 	}
 }
@@ -381,9 +550,9 @@  static bool append_oa_status(struct i915_perf_stream *stream,
 	return true;
 }
 
-static bool append_oa_sample(struct i915_perf_stream *stream,
+static bool append_sample(struct i915_perf_stream *stream,
 			     struct i915_perf_read_state *read_state,
-			     struct oa_sample_data *data)
+			     struct sample_data *data)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -424,6 +593,13 @@  static bool append_oa_sample(struct i915_perf_stream *stream,
 		read_state->buf += 4;
 	}
 
+	if (sample_flags & SAMPLE_TS) {
+		if (copy_to_user(read_state->buf, &data->ts,
+					I915_PERF_TS_SAMPLE_SIZE))
+			return false;
+		read_state->buf += I915_PERF_TS_SAMPLE_SIZE;
+	}
+
 	if (sample_flags & SAMPLE_OA_REPORT) {
 		if (copy_to_user(read_state->buf, data->report, report_size))
 			return false;
@@ -441,7 +617,7 @@  static bool append_oa_buffer_sample(struct i915_perf_stream *stream,
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 sample_flags = stream->sample_flags;
-	struct oa_sample_data data = { 0 };
+	struct sample_data data = { 0 };
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
 		enum drm_i915_perf_oa_event_source source;
@@ -473,10 +649,15 @@  static bool append_oa_buffer_sample(struct i915_perf_stream *stream,
 	if (sample_flags & SAMPLE_TAG)
 		data.tag = 0;
 
+	/* Derive timestamp from OA report, after scaling with the ts base */
+#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report"
+	if (sample_flags & SAMPLE_TS)
+		data.ts = 0;
+
 	if (sample_flags & SAMPLE_OA_REPORT)
 		data.report = report;
 
-	append_oa_sample(stream, read_state, &data);
+	append_sample(stream, read_state, &data);
 
 	return true;
 }
@@ -528,7 +709,7 @@  static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
 			ctx_id &= 0xfffff;
 		}
 
-		if (dev_priv->perf.oa.exclusive_stream->enabled) {
+		if (stream->enabled) {
 
 			/* NB: For Gen 8 we handle per-context report filtering
 			 * ourselves instead of programming the OA unit with a
@@ -539,7 +720,7 @@  static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
 			 * first report belonging to any subsequently
 			 * switched-too context.
 			 */
-			if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+			if (!stream->ctx ||
 			    (dev_priv->perf.oa.specific_ctx_id == ctx_id ||
 			     (dev_priv->perf.oa.specific_ctx_id !=
 			      dev_priv->perf.oa.oa_buffer.last_ctx_id))) {
@@ -630,7 +811,7 @@  static u32 gen7_append_oa_reports(struct i915_perf_stream *stream,
 		if (report_ts > ts)
 			break;
 
-		if (dev_priv->perf.oa.exclusive_stream->enabled) {
+		if (stream->enabled) {
 			if (!append_oa_buffer_sample(stream, read_state,
 							report))
 				break;
@@ -687,24 +868,32 @@  static void gen7_oa_read(struct i915_perf_stream *stream,
 				    OA_MEM_SELECT_GGTT);
 }
 
-static bool append_oa_rcs_sample(struct i915_perf_stream *stream,
+static bool append_one_cs_sample(struct i915_perf_stream *stream,
 				 struct i915_perf_read_state *read_state,
 				 struct i915_perf_cs_data_node *node)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct oa_sample_data data = { 0 };
-	const u8 *report = dev_priv->perf.command_stream_buf.addr +
-				node->offset;
+	enum intel_ring_id id = stream->ring_id;
+	struct sample_data data = { 0 };
 	u32 sample_flags = stream->sample_flags;
-	u32 report_ts;
 
-	/*
-	 * Forward the periodic OA samples which have the timestamp lower
-	 * than timestamp of this sample, before forwarding this sample.
-	 * This ensures samples read by user are order acc. to their timestamps
-	 */
-	report_ts = *(u32 *)(report + 4);
-	dev_priv->perf.oa.ops.read(stream, read_state, report_ts);
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		const u8 *report = dev_priv->perf.command_stream_buf[id].addr +
+				   node->oa_offset;
+		u32 sample_ts = *(u32 *)(report + 4);
+
+		BUG_ON(id != RCS);
+
+		data.report = report;
+
+		/*
+		 * Forward the periodic OA samples which have a timestamp
+		 * lower than that of this sample, before forwarding this
+		 * sample. This ensures the samples read by the user are
+		 * ordered according to their timestamps.
+		 */
+		dev_priv->perf.oa.ops.read(stream, read_state, sample_ts);
+	}
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO)
 		data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
@@ -718,38 +907,51 @@  static bool append_oa_rcs_sample(struct i915_perf_stream *stream,
 	if (sample_flags & SAMPLE_TAG)
 		data.tag = node->tag;
 
-	if (sample_flags & SAMPLE_OA_REPORT)
-		data.report = report;
+	if (sample_flags & SAMPLE_TS) {
+		/* For RCS, derive timestamp from OA report, after
+		/* For RCS, derive the timestamp from the OA report, after
+		 * scaling with the timestamp base. For other rings, forward
+		 * the timestamp collected via the command stream.
+#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report"
+		if (sample_flags & SAMPLE_OA_REPORT)
+			data.ts = 0;
+		else
+			data.ts = *(u64 *)
+				(dev_priv->perf.command_stream_buf[id].addr +
+					node->ts_offset);
+	}
 
-	append_oa_sample(stream, read_state, &data);
+	append_sample(stream, read_state, &data);
 
 	return true;
 }
 
-static void oa_rcs_append_reports(struct i915_perf_stream *stream,
+static void append_command_stream_samples(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum intel_ring_id id = stream->ring_id;
 	struct i915_perf_cs_data_node *entry, *next;
 
 	list_for_each_entry_safe(entry, next,
-				 &dev_priv->perf.node_list, link) {
+				 &dev_priv->perf.node_list[id], link) {
 		if (!i915_gem_request_completed(entry->request, true))
 			break;
 
-		if (!append_oa_rcs_sample(stream, read_state, entry))
+		if (!append_one_cs_sample(stream, read_state, entry))
 			break;
 
-		spin_lock(&dev_priv->perf.node_list_lock);
+		spin_lock(&dev_priv->perf.node_list_lock[id]);
 		list_del(&entry->link);
-		spin_unlock(&dev_priv->perf.node_list_lock);
+		spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 		i915_gem_request_unreference__unlocked(entry->request);
 		kfree(entry);
 	}
 
-	/* Flush any remaining periodic reports */
-	dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
+	/* Flush any remaining periodic OA reports in the RCS case */
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
 }
 
 static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
@@ -757,7 +959,7 @@  static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (stream->cs_mode)
-		return list_empty(&dev_priv->perf.node_list);
+		return list_empty(&dev_priv->perf.node_list[stream->ring_id]);
 	else
 		return true;
 }
@@ -772,63 +974,69 @@  static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 	 * can't be destroyed until completion (such as a read()) that ensures
 	 * the device + OA buffer can't disappear
 	 */
-	return !(dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv) &&
-		 command_stream_buf_is_empty(stream));
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		return !(dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv) &&
+			command_stream_buf_is_empty(stream));
+	else
+		return !command_stream_buf_is_empty(stream);
 }
 
-static bool i915_oa_can_read(struct i915_perf_stream *stream)
+static bool i915_perf_stream_can_read(struct i915_perf_stream *stream)
 {
 
 	return stream_have_data__unlocked(stream);
 }
 
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+static int i915_perf_stream_wait_unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum intel_ring_id id = stream->ring_id;
 	int ret;
 
 	if (stream->cs_mode) {
-		ret = i915_oa_rcs_wait_gpu(dev_priv);
+		ret = i915_perf_wait_gpu(dev_priv, id);
 		if (ret)
 			return ret;
 	}
 
-	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
+	return wait_event_interruptible(dev_priv->perf.poll_wq[id],
 					stream_have_data__unlocked(stream));
 }
 
-static void i915_oa_poll_wait(struct i915_perf_stream *stream,
+static void i915_perf_stream_poll_wait(struct i915_perf_stream *stream,
 			      struct file *file,
 			      poll_table *wait)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
+	poll_wait(file, &dev_priv->perf.poll_wq[stream->ring_id], wait);
 }
 
-static void i915_oa_read(struct i915_perf_stream *stream,
+static void i915_perf_stream_read(struct i915_perf_stream *stream,
 			 struct i915_perf_read_state *read_state)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (stream->cs_mode)
-		oa_rcs_append_reports(stream, read_state);
-	else
+		append_command_stream_samples(stream, read_state);
+	else if (stream->ring_id == RCS)
 		dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
 }
 
 static void
-free_command_stream_buf(struct drm_i915_private *i915)
+free_command_stream_buf(struct drm_i915_private *i915,
+				enum intel_ring_id id)
 {
 	mutex_lock(&i915->dev->struct_mutex);
 
-	vunmap(i915->perf.command_stream_buf.addr);
-	i915_gem_object_ggtt_unpin(i915->perf.command_stream_buf.obj);
-	drm_gem_object_unreference(&i915->perf.command_stream_buf.obj->base);
+	vunmap(i915->perf.command_stream_buf[id].addr);
+	i915_gem_object_ggtt_unpin(i915->perf.command_stream_buf[id].obj);
+	drm_gem_object_unreference(
+		&i915->perf.command_stream_buf[id].obj->base);
 
-	i915->perf.command_stream_buf.obj = NULL;
-	i915->perf.command_stream_buf.vma = NULL;
-	i915->perf.command_stream_buf.addr = NULL;
+	i915->perf.command_stream_buf[id].obj = NULL;
+	i915->perf.command_stream_buf[id].vma = NULL;
+	i915->perf.command_stream_buf[id].addr = NULL;
 
 	mutex_unlock(&i915->dev->struct_mutex);
 }
@@ -849,16 +1057,13 @@  free_oa_buffer(struct drm_i915_private *i915)
 	mutex_unlock(&i915->dev->struct_mutex);
 }
 
-static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+static void i915_perf_stream_destroy(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
+	BUG_ON(stream != dev_priv->perf.exclusive_stream);
 
-	if (stream->cs_mode)
-		free_command_stream_buf(dev_priv);
-
-	if (dev_priv->perf.oa.oa_buffer.obj) {
+	if (stream->using_oa) {
 		dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
 		free_oa_buffer(dev_priv);
@@ -867,7 +1072,10 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 		intel_runtime_pm_put(dev_priv);
 	}
 
-	dev_priv->perf.oa.exclusive_stream = NULL;
+	if (stream->cs_mode)
+		free_command_stream_buf(dev_priv, stream->ring_id);
+
+	dev_priv->perf.exclusive_stream = NULL;
 }
 
 static void *vmap_oa_buffer(struct drm_i915_gem_object *obj)
@@ -993,27 +1201,28 @@  static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv,
+					enum intel_ring_id id)
 {
 	struct drm_i915_gem_object *bo;
 	int ret;
 
-	BUG_ON(dev_priv->perf.command_stream_buf.obj);
+	BUG_ON(dev_priv->perf.command_stream_buf[id].obj);
 
 	ret = alloc_obj(dev_priv, &bo);
 	if (ret)
 		return ret;
 
-	dev_priv->perf.command_stream_buf.obj = bo;
-	dev_priv->perf.command_stream_buf.vma = i915_gem_obj_to_ggtt(bo);
-	dev_priv->perf.command_stream_buf.addr = vmap_oa_buffer(bo);
-	INIT_LIST_HEAD(&dev_priv->perf.node_list);
+	dev_priv->perf.command_stream_buf[id].obj = bo;
+	dev_priv->perf.command_stream_buf[id].vma = i915_gem_obj_to_ggtt(bo);
+	dev_priv->perf.command_stream_buf[id].addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->perf.node_list[id]);
 
 	DRM_DEBUG_DRIVER(
 		"command stream buf initialized, gtt offset = 0x%x, vaddr = %p",
 		 (unsigned int)
-		 dev_priv->perf.command_stream_buf.vma->node.start,
-		 dev_priv->perf.command_stream_buf.addr);
+		 dev_priv->perf.command_stream_buf[id].vma->node.start,
+		 dev_priv->perf.command_stream_buf[id].addr);
 
 	return 0;
 }
@@ -1225,17 +1434,17 @@  static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
 
-	if (dev_priv->perf.oa.exclusive_stream->enabled) {
+	if (dev_priv->perf.exclusive_stream->enabled) {
 		unsigned long ctx_id = 0;
 		bool pinning_ok = false;
 
-		if (dev_priv->perf.oa.exclusive_stream->ctx &&
+		if (dev_priv->perf.exclusive_stream->ctx &&
 		    dev_priv->perf.oa.specific_ctx_id) {
 			ctx_id = dev_priv->perf.oa.specific_ctx_id;
 			pinning_ok = true;
 		}
 
-		if (dev_priv->perf.oa.exclusive_stream->ctx == NULL ||
+		if (dev_priv->perf.exclusive_stream->ctx == NULL ||
 		    pinning_ok) {
 			bool periodic = dev_priv->perf.oa.periodic;
 			u32 period_exponent = dev_priv->perf.oa.period_exponent;
@@ -1292,17 +1501,18 @@  static void gen8_oa_enable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN8_OAHEADPTR, tail);
 }
 
-static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+static void i915_perf_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	dev_priv->perf.oa.ops.oa_enable(dev_priv);
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
 
 	if (stream->cs_mode)
-		stream->command_stream_hook = i915_perf_command_stream_hook_oa;
+		stream->command_stream_hook = i915_perf_stream_cs_hook;
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_start(&dev_priv->perf.poll_check_timer,
 			      ns_to_ktime(POLL_PERIOD),
 			      HRTIMER_MODE_REL_PINNED);
 }
@@ -1317,23 +1527,24 @@  static void gen8_oa_disable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN8_OACONTROL, 0);
 }
 
-static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+static void i915_perf_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_cancel(&dev_priv->perf.poll_check_timer);
 
 	if (stream->cs_mode) {
 		stream->command_stream_hook = NULL;
-		i915_oa_rcs_wait_gpu(dev_priv);
-		i915_oa_rcs_free_requests(dev_priv);
+		i915_perf_wait_gpu(dev_priv, stream->ring_id);
+		i915_perf_free_requests(dev_priv, stream->ring_id);
 	}
 
-	dev_priv->perf.oa.ops.oa_disable(dev_priv);
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
 }
 
-static int i915_oa_stream_init(struct i915_perf_stream *stream,
+static int i915_perf_stream_init(struct i915_perf_stream *stream,
 			       struct drm_i915_perf_open_param *param,
 			       struct perf_open_properties *props)
 {
@@ -1341,15 +1552,15 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
 						      SAMPLE_OA_SOURCE_INFO);
 	bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
-						      SAMPLE_TAG);
-	int format_size;
+						      SAMPLE_TAG |
+						      SAMPLE_TS);
 	int ret;
 
 	/* To avoid the complexity of having to accurately filter
 	 * counter reports and marshal to the appropriate client
 	 * we currently only allow exclusive access */
-	if (dev_priv->perf.oa.exclusive_stream) {
-		DRM_ERROR("OA unit already in use\n");
+	if (dev_priv->perf.exclusive_stream) {
+		DRM_ERROR("Stream already in use\n");
 		return -EBUSY;
 	}
 
@@ -1364,6 +1575,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
 	if (require_oa_unit) {
+		int format_size;
 		if (!dev_priv->perf.oa.ops.init_oa_buffer) {
 			DRM_ERROR("OA unit not supported\n");
 			return -ENODEV;
@@ -1386,6 +1598,8 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			return -EINVAL;
 		}
 
+		stream->using_oa = true;
+
 		format_size =
 			dev_priv->perf.oa.oa_formats[props->oa_format].size;
 
@@ -1452,7 +1666,8 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	}
 
 	if (require_cs_mode && !props->cs_mode) {
-		DRM_ERROR("PID or TAG sampling require a ring to be specified");
+		DRM_ERROR(
+			"PID, TAG or TS sampling requires a ring to be specified");
 		ret = -EINVAL;
 		goto cs_error;
 	}
@@ -1472,6 +1687,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		}
 
 		stream->cs_mode = true;
+		stream->ring_id = props->ring_id;
 
 		if (props->sample_flags & SAMPLE_PID) {
 			stream->sample_flags |= SAMPLE_PID;
@@ -1483,20 +1699,25 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			stream->sample_size += 4;
 		}
 
-		ret = alloc_command_stream_buf(dev_priv);
+		if (props->sample_flags & SAMPLE_TS) {
+			stream->sample_flags |= SAMPLE_TS;
+			stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+		}
+
+		ret = alloc_command_stream_buf(dev_priv, stream->ring_id);
 		if (ret)
 			goto cs_error;
 	}
 
-	dev_priv->perf.oa.exclusive_stream = stream;
+	dev_priv->perf.exclusive_stream = stream;
 
-	stream->destroy = i915_oa_stream_destroy;
-	stream->enable = i915_oa_stream_enable;
-	stream->disable = i915_oa_stream_disable;
-	stream->can_read = i915_oa_can_read;
-	stream->wait_unlocked = i915_oa_wait_unlocked;
-	stream->poll_wait = i915_oa_poll_wait;
-	stream->read = i915_oa_read;
+	stream->destroy = i915_perf_stream_destroy;
+	stream->enable = i915_perf_stream_enable;
+	stream->disable = i915_perf_stream_disable;
+	stream->can_read = i915_perf_stream_can_read;
+	stream->wait_unlocked = i915_perf_stream_wait_unlocked;
+	stream->poll_wait = i915_perf_stream_poll_wait;
+	stream->read = i915_perf_stream_read;
 
 	return 0;
 
@@ -1530,8 +1751,8 @@  static void i915_oa_context_pin_notify_locked(struct drm_i915_private *dev_priv,
 	    dev_priv->perf.oa.ops.update_hw_ctx_id_locked == NULL)
 		return;
 
-	if (dev_priv->perf.oa.exclusive_stream &&
-	    dev_priv->perf.oa.exclusive_stream->ctx == context) {
+	if (dev_priv->perf.exclusive_stream &&
+	    dev_priv->perf.exclusive_stream->ctx == context) {
 		struct drm_i915_gem_object *obj =
 			context->legacy_hw_ctx.rcs_state;
 		u32 ctx_id = i915_gem_obj_ggtt_offset(obj);
@@ -1599,8 +1820,8 @@  void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
 	if (dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked == NULL)
 		return;
 
-	if (dev_priv->perf.oa.exclusive_stream &&
-	    dev_priv->perf.oa.exclusive_stream->enabled) {
+	if (dev_priv->perf.exclusive_stream &&
+	    dev_priv->perf.exclusive_stream->enabled) {
 
 		/* XXX: We don't take a lock here and this may run
 		 * async with respect to stream methods. Notably we
@@ -1729,7 +1950,7 @@  static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
 
 	struct drm_i915_private *dev_priv =
 		container_of(hrtimer, typeof(*dev_priv),
-			     perf.oa.poll_check_timer);
+			     perf.poll_check_timer);
 
 	/* No need to protect the streams list here, since the hrtimer is
 	 * disabled before the stream is removed from list, and currently a
@@ -1738,7 +1959,7 @@  static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
 	 */
 	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
 		if (stream_have_data__unlocked(stream))
-			wake_up(&dev_priv->perf.oa.poll_wq);
+			wake_up(&dev_priv->perf.poll_wq[stream->ring_id]);
 	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -1947,7 +2168,7 @@  int i915_perf_open_ioctl_locked(struct drm_device *dev,
 	stream->dev_priv = dev_priv;
 	stream->ctx = specific_ctx;
 
-	ret = i915_oa_stream_init(stream, param, props);
+	ret = i915_perf_stream_init(stream, param, props);
 	if (ret)
 		goto err_alloc;
 
@@ -2088,13 +2309,6 @@  static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 				if (ring_id >= LAST_USER_RING)
 					return -EINVAL;
 
-				/* XXX: Currently only RCS is supported.
-				 * Remove this check when support for other
-				 * rings is added
-				 */
-				if (ring_id != RCS)
-					return -EINVAL;
-
 				props->cs_mode = true;
 				props->ring_id = ring_id;
 			}
@@ -2108,6 +2322,9 @@  static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_SAMPLE_TAG_PROP:
 			props->sample_flags |= SAMPLE_TAG;
 			break;
+		case DRM_I915_PERF_SAMPLE_TS_PROP:
+			props->sample_flags |= SAMPLE_TS;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			BUG();
 		}
@@ -2193,6 +2410,7 @@  static struct ctl_table dev_root[] = {
 void i915_perf_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
+	int i;
 
 	if (!(IS_HASWELL(dev) ||
 	      IS_BROADWELL(dev) || IS_CHERRYVIEW(dev) ||
@@ -2204,16 +2422,18 @@  void i915_perf_init(struct drm_device *dev)
 	if (!dev_priv->perf.metrics_kobj)
 		return;
 
-	hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+	hrtimer_init(&dev_priv->perf.poll_check_timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	dev_priv->perf.oa.poll_check_timer.function = poll_check_timer_cb;
-	init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+	dev_priv->perf.poll_check_timer.function = poll_check_timer_cb;
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		spin_lock_init(&dev_priv->perf.node_list_lock[i]);
+		init_waitqueue_head(&dev_priv->perf.poll_wq[i]);
+	}
 
 	INIT_LIST_HEAD(&dev_priv->perf.streams);
 	mutex_init(&dev_priv->perf.lock);
 	mutex_init(&dev_priv->perf.streams_lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
-	spin_lock_init(&dev_priv->perf.node_list_lock);
 
 	if (IS_HASWELL(dev)) {
 		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a333038..e244626 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -359,6 +359,7 @@ 
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
+#define   MI_FLUSH_DW_OP_STAMP		(3<<14)
 #define   MI_FLUSH_DW_OP_MASK		(3<<14)
 #define   MI_FLUSH_DW_NOTIFY		(1<<8)
 #define   MI_INVALIDATE_BSD		(1<<7)
@@ -438,6 +439,7 @@ 
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR		(1<<16)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
+#define   PIPE_CONTROL_TIMESTAMP_WRITE			(3<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK                (3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 5687080..2570f3ea 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1236,6 +1236,12 @@  enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_SAMPLE_TAG_PROP,
 
+	/**
+	 * Setting this property to 1 requests the inclusion of a timestamp
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_SAMPLE_TS_PROP,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1287,6 +1293,7 @@  enum drm_i915_perf_record_type {
 	 *     { u32 ctx_id; } && DRM_I915_PERF_SAMPLE_CTX_ID_PROP
 	 *     { u32 pid; } && DRM_I915_PERF_SAMPLE_PID_PROP
 	 *     { u32 tag; } && DRM_I915_PERF_SAMPLE_TAG_PROP
+	 *     { u64 timestamp; } && DRM_I915_PERF_SAMPLE_TS_PROP
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_SAMPLE_OA_PROP
 	 * };
 	 */
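
Given the record layout above, here is a minimal sketch (not part of
this patch) of how userspace might walk the sample records returned by
read() on the stream fd, assuming a stream opened with only the PID and
TS sample properties, so each sample body is { u32 pid; u64 ts; } per
the layout comment.

/*
 * Minimal sketch: walk sample records read() from the stream fd.
 * memcpy() is used since the u64 timestamp may be unaligned in the
 * packed record body.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <drm/i915_drm.h>

static void parse_samples(const uint8_t *buf, size_t len)
{
	struct drm_i915_perf_record_header hdr;

	while (len >= sizeof(hdr)) {
		memcpy(&hdr, buf, sizeof(hdr));
		if (hdr.size < sizeof(hdr) || hdr.size > len)
			break;	/* malformed or truncated record */

		if (hdr.type == DRM_I915_PERF_RECORD_SAMPLE) {
			uint32_t pid;
			uint64_t ts;

			memcpy(&pid, buf + sizeof(hdr), sizeof(pid));
			memcpy(&ts, buf + sizeof(hdr) + sizeof(pid),
			       sizeof(ts));
			printf("pid %u ts %llu\n", pid,
			       (unsigned long long)ts);
		}

		buf += hdr.size;
		len -= hdr.size;
	}
}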