[RFC,6/8] drm/i915: Routines for inserting OA capture commands in the ringbuffer
diff mbox

Message ID 1434966619-3979-7-git-send-email-sourab.gupta@intel.com
State New
Headers show

Commit Message

sourab.gupta@intel.com June 22, 2015, 9:50 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch introduces the routines which insert commands for capturing OA
snapshots into the ringbuffer for the RCS engine.
The MI_REPORT_PERF_COUNT command can be used to capture snapshots of OA
counters. The routines introduced in this patch can be called to insert these
commands at appropriate points during workload submission.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c     |  1 +
 drivers/gpu/drm/i915/i915_drv.h     |  3 ++
 drivers/gpu/drm/i915/i915_oa_perf.c | 86 +++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)

Comments

Daniel Vetter June 22, 2015, 3:55 p.m. UTC | #1
On Mon, Jun 22, 2015 at 03:20:17PM +0530, sourab.gupta@intel.com wrote:
> From: Sourab Gupta <sourab.gupta@intel.com>
> 
> This patch introduces the routines which insert commands for capturing OA
> snapshots, into the ringbuffer for RCS engine.
> The command MI_REPORT_PERF_COUNT can be used to capture	snapshots of OA
> counters. The routines introduced in this patch can be called to insert these
> commands at appropriate points during workload submission
> 
> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c     |  1 +
>  drivers/gpu/drm/i915/i915_drv.h     |  3 ++
>  drivers/gpu/drm/i915/i915_oa_perf.c | 86 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 90 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 0553f20..f12feaa 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -821,6 +821,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  	/* Must at least be registered before trying to pin any context
>  	 * otherwise i915_oa_context_pin_notify() will lock an un-initialized
>  	 * spinlock, upsetting lockdep checks */
> +	INIT_LIST_HEAD(&dev_priv->profile_cmd);
>  	i915_oa_pmu_register(dev);
>  
>  	intel_pm_setup(dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5453842..798da49 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1982,6 +1982,8 @@ struct drm_i915_private {
>  		struct work_struct work_event_stop;
>  		struct completion complete;
>  	} oa_pmu;
> +
> +	struct list_head profile_cmd;

Adding a list for just one entry (or maybe 2-3) is imo total overkill, but
what it achieves is making the code much harder to read. Please remove.

Also please don't split up patches by first adding dead code which isn't
called anywhere and then a 2nd patch which just adds 1-2 places that call
the new functions. Understanding the calling context and place of a
function is very important for doing the review; by splitting things up
like that you force reviewers to read ahead and jump around in the patch
series, which is inefficient.

If the code would otherwise be too large for 1 patch (not the case here)
then a good practice is to first introduce the scaffolding of function
calls and later on fill out the guts of each.
-Daniel

>  #endif
>  
>  	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
> @@ -3162,6 +3164,7 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
>  				struct intel_context *context);
>  void i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
>  				  struct intel_context *context);
> +void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id);
>  #else
>  static inline void
>  i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> index 5d63dab..b02850c 100644
> --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> @@ -25,6 +25,76 @@ static int hsw_perf_format_sizes[] = {
>  	64   /* C4_B8_HSW */
>  };
>  
> +struct drm_i915_insert_cmd {
> +	struct list_head list;
> +	void (*insert_cmd)(struct intel_ringbuffer *ringbuf, u32 ctx_id);
> +};
> +
> +void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
> +{
> +	struct intel_engine_cs *ring = ringbuf->ring;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct drm_i915_insert_cmd *entry;
> +
> +	list_for_each_entry(entry, &dev_priv->profile_cmd, list)
> +		entry->insert_cmd(ringbuf, ctx_id);
> +}
> +
> +void i915_oa_insert_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
> +{
> +	struct intel_engine_cs *ring = ringbuf->ring;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct drm_i915_oa_async_node_info *node_info = NULL;
> +	struct drm_i915_oa_async_queue_header *queue_hdr =
> +			(struct drm_i915_oa_async_queue_header *)
> +			dev_priv->oa_pmu.oa_async_buffer.addr;
> +	void *data_ptr = (u8 *)queue_hdr + queue_hdr->data_offset;
> +	int data_size =	(queue_hdr->size_in_bytes - queue_hdr->data_offset);
> +	u32 data_offset, addr = 0;
> +	int ret;
> +
> +	struct drm_i915_oa_async_node *nodes = data_ptr;
> +	int num_nodes = 0;
> +	int index = 0;
> +
> +	/* OA counters are only supported on the render ring */
> +	if (ring->id != RCS)
> +		return;
> +
> +	num_nodes = data_size / sizeof(*nodes);
> +	index = queue_hdr->node_count % num_nodes;
> +
> +	data_offset = offsetof(struct drm_i915_oa_async_node, report_perf);
> +
> +	addr = i915_gem_obj_ggtt_offset(dev_priv->oa_pmu.oa_async_buffer.obj) +
> +		queue_hdr->data_offset +
> +		index * sizeof(struct drm_i915_oa_async_node) +
> +		data_offset;
> +
> +	/* addr should be 64 byte aligned */
> +	BUG_ON(addr & 0x3f);
> +
> +	ret = intel_ring_begin(ring, 4);
> +	if (ret)
> +		return;
> +
> +	intel_ring_emit(ring, MI_REPORT_PERF_COUNT | (1<<0));
> +	intel_ring_emit(ring, addr | MI_REPORT_PERF_COUNT_GGTT);
> +	intel_ring_emit(ring, ring->outstanding_lazy_request->seqno);
> +	intel_ring_emit(ring, MI_NOOP);
> +	intel_ring_advance(ring);
> +
> +	node_info = &nodes[index].node_info;
> +	i915_gem_request_assign(&node_info->req,
> +				ring->outstanding_lazy_request);
> +
> +	node_info->pid = current->pid;
> +	node_info->ctx_id = ctx_id;
> +	queue_hdr->node_count++;
> +	if (queue_hdr->node_count > num_nodes)
> +		queue_hdr->wrap_count++;
> +}
> +
>  static void init_oa_async_buf_queue(struct drm_i915_private *dev_priv)
>  {
>  	struct drm_i915_oa_async_queue_header *hdr =
> @@ -865,6 +935,7 @@ void i915_oa_async_stop_work_fn(struct work_struct *__work)
>  		container_of(__work, typeof(*dev_priv),
>  			oa_pmu.work_event_stop);
>  	struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
> +	struct drm_i915_insert_cmd *entry, *next;
>  	struct drm_i915_oa_async_queue_header *hdr =
>  		(struct drm_i915_oa_async_queue_header *)
>  		dev_priv->oa_pmu.oa_async_buffer.addr;
> @@ -882,6 +953,13 @@ void i915_oa_async_stop_work_fn(struct work_struct *__work)
>  	if (ret)
>  		return;
>  
> +	list_for_each_entry_safe(entry, next, &dev_priv->profile_cmd, list) {
> +		if (entry->insert_cmd == i915_oa_insert_cmd) {
> +			list_del(&entry->list);
> +			kfree(entry);
> +		}
> +	}
> +
>  	dev_priv->oa_pmu.event_active = false;
>  
>  	i915_oa_async_wait_gpu(dev_priv);
> @@ -920,8 +998,14 @@ static void i915_oa_event_start(struct perf_event *event, int flags)
>  	struct drm_i915_private *dev_priv =
>  		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
>  	unsigned long lock_flags;
> +	struct drm_i915_insert_cmd *entry;
>  	u32 oastatus1, tail;
>  
> +	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
> +	if (!entry)
> +		return;
> +	entry->insert_cmd = i915_oa_insert_cmd;
> +
>  	if (dev_priv->oa_pmu.metrics_set == I915_OA_METRICS_SET_3D) {
>  		config_oa_regs(dev_priv, i915_oa_3d_mux_config_hsw,
>  				i915_oa_3d_mux_config_hsw_len);
> @@ -976,6 +1060,8 @@ static void i915_oa_event_start(struct perf_event *event, int flags)
>  	dev_priv->oa_pmu.event_active = true;
>  	update_oacontrol(dev_priv);
>  
> +	list_add_tail(&entry->list, &dev_priv->profile_cmd);
> +
>  	/* Reset the head ptr to ensure we don't forward reports relating
>  	 * to a previous perf event */
>  	oastatus1 = I915_READ(GEN7_OASTATUS1);
> -- 
> 1.8.5.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0553f20..f12feaa 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -821,6 +821,7 @@  int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	/* Must at least be registered before trying to pin any context
 	 * otherwise i915_oa_context_pin_notify() will lock an un-initialized
 	 * spinlock, upsetting lockdep checks */
+	INIT_LIST_HEAD(&dev_priv->profile_cmd);
 	i915_oa_pmu_register(dev);
 
 	intel_pm_setup(dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5453842..798da49 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1982,6 +1982,8 @@  struct drm_i915_private {
 		struct work_struct work_event_stop;
 		struct completion complete;
 	} oa_pmu;
+
+	struct list_head profile_cmd;
 #endif
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -3162,6 +3164,7 @@  void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 				struct intel_context *context);
 void i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
 				  struct intel_context *context);
+void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id);
 #else
 static inline void
 i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index 5d63dab..b02850c 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -25,6 +25,76 @@  static int hsw_perf_format_sizes[] = {
 	64   /* C4_B8_HSW */
 };
 
+struct drm_i915_insert_cmd {
+	struct list_head list;
+	void (*insert_cmd)(struct intel_ringbuffer *ringbuf, u32 ctx_id);
+};
+
+void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct drm_i915_insert_cmd *entry;
+
+	list_for_each_entry(entry, &dev_priv->profile_cmd, list)
+		entry->insert_cmd(ringbuf, ctx_id);
+}
+
+void i915_oa_insert_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct drm_i915_oa_async_node_info *node_info = NULL;
+	struct drm_i915_oa_async_queue_header *queue_hdr =
+			(struct drm_i915_oa_async_queue_header *)
+			dev_priv->oa_pmu.oa_async_buffer.addr;
+	void *data_ptr = (u8 *)queue_hdr + queue_hdr->data_offset;
+	int data_size =	(queue_hdr->size_in_bytes - queue_hdr->data_offset);
+	u32 data_offset, addr = 0;
+	int ret;
+
+	struct drm_i915_oa_async_node *nodes = data_ptr;
+	int num_nodes = 0;
+	int index = 0;
+
+	/* OA counters are only supported on the render ring */
+	if (ring->id != RCS)
+		return;
+
+	num_nodes = data_size / sizeof(*nodes);
+	index = queue_hdr->node_count % num_nodes;
+
+	data_offset = offsetof(struct drm_i915_oa_async_node, report_perf);
+
+	addr = i915_gem_obj_ggtt_offset(dev_priv->oa_pmu.oa_async_buffer.obj) +
+		queue_hdr->data_offset +
+		index * sizeof(struct drm_i915_oa_async_node) +
+		data_offset;
+
+	/* addr should be 64 byte aligned */
+	BUG_ON(addr & 0x3f);
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return;
+
+	intel_ring_emit(ring, MI_REPORT_PERF_COUNT | (1<<0));
+	intel_ring_emit(ring, addr | MI_REPORT_PERF_COUNT_GGTT);
+	intel_ring_emit(ring, ring->outstanding_lazy_request->seqno);
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	node_info = &nodes[index].node_info;
+	i915_gem_request_assign(&node_info->req,
+				ring->outstanding_lazy_request);
+
+	node_info->pid = current->pid;
+	node_info->ctx_id = ctx_id;
+	queue_hdr->node_count++;
+	if (queue_hdr->node_count > num_nodes)
+		queue_hdr->wrap_count++;
+}
+
 static void init_oa_async_buf_queue(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_oa_async_queue_header *hdr =
@@ -865,6 +935,7 @@  void i915_oa_async_stop_work_fn(struct work_struct *__work)
 		container_of(__work, typeof(*dev_priv),
 			oa_pmu.work_event_stop);
 	struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
+	struct drm_i915_insert_cmd *entry, *next;
 	struct drm_i915_oa_async_queue_header *hdr =
 		(struct drm_i915_oa_async_queue_header *)
 		dev_priv->oa_pmu.oa_async_buffer.addr;
@@ -882,6 +953,13 @@  void i915_oa_async_stop_work_fn(struct work_struct *__work)
 	if (ret)
 		return;
 
+	list_for_each_entry_safe(entry, next, &dev_priv->profile_cmd, list) {
+		if (entry->insert_cmd == i915_oa_insert_cmd) {
+			list_del(&entry->list);
+			kfree(entry);
+		}
+	}
+
 	dev_priv->oa_pmu.event_active = false;
 
 	i915_oa_async_wait_gpu(dev_priv);
@@ -920,8 +998,14 @@  static void i915_oa_event_start(struct perf_event *event, int flags)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
 	unsigned long lock_flags;
+	struct drm_i915_insert_cmd *entry;
 	u32 oastatus1, tail;
 
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return;
+	entry->insert_cmd = i915_oa_insert_cmd;
+
 	if (dev_priv->oa_pmu.metrics_set == I915_OA_METRICS_SET_3D) {
 		config_oa_regs(dev_priv, i915_oa_3d_mux_config_hsw,
 				i915_oa_3d_mux_config_hsw_len);
@@ -976,6 +1060,8 @@  static void i915_oa_event_start(struct perf_event *event, int flags)
 	dev_priv->oa_pmu.event_active = true;
 	update_oacontrol(dev_priv);
 
+	list_add_tail(&entry->list, &dev_priv->profile_cmd);
+
 	/* Reset the head ptr to ensure we don't forward reports relating
 	 * to a previous perf event */
 	oastatus1 = I915_READ(GEN7_OASTATUS1);