diff mbox

[RFC,8/8] drm/i915: Support for retrieving MMIO register values alongwith timestamps through perf

Message ID 1436950306-14147-9-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

sourab.gupta@intel.com July 15, 2015, 8:51 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch adds support for retrieving MMIO register values alongwith
timestamps and forwarding them to userspace through perf.
The userspace can request upto 8 MMIO register values to be dumped.
The addresses of upto 8 MMIO registers can be passed through perf attr
config. The registers are checked against a whitelist before passing them
on. The commands to dump the values of these MMIO registers are then
inserted into the ring alongwith commands to dump the timestamps.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  2 +
 drivers/gpu/drm/i915/i915_oa_perf.c | 87 ++++++++++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h         | 10 ++++-
 3 files changed, 92 insertions(+), 7 deletions(-)

Comments

Chris Wilson July 15, 2015, 12:51 p.m. UTC | #1
On Wed, Jul 15, 2015 at 02:21:46PM +0530, sourab.gupta@intel.com wrote:
> From: Sourab Gupta <sourab.gupta@intel.com>
> 
> This patch adds support for retrieving MMIO register values alongwith
> timestamps and forwarding them to userspace through perf.
> The userspace can request upto 8 MMIO register values to be dumped.
> The addresses of upto 8 MMIO registers can be passed through perf attr
> config. The registers are checked against a whitelist before passing them
> on. The commands to dump the values of these MMIO registers are then
> inserted into the ring alongwith commands to dump the timestamps.
> 
> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  2 +
>  drivers/gpu/drm/i915/i915_oa_perf.c | 87 ++++++++++++++++++++++++++++++++++---
>  include/uapi/drm/i915_drm.h         | 10 ++++-
>  3 files changed, 92 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f2fe8d0..e114175 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2017,7 +2017,9 @@ struct drm_i915_private {
>  #define I915_GEN_PMU_SAMPLE_RING		(1<<0)
>  #define I915_GEN_PMU_SAMPLE_PID			(1<<1)
>  #define I915_GEN_PMU_SAMPLE_TAG			(1<<2)
> +#define I915_GEN_PMU_SAMPLE_MMIO		(1<<3)
>  		int sample_info_flags;
> +		u32 mmio_list[8];
>  	} gen_pmu;
>  
>  	void (*insert_profile_cmd[I915_PROFILE_MAX])
> diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> index 1cc16ef..a9d44e0 100644
> --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> @@ -113,8 +113,8 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
>  	struct drm_i915_gem_object *obj = dev_priv->gen_pmu.buffer.obj;
>  	struct i915_gen_pmu_node *entry;
>  	unsigned long lock_flags;
> -	u32 addr = 0;
> -	int ret;
> +	u32 mmio_addr, addr = 0;
> +	int ret, i;
>  
>  	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
>  	if (entry == NULL) {
> @@ -150,6 +150,7 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
>  	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
>  
>  	addr = i915_gem_obj_ggtt_offset(obj) + entry->offset;
> +	mmio_addr = addr + sizeof(struct drm_i915_ts_data);
>  
>  	if (ring->id == RCS) {
>  		ret = intel_ring_begin(ring, 6);
> @@ -177,6 +178,25 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
>  		intel_ring_emit(ring, 0); /* imm high, must be zero */
>  		intel_ring_advance(ring);
>  	}
> +	for (i = 0; i < 8; i++) {
> +		if (0 == dev_priv->gen_pmu.mmio_list[i])
> +			break;
> +
> +		addr = mmio_addr +
> +			i * sizeof(dev_priv->gen_pmu.mmio_list[i]);
> +
> +		ret = intel_ring_begin(ring, 4);
> +		if (ret)
> +			return;
> +
> +		intel_ring_emit(ring,
> +				MI_STORE_REGISTER_MEM(1) |
> +				MI_SRM_LRM_GLOBAL_GTT);
> +		intel_ring_emit(ring, dev_priv->gen_pmu.mmio_list[i]);
> +		intel_ring_emit(ring, addr);
> +		intel_ring_emit(ring, MI_NOOP);
> +		intel_ring_advance(ring);

Premature optimisation, but can we store the register array with a single command?

for (i = 0; i < 8; i++) {
	if (0 == dev_priv->gen_pmu.mmio_list[i])
		break;

ret = intel_ring_begin(ring, 2*i + 2);
if (ret)
	return;

intel_ring_emit(MI_STORE_REGISTER_MEM(i));
	for (i = 0; i < 8; i++) {
		if (0 == dev_priv->gen_pmu.mmio_list[i])
			break;
		intel_ring_emit(dev_priv->gen_pmu.mmio_list[i]);
		intel_ring_emit(addr + i*4);
	}
intel_ring_emit(MI_NOOP);

You should also consider ensuring that we have a command barrier between
these and 3d operations. If only we had engine->emit_flush(I915_COMMAND_BARRIER).

> +/* Some embargoed entries missing from whitelist */
> +static const struct register_whitelist {
> +	uint64_t offset;
> +	uint32_t size;
> +	/* supported gens, 0x10 for 4, 0x30 for 4 and 5, etc. */
> +	uint32_t gen_bitmask;
> +} whitelist[] = {
> +	{ GEN6_GT_GFX_RC6, 4, GEN_RANGE(7, 9) },
> +	{ GEN6_GT_GFX_RC6p, 4, GEN_RANGE(7, 9) },
> +};
> +
> +static int check_mmio_whitelist(struct drm_i915_private *dev_priv,
> +				struct drm_i915_gen_pmu_attr *gen_attr)

Just what I came looking for.

> +{
> +	struct register_whitelist const *entry = whitelist;
> +	int i, count;
> +
> +	for (count = 0; count < 8; count++) {

hardcoded value, ARRAY_SIZE(gen_attr->mmio_list)

> +		if (!gen_attr->mmio_list[count])
> +			break;
> +
> +		for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {

Interesting choice of continuation in the ABI. Seems a litte forced.
Move the whitelist into the function, no one else should think of
accessing it (right?), then just use whitelist[i].

> +			if (entry->offset == gen_attr->mmio_list[count] &&
> +			    (1 << INTEL_INFO(dev_priv->dev)->gen &

dev_priv->dev->dev_priv->info, you have to be kidding me.

INTEL_INFO(dev_priv) is the natural form.

> +					entry->gen_bitmask))
> +				break;
> +		}
> +
> +		if (i == ARRAY_SIZE(whitelist))
> +			return -EINVAL;
> +	}
> +	return 0;
> +}
> +
>  static int i915_gen_event_init(struct perf_event *event)
>  {
>  	struct drm_i915_private *dev_priv =
> @@ -1674,6 +1740,17 @@ static int i915_gen_event_init(struct perf_event *event)
>  	if (gen_attr.sample_tag)
>  		dev_priv->gen_pmu.sample_info_flags |= I915_GEN_PMU_SAMPLE_TAG;
>  
> +	if (gen_attr.sample_mmio) {
> +		ret = check_mmio_whitelist(dev_priv, &gen_attr);
> +		if (ret)
> +			return ret;

Global hw state should only be inspectable by root (unless the system
has relaxed perf permissions).

> +		dev_priv->gen_pmu.sample_info_flags |=
> +				I915_GEN_PMU_SAMPLE_MMIO;
> +		memcpy(dev_priv->gen_pmu.mmio_list, gen_attr.mmio_list,
> +				sizeof(dev_priv->gen_pmu.mmio_list));
> +	}
> +
>  	/* To avoid the complexity of having to accurately filter
>  	 * data and marshal to the appropriate client
>  	 * we currently only allow exclusive access */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7ab4972..65bc39d 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -81,7 +81,7 @@
>  
>  #define I915_OA_ATTR_SIZE_VER0		32  /* sizeof first published struct */
>  
> -#define I915_GEN_PMU_ATTR_SIZE_VER0	8  /* sizeof first published struct */
> +#define I915_GEN_PMU_ATTR_SIZE_VER0	40  /* sizeof first published struct */
>  
>  typedef struct _drm_i915_oa_attr {
>  	__u32 size;
> @@ -105,7 +105,9 @@ struct drm_i915_gen_pmu_attr {
>  	__u32 sample_ring:1,
>  		sample_pid:1,
>  		sample_tag:1,
> -		__reserved_1:29;
> +		sample_mmio:1,
> +		__reserved_1:28;
> +	__u32 mmio_list[8];
>  };

seems like having a
BUILD_BUG_ON(sizeof(struct _drm_i915_oa_attr) != I915_GEN_PMU_ATTR_SIZE_VER0) is in order.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f2fe8d0..e114175 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2017,7 +2017,9 @@  struct drm_i915_private {
 #define I915_GEN_PMU_SAMPLE_RING		(1<<0)
 #define I915_GEN_PMU_SAMPLE_PID			(1<<1)
 #define I915_GEN_PMU_SAMPLE_TAG			(1<<2)
+#define I915_GEN_PMU_SAMPLE_MMIO		(1<<3)
 		int sample_info_flags;
+		u32 mmio_list[8];
 	} gen_pmu;
 
 	void (*insert_profile_cmd[I915_PROFILE_MAX])
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index 1cc16ef..a9d44e0 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -113,8 +113,8 @@  void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 	struct drm_i915_gem_object *obj = dev_priv->gen_pmu.buffer.obj;
 	struct i915_gen_pmu_node *entry;
 	unsigned long lock_flags;
-	u32 addr = 0;
-	int ret;
+	u32 mmio_addr, addr = 0;
+	int ret, i;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (entry == NULL) {
@@ -150,6 +150,7 @@  void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
 
 	addr = i915_gem_obj_ggtt_offset(obj) + entry->offset;
+	mmio_addr = addr + sizeof(struct drm_i915_ts_data);
 
 	if (ring->id == RCS) {
 		ret = intel_ring_begin(ring, 6);
@@ -177,6 +178,25 @@  void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 		intel_ring_emit(ring, 0); /* imm high, must be zero */
 		intel_ring_advance(ring);
 	}
+	for (i = 0; i < 8; i++) {
+		if (0 == dev_priv->gen_pmu.mmio_list[i])
+			break;
+
+		addr = mmio_addr +
+			i * sizeof(dev_priv->gen_pmu.mmio_list[i]);
+
+		ret = intel_ring_begin(ring, 4);
+		if (ret)
+			return;
+
+		intel_ring_emit(ring,
+				MI_STORE_REGISTER_MEM(1) |
+				MI_SRM_LRM_GLOBAL_GTT);
+		intel_ring_emit(ring, dev_priv->gen_pmu.mmio_list[i]);
+		intel_ring_emit(ring, addr);
+		intel_ring_emit(ring, MI_NOOP);
+		intel_ring_advance(ring);
+	}
 
 	obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
 	i915_vma_move_to_active(i915_gem_obj_to_ggtt(obj), ring);
@@ -556,7 +576,7 @@  static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
 {
 	struct perf_sample_data data;
 	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
-	int ts_size, snapshot_size;
+	int ts_size, mmio_size, snapshot_size;
 	u8 *snapshot, *current_ptr;
 	struct drm_i915_ts_node_ctx_id *ctx_info;
 	struct drm_i915_ts_node_ring_id *ring_info;
@@ -565,10 +585,17 @@  static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
 	struct perf_raw_record raw;
 
 	ts_size = sizeof(struct drm_i915_ts_data);
-	snapshot_size = ts_size + sizeof(*ctx_info);
+
+	if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_MMIO)
+		mmio_size = sizeof(struct drm_i915_mmio_data);
+	else
+		mmio_size = 0;
+
 	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
+	snapshot_size = ts_size + mmio_size + sizeof(*ctx_info);
 
-	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
+	ctx_info = (struct drm_i915_ts_node_ctx_id *)
+			(snapshot + mmio_size + ts_size);
 	ctx_info->ctx_id = node->ctx_id;
 	current_ptr = snapshot + snapshot_size;
 
@@ -1053,6 +1080,9 @@  static int init_gen_pmu_buffer(struct perf_event *event)
 	if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_TAG)
 		node_size += sizeof(struct drm_i915_ts_node_tag);
 
+	if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_MMIO)
+		node_size += sizeof(struct drm_i915_mmio_data);
+
 	/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
 	node_size = ALIGN(node_size, 8);
 	dev_priv->gen_pmu.buffer.node_size = node_size;
@@ -1648,6 +1678,42 @@  err_size:
 	goto out;
 }
 
+#define GEN_RANGE(l, h) GENMASK(h, l)
+
+/* Some embargoed entries missing from whitelist */
+static const struct register_whitelist {
+	uint64_t offset;
+	uint32_t size;
+	/* supported gens, 0x10 for 4, 0x30 for 4 and 5, etc. */
+	uint32_t gen_bitmask;
+} whitelist[] = {
+	{ GEN6_GT_GFX_RC6, 4, GEN_RANGE(7, 9) },
+	{ GEN6_GT_GFX_RC6p, 4, GEN_RANGE(7, 9) },
+};
+
+static int check_mmio_whitelist(struct drm_i915_private *dev_priv,
+				struct drm_i915_gen_pmu_attr *gen_attr)
+{
+	struct register_whitelist const *entry = whitelist;
+	int i, count;
+
+	for (count = 0; count < 8; count++) {
+		if (!gen_attr->mmio_list[count])
+			break;
+
+		for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {
+			if (entry->offset == gen_attr->mmio_list[count] &&
+			    (1 << INTEL_INFO(dev_priv->dev)->gen &
+					entry->gen_bitmask))
+				break;
+		}
+
+		if (i == ARRAY_SIZE(whitelist))
+			return -EINVAL;
+	}
+	return 0;
+}
+
 static int i915_gen_event_init(struct perf_event *event)
 {
 	struct drm_i915_private *dev_priv =
@@ -1674,6 +1740,17 @@  static int i915_gen_event_init(struct perf_event *event)
 	if (gen_attr.sample_tag)
 		dev_priv->gen_pmu.sample_info_flags |= I915_GEN_PMU_SAMPLE_TAG;
 
+	if (gen_attr.sample_mmio) {
+		ret = check_mmio_whitelist(dev_priv, &gen_attr);
+		if (ret)
+			return ret;
+
+		dev_priv->gen_pmu.sample_info_flags |=
+				I915_GEN_PMU_SAMPLE_MMIO;
+		memcpy(dev_priv->gen_pmu.mmio_list, gen_attr.mmio_list,
+				sizeof(dev_priv->gen_pmu.mmio_list));
+	}
+
 	/* To avoid the complexity of having to accurately filter
 	 * data and marshal to the appropriate client
 	 * we currently only allow exclusive access */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7ab4972..65bc39d 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -81,7 +81,7 @@ 
 
 #define I915_OA_ATTR_SIZE_VER0		32  /* sizeof first published struct */
 
-#define I915_GEN_PMU_ATTR_SIZE_VER0	8  /* sizeof first published struct */
+#define I915_GEN_PMU_ATTR_SIZE_VER0	40  /* sizeof first published struct */
 
 typedef struct _drm_i915_oa_attr {
 	__u32 size;
@@ -105,7 +105,9 @@  struct drm_i915_gen_pmu_attr {
 	__u32 sample_ring:1,
 		sample_pid:1,
 		sample_tag:1,
-		__reserved_1:29;
+		sample_mmio:1,
+		__reserved_1:28;
+	__u32 mmio_list[8];
 };
 
 /* Header for PERF_RECORD_DEVICE type events */
@@ -155,6 +157,10 @@  struct drm_i915_ts_data {
 	__u32 ts_high;
 };
 
+struct drm_i915_mmio_data {
+	__u32 mmio[8];
+};
+
 struct drm_i915_ts_node_ctx_id {
 	__u32 ctx_id;
 	__u32 pad;