
[RFC,2/8] drm/i915: Add mechanism for forwarding the timestamp data through perf

Message ID 1436950306-14147-3-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived

Commit Message

sourab.gupta@intel.com July 15, 2015, 8:51 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch adds the mechanism for forwarding the timestamp data to
userspace using the Gen PMU perf event interface.

The timestamps will be captured in a gem buffer object. The metadata
information (ctx_id right now) pertaining to each snapshot is maintained
in a list, each node of which holds the offset into the gem buffer object
of the snapshot it describes.
In order to track whether the gpu has completed processing the node,
a field pertaining to the corresponding gem request is added. The request
is expected to be referenced whenever the gpu command is submitted.

Each snapshot collected is forwarded as a separate perf sample. The perf
sample will have raw timestamp data followed by metadata information
pertaining to that sample.
While forwarding the samples, we check whether the gem request is completed
and dereference the corresponding request. The need to dereference the
request necessitates a worker here, which will be scheduled when the
hrtimer triggers.
While flushing the samples, we have to wait for the requests already
scheduled, before forwarding the samples. This wait is done in a lockless
fashion.
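
Concretely, each raw sample payload carries, in order (per the uapi structs
added by this patch):

	struct drm_i915_ts_data		/* ts_low, ts_high */
	struct drm_i915_ts_node_ctx_id	/* ctx_id, pad */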

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_oa_perf.c | 118 +++++++++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h         |  10 +++
 3 files changed, 137 insertions(+), 2 deletions(-)

Comments

Chris Wilson July 15, 2015, 9:40 a.m. UTC | #1
On Wed, Jul 15, 2015 at 02:21:40PM +0530, sourab.gupta@intel.com wrote:
> From: Sourab Gupta <sourab.gupta@intel.com>
> 
> This patch adds the mechanism for forwarding the timestamp data to
> userspace using the Gen PMU perf event interface.
> 
> The timestamps will be captured in a gem buffer object. The metadata
> information (ctx_id right now) pertaining to each snapshot is maintained
> in a list, each node of which holds the offset into the gem buffer object
> of the snapshot it describes.

What is the definition of ctx_id? The only persistent one is the
user_handle which is only valid within the file_priv namespace. There is
no guid for ctx at the moment.

> In order to track whether the gpu has completed processing the node,
> a field pertaining to the corresponding gem request is added. The request
> is expected to be referenced whenever the gpu command is submitted.
> 
> Each snapshot collected is forwarded as a separate perf sample. The perf
> sample will have raw timestamp data followed by metadata information
> pertaining to that sample.
> While forwarding the samples, we check whether the gem request is completed
> and dereference the corresponding request. The need to dereference the
> request necessitates a worker here, which will be scheduled when the
> hrtimer triggers.
> While flushing the samples, we have to wait for the requests already
> scheduled, before forwarding the samples. This wait is done in a lockless
> fashion.
> 
> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
>  drivers/gpu/drm/i915/i915_oa_perf.c | 118 +++++++++++++++++++++++++++++++++++-
>  include/uapi/drm/i915_drm.h         |  10 +++
>  3 files changed, 137 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7e8e77b..6984150 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1675,6 +1675,13 @@ struct i915_oa_rcs_node {
>  	u32 tag;
>  };
>  
> +struct i915_gen_pmu_node {
> +	struct list_head head;
> +	struct drm_i915_gem_request *req;
> +	u32 offset;
> +	u32 ctx_id;
> +};
> +
>  extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
>  extern const int i915_oa_3d_mux_config_hsw_len;
>  extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
> @@ -1996,7 +2003,11 @@ struct drm_i915_private {
>  		struct {
>  			struct drm_i915_gem_object *obj;
>  			u8 *addr;
> +			u32 node_size;
> +			u32 node_count;
>  		} buffer;
> +		struct list_head node_list;
> +		struct work_struct work_timer;
>  	} gen_pmu;
>  
>  	void (*insert_profile_cmd[I915_PROFILE_MAX])
> diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> index ab965b4..350b560 100644
> --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> @@ -408,11 +408,108 @@ void forward_oa_rcs_snapshots_work(struct work_struct *__work)
>  	}
>  }
>  
> +int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)

Why so many exports from this file? And why are half of them privately
named?

> +{
> +	struct i915_gen_pmu_node *last_entry;
> +	unsigned long lock_flags;
> +	int ret;
> +
> +	/*
> +	 * Wait for the last scheduled request to complete. This would
> +	 * implicitly wait for the prior submitted requests. The refcount
> +	 * of the requests is not decremented here.
> +	 */
> +	spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);

Urm, are we really going to be called in irq context? I really hope you
are not planning on hooking in the irq handlers...

Afaict, you are just using the list from a timer context, so
spin_lock_bh() would be sufficient. Apparently it isn't a timer (thanks
for the misleading name!) so just spin_lock().

> +	if (list_empty(&dev_priv->gen_pmu.node_list)) {
> +		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> +		return 0;
> +	}
> +	last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
> +			struct i915_gen_pmu_node, head);
> +	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);

You could write this a little neater with just one path through the
critical section.

> +	if (last_entry && last_entry->req) {

last_entry cannot be NULL.

When is req NULL? Does the last_entry->req being NULL guarantee that
all previous req are NULL?

> +		ret = __i915_wait_request(last_entry->req, atomic_read(
> +				&dev_priv->gpu_error.reset_counter),
> +				dev_priv->mm.interruptible, NULL, NULL);

Invalid use of dev_priv->mm.interruptible (just pass true).

> +		if (ret) {
> +			DRM_ERROR("failed to wait\n");
> +			return ret;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
> +				struct i915_gen_pmu_node *node)
> +{
> +	struct perf_sample_data data;
> +	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
> +	int ts_size, snapshot_size;
> +	u8 *snapshot;
> +	struct drm_i915_ts_node_ctx_id *ctx_info;
> +	struct perf_raw_record raw;
> +
> +	ts_size = sizeof(struct drm_i915_ts_data);
> +	snapshot_size = ts_size + sizeof(*ctx_info);

If you kept these as compile time constants it will make the rest a bit
easier to follow.
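
For instance (names purely illustrative):

#define I915_GEN_PMU_TS_SIZE		sizeof(struct drm_i915_ts_data)
#define I915_GEN_PMU_CTX_INFO_SIZE	sizeof(struct drm_i915_ts_node_ctx_id)
#define I915_GEN_PMU_SNAPSHOT_SIZE	(I915_GEN_PMU_TS_SIZE + \
					 I915_GEN_PMU_CTX_INFO_SIZE)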

> +	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
> +
> +	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
> +	ctx_info->ctx_id = node->ctx_id;
> +
> +	perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> +	/* Note: the combined u32 raw->size member + raw data itself must be 8
> +	 * byte aligned. (See note in init_gen_pmu_buffer for more details) */

You mean this comment?
/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
That's not particularly enlightening.

Missing BUILD_BUG tests to assert that your structure sizes are what you
claim they should be. To illustrate this comment, you could do

BUILD_BUG_ON(sizeof(struct drm_i915_ts_data) != 8);
BUILD_BUG_ON(sizeof(struct drm_i915_ts_node_ctx_id) != 8);
BUILD_BUG_ON((snapshot_size + 4 + sizeof(raw.size) % 8) == 0);

Otherwise the comment doesn't really make it clear exactly what has to
be aligned to 8 or *why*.

> +	raw.size = snapshot_size + 4;
> +	raw.data = snapshot;

> +	data.raw = &raw;
> +
> +	perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
> +}
> +
> +void forward_gen_pmu_snapshots_work(struct work_struct *__work)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(__work, typeof(*dev_priv), gen_pmu.work_timer);
> +	struct i915_gen_pmu_node *entry, *next;
> +	struct drm_i915_gem_request *req;
> +	unsigned long lock_flags;
> +	int ret;
> +
> +	list_for_each_entry_safe
> +		(entry, next, &dev_priv->gen_pmu.node_list, head) {
> +		req = entry->req;
> +		if (req && i915_gem_request_completed(req, true)) {

Negate the test and reduce indentation for ease of reading.

> +			forward_one_gen_pmu_sample(dev_priv, entry);
> +			ret = i915_mutex_lock_interruptible(dev_priv->dev);
> +			if (ret)
> +				break;
> +			i915_gem_request_assign(&entry->req, NULL);
> +			mutex_unlock(&dev_priv->dev->struct_mutex);

This is just i915_gem_request_unreference_unlocked().

> +		} else
> +			break;
> +
> +		/*
> +		 * Do we instead need to protect whole loop? If so, we would
> +		 * need to *list_move_tail* to a deferred list, from where
> +		 * i915 device mutex could be taken to deference the requests,
> +		 * and free the node.
> +		 */
> +		spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
> +		list_del(&entry->head);
> +		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> +		kfree(entry);
> +	}
> +}
> +
>  static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
>  {
>  	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
>  
> -	/* TODO: routine for forwarding snapshots to userspace */
> +	schedule_work(&dev_priv->gen_pmu.work_timer);

Why are we scheduling a timer? Might be a bad name for a work item to
infer that it is a timer.

Why is this in a work queue if you already blocked during the flush?

>  }
>  
>  static void
> @@ -761,7 +858,7 @@ static int init_gen_pmu_buffer(struct perf_event *event)
>  	struct drm_i915_private *dev_priv =
>  		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
>  	struct drm_i915_gem_object *bo;
> -	int ret;
> +	int ret, node_size;
>  
>  	BUG_ON(dev_priv->gen_pmu.buffer.obj);
>  
> @@ -772,6 +869,15 @@ static int init_gen_pmu_buffer(struct perf_event *event)
>  	dev_priv->gen_pmu.buffer.obj = bo;
>  
>  	dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
> +	INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
> +
> +	node_size = sizeof(struct drm_i915_ts_data) +
> +			sizeof(struct drm_i915_ts_node_ctx_id);
> +
> +	/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
> +	node_size = ALIGN(node_size, 8);
> +	dev_priv->gen_pmu.buffer.node_size = node_size;
> +	dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
>  
>  	DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
>  			 dev_priv->gen_pmu.buffer.addr);
> @@ -1397,6 +1503,11 @@ static int i915_gen_event_flush(struct perf_event *event)
>  {
>  	struct drm_i915_private *i915 =
>  		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
> +	int ret;
> +
> +	ret = i915_gen_pmu_wait_gpu(i915);
> +	if (ret)
> +		return ret;
>  
>  	gen_pmu_flush_snapshots(i915);

Wait for idle, then schedule a task???
-Chris
>
sourab.gupta@intel.com July 15, 2015, 11:30 a.m. UTC | #2
On Wed, 2015-07-15 at 09:40 +0000, Chris Wilson wrote:
> On Wed, Jul 15, 2015 at 02:21:40PM +0530, sourab.gupta@intel.com wrote:
> > From: Sourab Gupta <sourab.gupta@intel.com>
> > 
> > This patch adds the mechanism for forwarding the timestamp data to
> > userspace using the Gen PMU perf event interface.
> > 
> > The timestamps will be captured in a gem buffer object. The metadata
> > information (ctx_id right now) pertaining to each snapshot is maintained
> > in a list, each node of which holds the offset into the gem buffer object
> > of the snapshot it describes.
> 
> What is the definition of ctx_id? The only persistent one is the
> user_handle which is only valid within the file_priv namespace. There is
> no guid for ctx at the moment.

Well, this patch set makes an assumption about the availability of a
globally unique ctx_id. The first patch in the series proposes the same:
http://lists.freedesktop.org/archives/intel-gfx/2015-July/071698.html
Not sure whether that would be acceptable, though.

> 
> > In order to track whether the gpu has completed processing the node,
> > a field pertaining to the corresponding gem request is added. The request
> > is expected to be referenced whenever the gpu command is submitted.
> > 
> > Each snapshot collected is forwarded as a separate perf sample. The perf
> > sample will have raw timestamp data followed by metadata information
> > pertaining to that sample.
> > While forwarding the samples, we check whether the gem request is completed
> > and dereference the corresponding request. The need to dereference the
> > request necessitates a worker here, which will be scheduled when the
> > hrtimer triggers.
> > While flushing the samples, we have to wait for the requests already
> > scheduled, before forwarding the samples. This wait is done in a lockless
> > fashion.
> > 
> > Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
> >  drivers/gpu/drm/i915/i915_oa_perf.c | 118 +++++++++++++++++++++++++++++++++++-
> >  include/uapi/drm/i915_drm.h         |  10 +++
> >  3 files changed, 137 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 7e8e77b..6984150 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1675,6 +1675,13 @@ struct i915_oa_rcs_node {
> >  	u32 tag;
> >  };
> >  
> > +struct i915_gen_pmu_node {
> > +	struct list_head head;
> > +	struct drm_i915_gem_request *req;
> > +	u32 offset;
> > +	u32 ctx_id;
> > +};
> > +
> >  extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
> >  extern const int i915_oa_3d_mux_config_hsw_len;
> >  extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
> > @@ -1996,7 +2003,11 @@ struct drm_i915_private {
> >  		struct {
> >  			struct drm_i915_gem_object *obj;
> >  			u8 *addr;
> > +			u32 node_size;
> > +			u32 node_count;
> >  		} buffer;
> > +		struct list_head node_list;
> > +		struct work_struct work_timer;
> >  	} gen_pmu;
> >  
> >  	void (*insert_profile_cmd[I915_PROFILE_MAX])
> > diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> > index ab965b4..350b560 100644
> > --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> > +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> > @@ -408,11 +408,108 @@ void forward_oa_rcs_snapshots_work(struct work_struct *__work)
> >  	}
> >  }
> >  
> > +int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
> 
> Why so many exports from this file? And why are half of them privately
> named?

Acknowledged. The exports are not needed. Will have a static
declaration.

> 
> > +{
> > +	struct i915_gen_pmu_node *last_entry;
> > +	unsigned long lock_flags;
> > +	int ret;
> > +
> > +	/*
> > +	 * Wait for the last scheduled request to complete. This would
> > +	 * implicitly wait for the prior submitted requests. The refcount
> > +	 * of the requests is not decremented here.
> > +	 */
> > +	spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
> 
> Urm, are we really going to be called in irq context? I really hope you
> are not planning on hooking in the irq handlers...
> 
> Afaict, you are just using the list from a timer context, so
> spin_lock_bh() would be sufficient. Apparently it isn't a timer (thanks
> for the misleading name!) so just spin_lock().

No, it won't be called from irq context. I'll just use spin_lock() here.

> 
> > +	if (list_empty(&dev_priv->gen_pmu.node_list)) {
> > +		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> > +		return 0;
> > +	}
> > +	last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
> > +			struct i915_gen_pmu_node, head);
> > +	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> 
> You could write this a little neater with just one path through the
> critical section.

Will rework this.

> 
> > +	if (last_entry && last_entry->req) {
> 
> last_entry cannot be NULL.
> 
> When is req NULL? Does the last_entry->req being NULL guarantee that
> all previous req are NULL?

Sorry, these extraneous checks may have crept in. Agreed that last_entry
can't be NULL, and same for req, since we have refcounted the req. Will
remove these.

> 
> > +		ret = __i915_wait_request(last_entry->req, atomic_read(
> > +				&dev_priv->gpu_error.reset_counter),
> > +				dev_priv->mm.interruptible, NULL, NULL);
> 
> Invalid use of dev_priv->mm.interruptible (just pass true).

Will make this change.

> 
> > +		if (ret) {
> > +			DRM_ERROR("failed to wait\n");
> > +			return ret;
> > +		}
> > +	}
> > +	return 0;
> > +}
> > +
> > +static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
> > +				struct i915_gen_pmu_node *node)
> > +{
> > +	struct perf_sample_data data;
> > +	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
> > +	int ts_size, snapshot_size;
> > +	u8 *snapshot;
> > +	struct drm_i915_ts_node_ctx_id *ctx_info;
> > +	struct perf_raw_record raw;
> > +
> > +	ts_size = sizeof(struct drm_i915_ts_data);
> > +	snapshot_size = ts_size + sizeof(*ctx_info);
> 
> If you kept these as compile time constants it will make the rest a bit
> easier to follow.

Will make this change.

> 
> > +	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
> > +
> > +	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
> > +	ctx_info->ctx_id = node->ctx_id;
> > +
> > +	perf_sample_data_init(&data, 0, event->hw.last_period);
> > +
> > +	/* Note: the combined u32 raw->size member + raw data itself must be 8
> > +	 * byte aligned. (See note in init_gen_pmu_buffer for more details) */
> 
> You mean this comment?
> /* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
> That's not particularly enlightening.
> 
> Missing BUILD_BUG tests to assert that your structure sizes are what you
> claim they should be. To illustrate this comment, you could do
> 
> BUILD_BUG_ON(sizeof(struct drm_i915_ts_data) != 8);
> BUILD_BUG_ON(sizeof(struct drm_i915_ts_node_ctx_id) != 8);
> BUILD_BUG_ON((snapshot_size + 4 + sizeof(raw.size) % 8) == 0);
> 
> Otherwise the comment doesn't really make it clear exactly what has to
> be aligned to 8 or *why*.
> 

Actually, this check is for ensuring that the combined size of the raw data
+ the size of the 'raw->size' member field is a multiple of 8 bytes.
This check can be seen at kernel/events/core.c: perf_prepare_sample()
(the place where PERF_SAMPLE_RAW is processed).
Will reword the comment to make it clearer, in addition to having the
BUILD_BUG_ON macro checks.
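
For example, the reworded note plus checks could look roughly like this
(placement and wording illustrative):

	/*
	 * perf requires that sizeof(raw.size) + raw.size is a multiple of
	 * 8 bytes (see perf_prepare_sample() in kernel/events/core.c).
	 * raw.size is set to snapshot_size + 4 below, so the snapshot
	 * itself has to stay a multiple of 8 bytes, which the two 8-byte
	 * structs guarantee.
	 */
	BUILD_BUG_ON(sizeof(struct drm_i915_ts_data) != 8);
	BUILD_BUG_ON(sizeof(struct drm_i915_ts_node_ctx_id) != 8);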

> > +	raw.size = snapshot_size + 4;
> > +	raw.data = snapshot;
> 
> > +	data.raw = &raw;
> > +
> > +	perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
> > +}
> > +
> > +void forward_gen_pmu_snapshots_work(struct work_struct *__work)
> > +{
> > +	struct drm_i915_private *dev_priv =
> > +		container_of(__work, typeof(*dev_priv), gen_pmu.work_timer);
> > +	struct i915_gen_pmu_node *entry, *next;
> > +	struct drm_i915_gem_request *req;
> > +	unsigned long lock_flags;
> > +	int ret;
> > +
> > +	list_for_each_entry_safe
> > +		(entry, next, &dev_priv->gen_pmu.node_list, head) {
> > +		req = entry->req;
> > +		if (req && i915_gem_request_completed(req, true)) {
> 
> Negate the test and reduce indentation for ease of reading.

Ok, will do.

> 
> > +			forward_one_gen_pmu_sample(dev_priv, entry);
> > +			ret = i915_mutex_lock_interruptible(dev_priv->dev);
> > +			if (ret)
> > +				break;
> > +			i915_gem_request_assign(&entry->req, NULL);
> > +			mutex_unlock(&dev_priv->dev->struct_mutex);
> 
> This is just i915_gem_request_unreference_unlocked().

Right. Missed it. Will use this function.

> 
> > +		} else
> > +			break;
> > +
> > +		/*
> > +		 * Do we instead need to protect whole loop? If so, we would
> > +		 * need to *list_move_tail* to a deferred list, from where
> > +		 * i915 device mutex could be taken to deference the requests,
> > +		 * and free the node.
> > +		 */
> > +		spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
> > +		list_del(&entry->head);
> > +		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> > +		kfree(entry);
> > +	}
> > +}
> > +
> >  static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
> >  {
> >  	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
> >  
> > -	/* TODO: routine for forwarding snapshots to userspace */
> > +	schedule_work(&dev_priv->gen_pmu.work_timer);
> 
> Why are we scheduling a timer? Might be a bad name for a work item to
> infer that it is a timer.
> 

Well, this is a case of bad name and it's a work item really.

> Why is this in a work queue if you already blocked during the flush?
> 

Actually, the gen_pmu_flush_snapshots() fn is called from 3 places:
hrtimer, event_stop and event_flush. And we can block only from event
flush. So this function is not really tied to event flush only.

Also, the job of forwarding samples needs the mutex to be taken (to
dereference the request: i915_gem_request_unreference_unlocked), and
since the forwarding has to be initiated from atomic context (e.g.
hrtimer), I created a wq for the same, with the intention of not having
code duplication for event flush.

Probably it would make better sense to have the forwarding functionality
implemented in a separate function. The work item (which would be
scheduled from hrtimer/event_stop) would call that function. And the
event flush would directly call this forwarding fn, without the
extraneous work item scheduled. Does this seem ok?
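
In other words, roughly (sketch only; names follow the existing patch, with
the work item field renamed from work_timer to work for clarity):

/*
 * Forward completed snapshots to perf. Called from the work item
 * (scheduled by the hrtimer and by event_stop, where we cannot block)
 * and directly from event_flush, after the outstanding requests have
 * been waited upon.
 */
static void forward_gen_pmu_snapshots(struct drm_i915_private *dev_priv)
{
	/* walk gen_pmu.node_list and forward completed samples, as above */
}

static void forward_gen_pmu_snapshots_work(struct work_struct *__work)
{
	struct drm_i915_private *dev_priv =
		container_of(__work, typeof(*dev_priv), gen_pmu.work);

	forward_gen_pmu_snapshots(dev_priv);
}

static int i915_gen_event_flush(struct perf_event *event)
{
	struct drm_i915_private *i915 =
		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
	int ret;

	ret = i915_gen_pmu_wait_gpu(i915);
	if (ret)
		return ret;

	forward_gen_pmu_snapshots(i915);
	return 0;
}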

> >  }
> >  
> >  static void
> > @@ -761,7 +858,7 @@ static int init_gen_pmu_buffer(struct perf_event *event)
> >  	struct drm_i915_private *dev_priv =
> >  		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
> >  	struct drm_i915_gem_object *bo;
> > -	int ret;
> > +	int ret, node_size;
> >  
> >  	BUG_ON(dev_priv->gen_pmu.buffer.obj);
> >  
> > @@ -772,6 +869,15 @@ static int init_gen_pmu_buffer(struct perf_event *event)
> >  	dev_priv->gen_pmu.buffer.obj = bo;
> >  
> >  	dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
> > +	INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
> > +
> > +	node_size = sizeof(struct drm_i915_ts_data) +
> > +			sizeof(struct drm_i915_ts_node_ctx_id);
> > +
> > +	/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
> > +	node_size = ALIGN(node_size, 8);
> > +	dev_priv->gen_pmu.buffer.node_size = node_size;
> > +	dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
> >  
> >  	DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
> >  			 dev_priv->gen_pmu.buffer.addr);
> > @@ -1397,6 +1503,11 @@ static int i915_gen_event_flush(struct perf_event *event)
> >  {
> >  	struct drm_i915_private *i915 =
> >  		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
> > +	int ret;
> > +
> > +	ret = i915_gen_pmu_wait_gpu(i915);
> > +	if (ret)
> > +		return ret;
> >  
> >  	gen_pmu_flush_snapshots(i915);
> 
> Wait for idle, then schedule a task???
> -Chris
> >   
>
Chris Wilson July 15, 2015, 12:02 p.m. UTC | #3
On Wed, Jul 15, 2015 at 11:30:13AM +0000, Gupta, Sourab wrote:
> On Wed, 2015-07-15 at 09:40 +0000, Chris Wilson wrote:
> > >  static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
> > >  {
> > >  	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
> > >  
> > > -	/* TODO: routine for forwarding snapshots to userspace */
> > > +	schedule_work(&dev_priv->gen_pmu.work_timer);
> > 
> > Why are we scheduling a timer? Might be a bad name for a work item to
> > infer that it is a timer.
> > 
> Well, this is a case of bad name and it's a work item really.
> 
> > Why is this in a work queue if you already blocked during the flush?
> > 
> 
> Actually, the gen_pmu_flush_snapshots() fn is called from 3 places:
> hrtimer, event_stop and event_flush. And we can block only from event
> flush. So this function is not really tied to event flush only.
> 
> Also, the job of forwarding samples needs the mutex to be taken (to
> dereference the request: i915_gem_request_unreference_unlocked), and
> since the forwarding has to be initiated from atomic context (e.g.
> hrtimer), I created a wq for the same, with the intention of not having
> code duplication for event flush.
> 
> Probably it would make better sense to have the forwarding functionality
> implemented in a separate function. The work item (which would be
> scheduled from hrtimer/event_stop) would call that function. And the
> event flush would directly call this forwarding fn, without the
> extraneous work item scheduled. Does this seem ok?

Indeed that's an improvement for flush, where the semantics may be such
that after calling it we do expect to be able to process the samples (in
userspace) immediately.

Having a comment at the top of the forward function would also help the
reader to understand the contexts from which we can be called (and why).
It's something we often lack, but is very useful for review.
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7e8e77b..6984150 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1675,6 +1675,13 @@  struct i915_oa_rcs_node {
 	u32 tag;
 };
 
+struct i915_gen_pmu_node {
+	struct list_head head;
+	struct drm_i915_gem_request *req;
+	u32 offset;
+	u32 ctx_id;
+};
+
 extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
 extern const int i915_oa_3d_mux_config_hsw_len;
 extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
@@ -1996,7 +2003,11 @@  struct drm_i915_private {
 		struct {
 			struct drm_i915_gem_object *obj;
 			u8 *addr;
+			u32 node_size;
+			u32 node_count;
 		} buffer;
+		struct list_head node_list;
+		struct work_struct work_timer;
 	} gen_pmu;
 
 	void (*insert_profile_cmd[I915_PROFILE_MAX])
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index ab965b4..350b560 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -408,11 +408,108 @@  void forward_oa_rcs_snapshots_work(struct work_struct *__work)
 	}
 }
 
+int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct i915_gen_pmu_node *last_entry;
+	unsigned long lock_flags;
+	int ret;
+
+	/*
+	 * Wait for the last scheduled request to complete. This would
+	 * implicitly wait for the prior submitted requests. The refcount
+	 * of the requests is not decremented here.
+	 */
+	spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
+
+	if (list_empty(&dev_priv->gen_pmu.node_list)) {
+		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+		return 0;
+	}
+	last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
+			struct i915_gen_pmu_node, head);
+	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+
+	if (last_entry && last_entry->req) {
+		ret = __i915_wait_request(last_entry->req, atomic_read(
+				&dev_priv->gpu_error.reset_counter),
+				dev_priv->mm.interruptible, NULL, NULL);
+		if (ret) {
+			DRM_ERROR("failed to wait\n");
+			return ret;
+		}
+	}
+	return 0;
+}
+
+static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
+				struct i915_gen_pmu_node *node)
+{
+	struct perf_sample_data data;
+	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
+	int ts_size, snapshot_size;
+	u8 *snapshot;
+	struct drm_i915_ts_node_ctx_id *ctx_info;
+	struct perf_raw_record raw;
+
+	ts_size = sizeof(struct drm_i915_ts_data);
+	snapshot_size = ts_size + sizeof(*ctx_info);
+	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
+
+	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
+	ctx_info->ctx_id = node->ctx_id;
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+
+	/* Note: the combined u32 raw->size member + raw data itself must be 8
+	 * byte aligned. (See note in init_gen_pmu_buffer for more details) */
+	raw.size = snapshot_size + 4;
+	raw.data = snapshot;
+
+	data.raw = &raw;
+
+	perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
+}
+
+void forward_gen_pmu_snapshots_work(struct work_struct *__work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(__work, typeof(*dev_priv), gen_pmu.work_timer);
+	struct i915_gen_pmu_node *entry, *next;
+	struct drm_i915_gem_request *req;
+	unsigned long lock_flags;
+	int ret;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->gen_pmu.node_list, head) {
+		req = entry->req;
+		if (req && i915_gem_request_completed(req, true)) {
+			forward_one_gen_pmu_sample(dev_priv, entry);
+			ret = i915_mutex_lock_interruptible(dev_priv->dev);
+			if (ret)
+				break;
+			i915_gem_request_assign(&entry->req, NULL);
+			mutex_unlock(&dev_priv->dev->struct_mutex);
+		} else
+			break;
+
+		/*
+		 * Do we instead need to protect whole loop? If so, we would
+		 * need to *list_move_tail* to a deferred list, from where
+		 * i915 device mutex could be taken to deference the requests,
+		 * and free the node.
+		 */
+		spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
+		list_del(&entry->head);
+		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+		kfree(entry);
+	}
+}
+
 static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
 {
 	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
 
-	/* TODO: routine for forwarding snapshots to userspace */
+	schedule_work(&dev_priv->gen_pmu.work_timer);
 }
 
 static void
@@ -761,7 +858,7 @@  static int init_gen_pmu_buffer(struct perf_event *event)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
+	int ret, node_size;
 
 	BUG_ON(dev_priv->gen_pmu.buffer.obj);
 
@@ -772,6 +869,15 @@  static int init_gen_pmu_buffer(struct perf_event *event)
 	dev_priv->gen_pmu.buffer.obj = bo;
 
 	dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
+
+	node_size = sizeof(struct drm_i915_ts_data) +
+			sizeof(struct drm_i915_ts_node_ctx_id);
+
+	/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
+	node_size = ALIGN(node_size, 8);
+	dev_priv->gen_pmu.buffer.node_size = node_size;
+	dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
 
 	DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
 			 dev_priv->gen_pmu.buffer.addr);
@@ -1397,6 +1503,11 @@  static int i915_gen_event_flush(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
+	int ret;
+
+	ret = i915_gen_pmu_wait_gpu(i915);
+	if (ret)
+		return ret;
 
 	gen_pmu_flush_snapshots(i915);
 	return 0;
@@ -1548,6 +1659,7 @@  void i915_gen_pmu_register(struct drm_device *dev)
 	hrtimer_init(&i915->gen_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->gen_pmu.timer.function = hrtimer_sample_gen;
 
+	INIT_WORK(&i915->gen_pmu.work_timer, forward_gen_pmu_snapshots_work);
 	spin_lock_init(&i915->gen_pmu.lock);
 
 	i915->gen_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
@@ -1577,6 +1689,8 @@  void i915_gen_pmu_unregister(struct drm_device *dev)
 	if (i915->gen_pmu.pmu.event_init == NULL)
 		return;
 
+	cancel_work_sync(&i915->gen_pmu.work_timer);
+
 	perf_pmu_unregister(&i915->gen_pmu.pmu);
 	i915->gen_pmu.pmu.event_init = NULL;
 }
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1084178..9c083a2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -140,6 +140,16 @@  struct drm_i915_oa_node_tag {
 	__u32 pad;
 };
 
+struct drm_i915_ts_data {
+	__u32 ts_low;
+	__u32 ts_high;
+};
+
+struct drm_i915_ts_node_ctx_id {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use