diff mbox

[3/8] drm/i915: Framework for capturing command stream based OA reports

Message ID 1489644855-25562-4-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

sourab.gupta@intel.com March 16, 2017, 6:14 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch introduces a framework to capture OA counter reports associated
with Render command stream. We can then associate the reports captured
through this mechanism with their corresponding context id's. This can be
further extended to associate any other metadata information with the
corresponding samples (since the association with Render command stream
gives us the ability to capture these information while inserting the
corresponding capture commands into the command stream).

The OA reports generated in this way are associated with a corresponding
workload, and thus can be used to delimit the workload (i.e. sample the
counters at the workload boundaries), within an ongoing stream of periodic
counter snapshots.

There may be usecases wherein we need more than periodic OA capture mode
which is supported currently. This mode is primarily used for two usecases:
    - Ability to capture system wide metrics, along with the ability to map
      the reports back to individual contexts (particularly for HSW).
    - Ability to inject tags for work, into the reports. This provides
      visibility into the multiple stages of work within single context.

The userspace will be able to distinguish between the periodic and CS based
OA reports by the virtue of source_info sample field.

The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA
counters, and is inserted at BB boundaries.
The data thus captured will be stored in a separate buffer, which will
be different from the buffer used otherwise for periodic OA capture mode.
The metadata information pertaining to snapshot is maintained in a list,
which also has offsets into the gem buffer object per captured snapshot.
In order to track whether the gpu has completed processing the node,
a field pertaining to corresponding gem request is added, which is tracked
for completion of the command.

Both periodic and RCS based reports are associated with a single stream
(corresponding to render engine), and it is expected to have the samples
in the sequential order according to their timestamps. Now, since these
reports are collected in separate buffers, these are merge sorted at the
time of forwarding to userspace during the read call.

v2: Aligning with the non-perf interface (custom drm ioctl based). Also,
few related patches are squashed together for better readability

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
Signed-off-by: Robert Bragg <robert@sixbynine.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |  88 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   4 +
 drivers/gpu/drm/i915/i915_perf.c           | 888 ++++++++++++++++++++++++-----
 include/uapi/drm/i915_drm.h                |  15 +
 4 files changed, 861 insertions(+), 134 deletions(-)

Comments

Chris Wilson March 16, 2017, 8:10 a.m. UTC | #1
On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
>  			      struct i915_gem_context *ctx,
>  			      uint32_t *reg_state);
> +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
>  
>  /* i915_gem_evict.c */
>  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index aa75ea2..7af32c97 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
>  	if (exec_len == 0)
>  		exec_len = params->batch->size - params->args_batch_start_offset;
>  
> +	i915_perf_command_stream_hook(params->request);

Could you have named it anything more cryptic and inconsistent?

Quite clearly this can fail, so justify the non handling of errors.

DRM_ERROR is not error handling, it is an indication that this patch
isn't ready.

> +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> +{
> +	struct intel_engine_cs *engine = request->engine;
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	struct i915_perf_stream *stream;
> +
> +	if (!dev_priv->perf.initialized)
> +		return;
> +
> +	mutex_lock(&dev_priv->perf.streams_lock);

Justify a new global lock very, very carefully on execbuf.

> +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> +					stream->cs_mode)
> +			stream->ops->command_stream_hook(stream, request);
> +	}
> +	mutex_unlock(&dev_priv->perf.streams_lock);
> +}

> +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> +					struct drm_i915_gem_request *request)
> +{
> +	struct drm_i915_private *dev_priv = request->i915;
> +	struct i915_gem_context *ctx = request->ctx;
> +	struct i915_perf_cs_sample *sample;
> +	u32 addr = 0;
> +	u32 cmd, *cs;
> +
> +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> +	if (sample == NULL) {
> +		DRM_ERROR("Perf sample alloc failed\n");
> +		return;
> +	}
> +
> +	cs = intel_ring_begin(request, 4);
> +	if (IS_ERR(cs)) {
> +		kfree(sample);
> +		return;
> +	}
> +
> +	sample->ctx_id = ctx->hw_id;
> +	i915_gem_request_assign(&sample->request, request);

> +
> +	i915_gem_active_set(&stream->last_request, request);

Hint, you don't need your own request trap.
-Chris
Chris Wilson March 16, 2017, 8:31 a.m. UTC | #2
On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> +					struct drm_i915_gem_request *request)
> +{
> +	struct drm_i915_private *dev_priv = request->i915;
> +	struct i915_gem_context *ctx = request->ctx;
> +	struct i915_perf_cs_sample *sample;
> +	u32 addr = 0;
> +	u32 cmd, *cs;
> +
> +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> +	if (sample == NULL) {
> +		DRM_ERROR("Perf sample alloc failed\n");
> +		return;
> +	}
> +
> +	cs = intel_ring_begin(request, 4);
> +	if (IS_ERR(cs)) {
> +		kfree(sample);
> +		return;
> +	}
> +
> +	sample->ctx_id = ctx->hw_id;
> +	i915_gem_request_assign(&sample->request, request);
> +
> +	insert_perf_sample(dev_priv, sample);
> +
> +	addr = dev_priv->perf.command_stream_buf.vma->node.start +
> +		sample->offset;
> +
> +	if (WARN_ON(addr & 0x3f)) {
> +		DRM_ERROR("OA buffer address not aligned to 64 byte");
> +		return;
> +	}
> +
> +	cmd = MI_REPORT_PERF_COUNT | (1<<0);
> +	if (INTEL_GEN(dev_priv) >= 8)
> +		cmd |= (2<<0);
> +
> +	*cs++ = cmd;
> +	*cs++ = addr | MI_REPORT_PERF_COUNT_GGTT;
> +	*cs++ = request->global_seqno;

request->global_seqno is always zero at this point, and is subject to
change after assignment.

[ctx:engine or request->fence.context]:request->fence.seqno is permanent.
-Chris
sourab.gupta@intel.com March 16, 2017, 8:54 a.m. UTC | #3
On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
> >  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> >  			      struct i915_gem_context *ctx,
> >  			      uint32_t *reg_state);
> > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
> >  
> >  /* i915_gem_evict.c */
> >  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index aa75ea2..7af32c97 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
> >  	if (exec_len == 0)
> >  		exec_len = params->batch->size - params->args_batch_start_offset;
> >  
> > +	i915_perf_command_stream_hook(params->request);
> 
> Could you have named it anything more cyptic and inconsistent?

Sorry. Would 'i915_perf_capture_metrics' work?
Can you please suggest an appropriate name otherwise.
> 
> Quite clearly this can fail, so justify the non handling of errors.
> 
> DRM_ERROR is not error handling, it is an indication that this patch
> isn't ready.

Well, the intent was to have minimal effect to execbuf normal routine,
even if we fail. But, I guess I'm mistaken.
I'll rectify this to handle the case, if perf callback fails.
	
> 
> > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > +{
> > +	struct intel_engine_cs *engine = request->engine;
> > +	struct drm_i915_private *dev_priv = engine->i915;
> > +	struct i915_perf_stream *stream;
> > +
> > +	if (!dev_priv->perf.initialized)
> > +		return;
> > +
> > +	mutex_lock(&dev_priv->perf.streams_lock);
> 
> Justify a new global lock very, very carefully on execbuf.

The lock introduced here is to protect the the perf.streams list against
addition/deletion while we're processing the list during execbuf call.
The other places where the mutex is taken is when a new stream is being
created (using perf_open ioctl) or a stream is being destroyed
(perf_release ioctl), which just protect the list_add and list_del into
the perf.streams list.
As such, there should not be much impact on execbuf path.
Does this seem reasonable?

> 
> > +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> > +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> > +					stream->cs_mode)
> > +			stream->ops->command_stream_hook(stream, request);
> > +	}
> > +	mutex_unlock(&dev_priv->perf.streams_lock);
> > +}
> 
> > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > +					struct drm_i915_gem_request *request)
> > +{
> > +	struct drm_i915_private *dev_priv = request->i915;
> > +	struct i915_gem_context *ctx = request->ctx;
> > +	struct i915_perf_cs_sample *sample;
> > +	u32 addr = 0;
> > +	u32 cmd, *cs;
> > +
> > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > +	if (sample == NULL) {
> > +		DRM_ERROR("Perf sample alloc failed\n");
> > +		return;
> > +	}
> > +
> > +	cs = intel_ring_begin(request, 4);
> > +	if (IS_ERR(cs)) {
> > +		kfree(sample);
> > +		return;
> > +	}
> > +
> > +	sample->ctx_id = ctx->hw_id;
> > +	i915_gem_request_assign(&sample->request, request);
> 
> > +
> > +	i915_gem_active_set(&stream->last_request, request);
> 
> Hint, you don't need you own request trap.
Sorry, I didn't understand you fully here. I'm storing the reference to
the last active request associated with the stream inside the
last_request 'gem_active' field. Do I need to do something differently
here?

> -Chris
>
sourab.gupta@intel.com March 16, 2017, 8:57 a.m. UTC | #4
On Thu, 2017-03-16 at 01:31 -0700, Chris Wilson wrote:
> On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > +					struct drm_i915_gem_request *request)
> > +{
> > +	struct drm_i915_private *dev_priv = request->i915;
> > +	struct i915_gem_context *ctx = request->ctx;
> > +	struct i915_perf_cs_sample *sample;
> > +	u32 addr = 0;
> > +	u32 cmd, *cs;
> > +
> > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > +	if (sample == NULL) {
> > +		DRM_ERROR("Perf sample alloc failed\n");
> > +		return;
> > +	}
> > +
> > +	cs = intel_ring_begin(request, 4);
> > +	if (IS_ERR(cs)) {
> > +		kfree(sample);
> > +		return;
> > +	}
> > +
> > +	sample->ctx_id = ctx->hw_id;
> > +	i915_gem_request_assign(&sample->request, request);
> > +
> > +	insert_perf_sample(dev_priv, sample);
> > +
> > +	addr = dev_priv->perf.command_stream_buf.vma->node.start +
> > +		sample->offset;
> > +
> > +	if (WARN_ON(addr & 0x3f)) {
> > +		DRM_ERROR("OA buffer address not aligned to 64 byte");
> > +		return;
> > +	}
> > +
> > +	cmd = MI_REPORT_PERF_COUNT | (1<<0);
> > +	if (INTEL_GEN(dev_priv) >= 8)
> > +		cmd |= (2<<0);
> > +
> > +	*cs++ = cmd;
> > +	*cs++ = addr | MI_REPORT_PERF_COUNT_GGTT;
> > +	*cs++ = request->global_seqno;
> 
> request->global_seqno is always zero at this point, and is subject to
> change after assignment.
> 
> [ctx:engine or request->fence.context]:request->fence.seqno is permanent.
> -Chris
> 
Thanks for pointing out. I'll use request->fence.seqno instead.

Regards,
Sourab
Chris Wilson March 16, 2017, 9:03 a.m. UTC | #5
On Thu, Mar 16, 2017 at 02:24:55PM +0530, sourab gupta wrote:
> On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> > On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > > @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
> > >  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> > >  			      struct i915_gem_context *ctx,
> > >  			      uint32_t *reg_state);
> > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
> > >  
> > >  /* i915_gem_evict.c */
> > >  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > index aa75ea2..7af32c97 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
> > >  	if (exec_len == 0)
> > >  		exec_len = params->batch->size - params->args_batch_start_offset;
> > >  
> > > +	i915_perf_command_stream_hook(params->request);
> > 
> > Could you have named it anything more cyptic and inconsistent?
> 
> Sorry. Would 'i915_perf_capture_metrics' work?
> Can you please suggest an appropriate name otherwise.

The verb we use for writing into the command stream is emit. So
i915_perf_emit_samples() (emit_record record is clumsy as it is not clear
whether it is the verb or noun).

> > 
> > Quite clearly this can fail, so justify the non handling of errors.
> > 
> > DRM_ERROR is not error handling, it is an indication that this patch
> > isn't ready.
> 
> Well, the intent was to have minimal effect to execbuf normal routine,
> even if we fail. But, I guess I'm mistaken.
> I'll rectify this to handle the case, if perf callback fails.

That all depends on whether or not you can handle the unbalanced
metrics. If simply missing a report is fine, then just kill the
DRM_ERROR.

The bigger question is whether the following emit can fail -- once
the batch is in the request, no more failures are tolerable. You have to
preallocate reserved space.

Don't you need a flush before the emit following the batch?

> > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > > +{
> > > +	struct intel_engine_cs *engine = request->engine;
> > > +	struct drm_i915_private *dev_priv = engine->i915;
> > > +	struct i915_perf_stream *stream;
> > > +
> > > +	if (!dev_priv->perf.initialized)
> > > +		return;
> > > +
> > > +	mutex_lock(&dev_priv->perf.streams_lock);
> > 
> > Justify a new global lock very, very carefully on execbuf.
> 
> The lock introduced here is to protect the the perf.streams list against
> addition/deletion while we're processing the list during execbuf call.
> The other places where the mutex is taken is when a new stream is being
> created (using perf_open ioctl) or a stream is being destroyed
> (perf_release ioctl), which just protect the list_add and list_del into
> the perf.streams list.
> As such, there should not be much impact on execbuf path.
> Does this seem reasonable?

It doesn't sound like it needs a mutex, which will simplify the other
users as well (if switched to, for example, an RCU protected list).

> > > +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> > > +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> > > +					stream->cs_mode)
> > > +			stream->ops->command_stream_hook(stream, request);
> > > +	}
> > > +	mutex_unlock(&dev_priv->perf.streams_lock);
> > > +}
> > 
> > > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > > +					struct drm_i915_gem_request *request)
> > > +{
> > > +	struct drm_i915_private *dev_priv = request->i915;
> > > +	struct i915_gem_context *ctx = request->ctx;
> > > +	struct i915_perf_cs_sample *sample;
> > > +	u32 addr = 0;
> > > +	u32 cmd, *cs;
> > > +
> > > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > > +	if (sample == NULL) {
> > > +		DRM_ERROR("Perf sample alloc failed\n");
> > > +		return;
> > > +	}
> > > +
> > > +	cs = intel_ring_begin(request, 4);
> > > +	if (IS_ERR(cs)) {
> > > +		kfree(sample);
> > > +		return;
> > > +	}
> > > +
> > > +	sample->ctx_id = ctx->hw_id;
> > > +	i915_gem_request_assign(&sample->request, request);
> > 
> > > +
> > > +	i915_gem_active_set(&stream->last_request, request);
> > 
> > Hint, you don't need you own request trap.
> Sorry, I didn't understand you fully here. I'm storing the reference to
> the last active request associated with the stream inside the
> last_request 'gem_active' field. Do I need to do something differently
> here?

It's the duplication.
-Chris
sourab.gupta@intel.com March 16, 2017, 9:52 a.m. UTC | #6
On Thu, 2017-03-16 at 02:03 -0700, Chris Wilson wrote:
> On Thu, Mar 16, 2017 at 02:24:55PM +0530, sourab gupta wrote:
> > On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> > > On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > > > @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
> > > >  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> > > >  			      struct i915_gem_context *ctx,
> > > >  			      uint32_t *reg_state);
> > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
> > > >  
> > > >  /* i915_gem_evict.c */
> > > >  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > index aa75ea2..7af32c97 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
> > > >  	if (exec_len == 0)
> > > >  		exec_len = params->batch->size - params->args_batch_start_offset;
> > > >  
> > > > +	i915_perf_command_stream_hook(params->request);
> > > 
> > > Could you have named it anything more cyptic and inconsistent?
> > 
> > Sorry. Would 'i915_perf_capture_metrics' work?
> > Can you please suggest an appropriate name otherwise.
> 
> The verb we use for writting into the command stream is emit. So
> i915_perf_emit_samples() (emit_record record is clumsy as it is not clear
> whether it is the verb or noun).
> 
Thanks for suggesting. I'll use 'i915_perf_emit_samples' here.

> > > 
> > > Quite clearly this can fail, so justify the non handling of errors.
> > > 
> > > DRM_ERROR is not error handling, it is an indication that this patch
> > > isn't ready.
> > 
> > Well, the intent was to have minimal effect to execbuf normal routine,
> > even if we fail. But, I guess I'm mistaken.
> > I'll rectify this to handle the case, if perf callback fails.
> 
> That all depends on whether or not you can handle the unbalanced
> metrics. If simply missing a report is fine, then just kill the
> DRM_ERROR.
> 
> The bigger question is whether the following emit can to fail -- once
> the batch is in the request, no more failures are tolerable. You have to
> preallocate reserved space.
> 
> Don't you need a flush before the emit following the batch?
> 

Ok. So, that would mean that we have to first of all reserve the space
required by two 'perf_emit_samples' calls, so that we can't fail for the
lack of space in the emit following the batch.
Probably, I could pass an additional boolean parameter 'reserve_space'
set to true in the first call, which would reserve the space for both
emit_samples() calls (through intel_ring_begin)?


Would it still need the flush before the emit following the batch?
Ideally, the metrics should be collected as close to batch as possible.
So, if there are cache flush/tlb invalidate commands, it would introduce
some lag between the batch and following emit_samples command.
Sorry, I'm not able to gauge the need for flush here. I can understand
it's needed before the batch is programmed for flushing the cache/TLB
entries for the new workload to be submitted. But for the Sample_emit
commands, which generally only capture OA/timestamp/mmio metrics, is
this still required? 


> > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > > > +{
> > > > +	struct intel_engine_cs *engine = request->engine;
> > > > +	struct drm_i915_private *dev_priv = engine->i915;
> > > > +	struct i915_perf_stream *stream;
> > > > +
> > > > +	if (!dev_priv->perf.initialized)
> > > > +		return;
> > > > +
> > > > +	mutex_lock(&dev_priv->perf.streams_lock);
> > > 
> > > Justify a new global lock very, very carefully on execbuf.
> > 
> > The lock introduced here is to protect the the perf.streams list against
> > addition/deletion while we're processing the list during execbuf call.
> > The other places where the mutex is taken is when a new stream is being
> > created (using perf_open ioctl) or a stream is being destroyed
> > (perf_release ioctl), which just protect the list_add and list_del into
> > the perf.streams list.
> > As such, there should not be much impact on execbuf path.
> > Does this seem reasonable?
> 
> It doesn't sound like it needs a mutex, which will simplify the other
> users as well (if switched to, for example, an RCU protected list).

Ok. Sounds reasonable, I'll switch to using an RCU protected list here.

> 
> > > > +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> > > > +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> > > > +					stream->cs_mode)
> > > > +			stream->ops->command_stream_hook(stream, request);
> > > > +	}
> > > > +	mutex_unlock(&dev_priv->perf.streams_lock);
> > > > +}
> > > 
> > > > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > > > +					struct drm_i915_gem_request *request)
> > > > +{
> > > > +	struct drm_i915_private *dev_priv = request->i915;
> > > > +	struct i915_gem_context *ctx = request->ctx;
> > > > +	struct i915_perf_cs_sample *sample;
> > > > +	u32 addr = 0;
> > > > +	u32 cmd, *cs;
> > > > +
> > > > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > > > +	if (sample == NULL) {
> > > > +		DRM_ERROR("Perf sample alloc failed\n");
> > > > +		return;
> > > > +	}
> > > > +
> > > > +	cs = intel_ring_begin(request, 4);
> > > > +	if (IS_ERR(cs)) {
> > > > +		kfree(sample);
> > > > +		return;
> > > > +	}
> > > > +
> > > > +	sample->ctx_id = ctx->hw_id;
> > > > +	i915_gem_request_assign(&sample->request, request);
> > > 
> > > > +
> > > > +	i915_gem_active_set(&stream->last_request, request);
> > > 
> > > Hint, you don't need you own request trap.
> > Sorry, I didn't understand you fully here. I'm storing the reference to
> > the last active request associated with the stream inside the
> > last_request 'gem_active' field. Do I need to do something differently
> > here?
> 
> It's the duplication.

Are you suggesting that request_assign() and active_set() is
duplication? 
Actually, I'm using the last_request active tracker for the purpose of
waiting on last request to complete, whenever the need.
But still I need to get reference for every request for which the
commands for collection of metrics are emitted. This is because I need
to check for their completion before collecting the associated metrics.
This is done inside 'append_command_stream_samples' function, which also
takes care of releasing the reference taken here.
Am I missing something w.r.t. the active_set() functionality?

> -Chris
> 

Thanks,
Sourab
Chris Wilson March 16, 2017, 10:09 a.m. UTC | #7
On Thu, Mar 16, 2017 at 03:22:03PM +0530, sourab gupta wrote:
> On Thu, 2017-03-16 at 02:03 -0700, Chris Wilson wrote:
> > On Thu, Mar 16, 2017 at 02:24:55PM +0530, sourab gupta wrote:
> > > On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> > > > On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > > > > @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
> > > > >  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> > > > >  			      struct i915_gem_context *ctx,
> > > > >  			      uint32_t *reg_state);
> > > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
> > > > >  
> > > > >  /* i915_gem_evict.c */
> > > > >  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> > > > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > index aa75ea2..7af32c97 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
> > > > >  	if (exec_len == 0)
> > > > >  		exec_len = params->batch->size - params->args_batch_start_offset;
> > > > >  
> > > > > +	i915_perf_command_stream_hook(params->request);
> > > > 
> > > > Could you have named it anything more cyptic and inconsistent?
> > > 
> > > Sorry. Would 'i915_perf_capture_metrics' work?
> > > Can you please suggest an appropriate name otherwise.
> > 
> > The verb we use for writting into the command stream is emit. So
> > i915_perf_emit_samples() (emit_record record is clumsy as it is not clear
> > whether it is the verb or noun).
> > 
> Thanks for suggesting. I'll use 'i915_perf_emit_samples' here.
> 
> > > > 
> > > > Quite clearly this can fail, so justify the non handling of errors.
> > > > 
> > > > DRM_ERROR is not error handling, it is an indication that this patch
> > > > isn't ready.
> > > 
> > > Well, the intent was to have minimal effect to execbuf normal routine,
> > > even if we fail. But, I guess I'm mistaken.
> > > I'll rectify this to handle the case, if perf callback fails.
> > 
> > That all depends on whether or not you can handle the unbalanced
> > metrics. If simply missing a report is fine, then just kill the
> > DRM_ERROR.
> > 
> > The bigger question is whether the following emit can to fail -- once
> > the batch is in the request, no more failures are tolerable. You have to
> > preallocate reserved space.
> > 
> > Don't you need a flush before the emit following the batch?
> > 
> 
> Ok. So, that would mean that we have to first of all reserve the space
> required by two 'perf_emit_samples' calls, so that we can't fail for the
> lack of space in the emit following the batch.
> Probably, I could pass an additional boolean parameter 'reserve_space'
> set to true in the first call, which would reserve the space for both
> emit_samples() calls (through intel_ring_begin)?

Hmm, yes, you can just tweak the request->reserved_space in the first
call to preallocate some space (and make it remains available) for the
second.

perf_emit_samples(rq, bool preallocate) {
	/* then in each sample callback */
	cs_len = foo;
	if (preallocate)
		rq->reserved_space += cs_len;
	else
		rq->reserved_space -= cs_len;
	cs = intel_ring_begin(rq, cs_len);
}
	
> Would it still need the flush before the emit following the batch?
> Ideally, the metrics should be collected as close to batch as possible.
> So, if there are cache flush/tlb invalidate commands, it would introduce
> some lag between the batch and following emit_samples command.
> Sorry, I'm not able to gauge the need for flush here. I can understand
> it's needed before the batch is programmed for flushing the cache/TLB
> entries for the new workload to be submitted. But for the Sample_emit
> commands, which generally only capture OA/timestamp/mmio metrics, is
> this still required? 

Depends on the desired accuracy. If you want your metrics sampled after
the user pipeline is completed, you need a stall at least, a flush if
your metrics include e.g. fragment data. If you want samples
taken in whilst the user's batch is still executing, then no.

> > > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > > > > +{
> > > > > +	struct intel_engine_cs *engine = request->engine;
> > > > > +	struct drm_i915_private *dev_priv = engine->i915;
> > > > > +	struct i915_perf_stream *stream;
> > > > > +
> > > > > +	if (!dev_priv->perf.initialized)
> > > > > +		return;
> > > > > +
> > > > > +	mutex_lock(&dev_priv->perf.streams_lock);
> > > > 
> > > > Justify a new global lock very, very carefully on execbuf.
> > > 
> > > The lock introduced here is to protect the the perf.streams list against
> > > addition/deletion while we're processing the list during execbuf call.
> > > The other places where the mutex is taken is when a new stream is being
> > > created (using perf_open ioctl) or a stream is being destroyed
> > > (perf_release ioctl), which just protect the list_add and list_del into
> > > the perf.streams list.
> > > As such, there should not be much impact on execbuf path.
> > > Does this seem reasonable?
> > 
> > It doesn't sound like it needs a mutex, which will simplify the other
> > users as well (if switched to, for example, an RCU protected list).
> 
> Ok. Sounds reasonable, I'll switch to using an RCU protected list here.

(I may be overthinking this, but it still seems overkill and made the
timer callback uglier than expected.)

> > > > > +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> > > > > +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> > > > > +					stream->cs_mode)
> > > > > +			stream->ops->command_stream_hook(stream, request);
> > > > > +	}
> > > > > +	mutex_unlock(&dev_priv->perf.streams_lock);
> > > > > +}
> > > > 
> > > > > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > > > > +					struct drm_i915_gem_request *request)
> > > > > +{
> > > > > +	struct drm_i915_private *dev_priv = request->i915;
> > > > > +	struct i915_gem_context *ctx = request->ctx;
> > > > > +	struct i915_perf_cs_sample *sample;
> > > > > +	u32 addr = 0;
> > > > > +	u32 cmd, *cs;
> > > > > +
> > > > > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > > > > +	if (sample == NULL) {
> > > > > +		DRM_ERROR("Perf sample alloc failed\n");
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	cs = intel_ring_begin(request, 4);
> > > > > +	if (IS_ERR(cs)) {
> > > > > +		kfree(sample);
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	sample->ctx_id = ctx->hw_id;
> > > > > +	i915_gem_request_assign(&sample->request, request);
> > > > 
> > > > > +
> > > > > +	i915_gem_active_set(&stream->last_request, request);
> > > > 
> > > > Hint, you don't need you own request trap.
> > > Sorry, I didn't understand you fully here. I'm storing the reference to
> > > the last active request associated with the stream inside the
> > > last_request 'gem_active' field. Do I need to do something differently
> > > here?
> > 
> > It's the duplication.
> 
> Are you suggesting that request_assign() and active_set() is
> duplication? 
> Actually, I'm using the last_request active tracker for the purpose of
> waiting on last request to complete, whenever the need.
> But still I need to get reference for every request for which the
> commands for collection of metrics are emitted. This is because I need
> to check for their completion before collecting the associated metrics.

Missed the sample / stream difference. request_assign means update the
request field, had you used
	sample->request = i915_gem_request_get(request)
it would have been clearer.

Note that the requests are not ordered for the stream, so you cannot
track the last_request so easily.

> This is done inside 'append_command_stream_samples' function, which also
> takes care of releasing the reference taken here.
> Am I missing something w.r.t. the active_set() functionality?

I was mostly thrown by the idea that you were reassigning requests,
which history has shown to be a bad idea (hence i915_gem_active).
However, stream->last_request should be a reservation_object.
-Chris
sourab.gupta@intel.com March 16, 2017, 1:12 p.m. UTC | #8
On Thu, 2017-03-16 at 03:09 -0700, Chris Wilson wrote:
> On Thu, Mar 16, 2017 at 03:22:03PM +0530, sourab gupta wrote:
> > On Thu, 2017-03-16 at 02:03 -0700, Chris Wilson wrote:
> > > On Thu, Mar 16, 2017 at 02:24:55PM +0530, sourab gupta wrote:
> > > > On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> > > > > On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > > > > > @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
> > > > > >  void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> > > > > >  			      struct i915_gem_context *ctx,
> > > > > >  			      uint32_t *reg_state);
> > > > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
> > > > > >  
> > > > > >  /* i915_gem_evict.c */
> > > > > >  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > > index aa75ea2..7af32c97 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > > > @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
> > > > > >  	if (exec_len == 0)
> > > > > >  		exec_len = params->batch->size - params->args_batch_start_offset;
> > > > > >  
> > > > > > +	i915_perf_command_stream_hook(params->request);
> > > > > 
> > > > > Could you have named it anything more cryptic and inconsistent?
> > > > 
> > > > Sorry. Would 'i915_perf_capture_metrics' work?
> > > > Can you please suggest an appropriate name otherwise.
> > > 
> > > The verb we use for writting into the command stream is emit. So
> > > i915_perf_emit_samples() (emit_record record is clumsy as it is not clear
> > > whether it is the verb or noun).
> > > 
> > Thanks for suggesting. I'll use 'i915_perf_emit_samples' here.
> > 
> > > > > 
> > > > > Quite clearly this can fail, so justify the non handling of errors.
> > > > > 
> > > > > DRM_ERROR is not error handling, it is an indication that this patch
> > > > > isn't ready.
> > > > 
> > > > Well, the intent was to have minimal effect to execbuf normal routine,
> > > > even if we fail. But, I guess I'm mistaken.
> > > > I'll rectify this to handle the case, if perf callback fails.
> > > 
> > > That all depends on whether or not you can handle the unbalanced
> > > metrics. If simply missing a report is fine, then just kill the
> > > DRM_ERROR.
> > > 
> > > The bigger question is whether the following emit can to fail -- once
> > > the batch is in the request, no more failures are tolerable. You have to
> > > preallocate reserved space.
> > > 
> > > Don't you need a flush before the emit following the batch?
> > > 
> > 
> > Ok. So, that would mean that we have to first of all reserve the space
> > required by two 'perf_emit_samples' calls, so that we can't fail for the
> > lack of space in the emit following the batch.
> > Probably, I could pass an additional boolean parameter 'reserve_space'
> > set to true in the first call, which would reserve the space for both
> > emit_samples() calls (through intel_ring_begin)?
> 
> Hmm, yes, you can just tweak the request->reserved_space in the first
> call to preallocate some space (and make sure it remains available) for the
> second.
> 
> perf_emit_samples(rq, bool preallocate) {
> 	/* then in each sample callback */
> 	cs_len = foo;
> 	if (preallocate)
> 		rq->reserved_space += cs_len;
> 	else
> 		rq->reserved_space -= cs_len;
> 	cs = intel_ring_begin(rq, cs_len);
> }
> 	
Ok. I would do that. Thanks for suggesting.

> > Would it still need the flush before the emit following the batch?
> > Ideally, the metrics should be collected as close to batch as possible.
> > So, if there are cache flush/tlb invalidate commands, it would introduce
> > some lag between the batch and following emit_samples command.
> > Sorry, I'm not able to gauge the need for flush here. I can understand
> > it's needed before the batch is programmed for flushing the cache/TLB
> > entries for the new workload to be submitted. But for the Sample_emit
> > commands, which generally only capture OA/timestamp/mmio metrics, is
> > this still required? 
> 
> Depends on the desired accuracy. If you want your metrics sampled after
> the user pipeline is completed, you need a stall at least, a flush if
> your metrics include e.g. fragment data. If you want samples
> taken whilst the user's batch is still executing, then no.
> 
I guess, in that case, I would at least need a stall to ensure user
pipeline is completed before metrics are collected.
Any additional inputs here, Robert?

> > > > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > > > > > +{
> > > > > > +	struct intel_engine_cs *engine = request->engine;
> > > > > > +	struct drm_i915_private *dev_priv = engine->i915;
> > > > > > +	struct i915_perf_stream *stream;
> > > > > > +
> > > > > > +	if (!dev_priv->perf.initialized)
> > > > > > +		return;
> > > > > > +
> > > > > > +	mutex_lock(&dev_priv->perf.streams_lock);
> > > > > 
> > > > > Justify a new global lock very, very carefully on execbuf.
> > > > 
> > > > The lock introduced here is to protect the perf.streams list against
> > > > addition/deletion while we're processing the list during execbuf call.
> > > > The other places where the mutex is taken is when a new stream is being
> > > > created (using perf_open ioctl) or a stream is being destroyed
> > > > (perf_release ioctl), which just protect the list_add and list_del into
> > > > the perf.streams list.
> > > > As such, there should not be much impact on execbuf path.
> > > > Does this seem reasonable?
> > > 
> > > It doesn't sound like it needs a mutex, which will simplify the other
> > > users as well (if switched to, for example, an RCU protected list).
> > 
> > Ok. Sounds reasonable, I'll switch to using an RCU protected list here.
> 
> (I may be overthinking this, but it still seems overkill and made the
> timer callback uglier than expected.)
> 
Are you suggesting that using RCU here is overkill, and mutex would do?

> > > > > > +	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
> > > > > > +		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
> > > > > > +					stream->cs_mode)
> > > > > > +			stream->ops->command_stream_hook(stream, request);
> > > > > > +	}
> > > > > > +	mutex_unlock(&dev_priv->perf.streams_lock);
> > > > > > +}
> > > > > 
> > > > > > +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
> > > > > > +					struct drm_i915_gem_request *request)
> > > > > > +{
> > > > > > +	struct drm_i915_private *dev_priv = request->i915;
> > > > > > +	struct i915_gem_context *ctx = request->ctx;
> > > > > > +	struct i915_perf_cs_sample *sample;
> > > > > > +	u32 addr = 0;
> > > > > > +	u32 cmd, *cs;
> > > > > > +
> > > > > > +	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
> > > > > > +	if (sample == NULL) {
> > > > > > +		DRM_ERROR("Perf sample alloc failed\n");
> > > > > > +		return;
> > > > > > +	}
> > > > > > +
> > > > > > +	cs = intel_ring_begin(request, 4);
> > > > > > +	if (IS_ERR(cs)) {
> > > > > > +		kfree(sample);
> > > > > > +		return;
> > > > > > +	}
> > > > > > +
> > > > > > +	sample->ctx_id = ctx->hw_id;
> > > > > > +	i915_gem_request_assign(&sample->request, request);
> > > > > 
> > > > > > +
> > > > > > +	i915_gem_active_set(&stream->last_request, request);
> > > > > 
> > > > > Hint, you don't need your own request trap.
> > > > Sorry, I didn't understand you fully here. I'm storing the reference to
> > > > the last active request associated with the stream inside the
> > > > last_request 'gem_active' field. Do I need to do something differently
> > > > here?
> > > 
> > > It's the duplication.
> > 
> > Are you suggesting that request_assign() and active_set() is
> > duplication? 
> > Actually, I'm using the last_request active tracker for the purpose of
> > waiting on last request to complete, whenever the need arises.
> > But still I need to get reference for every request for which the
> > commands for collection of metrics are emitted. This is because I need
> > to check for their completion before collecting the associated metrics.
> 
> Missed the sample / stream difference. request_assign means update the
> request field, had you used
> 	sample->request = i915_gem_request_get(request)
> it would have been clearer.
> 
> Note that the requests are not ordered for the stream, so you cannot
> track the last_request so easily.
> 
> > This is done inside 'append_command_stream_samples' function, which also
> > takes care of releasing the reference taken here.
> > Am I missing something w.r.t. the active_set() functionality?
> 
> I was mostly thrown by the idea that you were reassigning requests,
> which history has shown to be a bad idea (hence i915_gem_active).
> However, stream->last_request should be a reservation_object.
> -Chris
> 
If I understand correctly, I need to have last_request of type
reservation object to hold all fences corresponding to the requests
pertaining to the stream. So, instead of i915_gem_active_set, I need to
do something like this:
	reservation_object_add_excl_fence(obj->resv, &rq->fence); /* Or any
other api to be used here ? */
And, when we want to wait for requests submitted on the stream to
complete, we have to wait for fences associated with the 'last_request'
reservation_object, for e.g. as done in
'i915_gem_object_wait_reservation' function.
Is this all there is to it, or am I missing some piece of action with this
reservation_object?

Thanks,
Sourab
Chris Wilson March 16, 2017, 1:27 p.m. UTC | #9
On Thu, Mar 16, 2017 at 06:42:21PM +0530, sourab gupta wrote:
> On Thu, 2017-03-16 at 03:09 -0700, Chris Wilson wrote:
> > On Thu, Mar 16, 2017 at 03:22:03PM +0530, sourab gupta wrote:
> > > On Thu, 2017-03-16 at 02:03 -0700, Chris Wilson wrote:
> > > > On Thu, Mar 16, 2017 at 02:24:55PM +0530, sourab gupta wrote:
> > > > > On Thu, 2017-03-16 at 01:10 -0700, Chris Wilson wrote:
> > > > > > On Thu, Mar 16, 2017 at 11:44:10AM +0530, sourab.gupta@intel.com wrote:
> > > > > > > +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
> > > > > > > +{
> > > > > > > +	struct intel_engine_cs *engine = request->engine;
> > > > > > > +	struct drm_i915_private *dev_priv = engine->i915;
> > > > > > > +	struct i915_perf_stream *stream;
> > > > > > > +
> > > > > > > +	if (!dev_priv->perf.initialized)
> > > > > > > +		return;
> > > > > > > +
> > > > > > > +	mutex_lock(&dev_priv->perf.streams_lock);
> > > > > > 
> > > > > > Justify a new global lock very, very carefully on execbuf.
> > > > > 
> > > > > The lock introduced here is to protect the perf.streams list against
> > > > > addition/deletion while we're processing the list during execbuf call.
> > > > > The other places where the mutex is taken is when a new stream is being
> > > > > created (using perf_open ioctl) or a stream is being destroyed
> > > > > (perf_release ioctl), which just protect the list_add and list_del into
> > > > > the perf.streams list.
> > > > > As such, there should not be much impact on execbuf path.
> > > > > Does this seem reasonable?
> > > > 
> > > > It doesn't sound like it needs a mutex, which will simplify the other
> > > > users as well (if switched to, for example, an RCU protected list).
> > > 
> > > Ok. Sounds reasonable, I'll switch to using an RCU protected list here.
> > 
> > (I may be overthinking this, but it still seems overkill and made the
> > timer callback uglier than expected.)
> > 
> Are you suggesting that using RCU here is overkill, and mutex would do?

No, I still think it can be done without a mutex, just slightly more
ugly due to the potential sleep here (perhaps srcu?). It would just be a
shame when we get parallel submission of execbuf sorted for it all to be
serialised by walking a simple list.

> > I was mostly thrown by the idea that you were reassigning requests,
> > which history has shown to be a bad idea (hence i915_gem_active).
> > However, stream->last_request should be a reservation_object.
> > 
> If I understand correctly, I need to have last_request of type
> reservation object to hold all fences corresponding to the requests
> pertaining to the stream. So, instead of i915_gem_active_set, I need to
> do something like this:
> 	reservation_object_add_excl_fence(obj->resv, &rq->fence); /* Or any
> other api to be used here ? */

reservation_object_lock();
if (reservation_object_reserve_shared() == 0)
	reservation_object_add_shared_fence();
reservation_object_unlock();

> And, when we want to wait for requests submitted on the stream to
> complete, we have to wait for fences associated with the 'last_request'
> reservation_object, for e.g. as done in
> 'i915_gem_object_wait_reservation' function.
> Is this all to it, or am I missing some piece of action with this
> reservation_object?

reservation_object_test_signaled_rcu(.test_all = true)
reservation_object_wait_timeout_rcu(.wait_all = true)
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ef104ff5..0b70052 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1972,6 +1972,23 @@  struct i915_perf_stream_ops {
 	 * The stream will always be disabled before this is called.
 	 */
 	void (*destroy)(struct i915_perf_stream *stream);
+
+	/*
+	 * @command_stream_hook: Emit the commands in the command streamer
+	 * for a particular gpu engine.
+	 *
+	 * The commands are inserted to capture the perf sample data at
+	 * specific points during workload execution, such as before and after
+	 * the batch buffer.
+	 */
+	void (*command_stream_hook)(struct i915_perf_stream *stream,
+				struct drm_i915_gem_request *request);
+};
+
+enum i915_perf_stream_state {
+	I915_PERF_STREAM_DISABLED,
+	I915_PERF_STREAM_ENABLE_IN_PROGRESS,
+	I915_PERF_STREAM_ENABLED,
 };
 
 /**
@@ -1989,6 +2006,10 @@  struct i915_perf_stream {
 	struct list_head link;
 
 	/**
+	 * @engine: GPU engine associated with this particular stream
+	 */
+	enum intel_engine_id engine;
+	/**
 	 * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*`
 	 * properties given when opening a stream, representing the contents
 	 * of a single sample as read() by userspace.
@@ -2009,11 +2030,25 @@  struct i915_perf_stream {
 	struct i915_gem_context *ctx;
 
 	/**
-	 * @enabled: Whether the stream is currently enabled, considering
-	 * whether the stream was opened in a disabled state and based
-	 * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls.
+	 * @state: Current stream state, which can be either disabled, enabled,
+	 * or enable_in_progress, while considering whether the stream was
+	 * opened in a disabled state and based on `I915_PERF_IOCTL_ENABLE` and
+	 * `I915_PERF_IOCTL_DISABLE` calls.
 	 */
-	bool enabled;
+	enum i915_perf_stream_state state;
+
+	/**
+	 * @cs_mode: Whether command stream based perf sample collection is
+	 * enabled for this stream
+	 */
+	bool cs_mode;
+
+	/**
+	 * @last_request: Contains an RCU guarded pointer to the last request
+	 * associated with the stream, when command stream mode is enabled for
+	 * the stream.
+	 */
+	struct i915_gem_active last_request;
 
 	/**
 	 * @ops: The callbacks providing the implementation of this specific
@@ -2074,7 +2109,8 @@  struct i915_oa_ops {
 	int (*read)(struct i915_perf_stream *stream,
 		    char __user *buf,
 		    size_t count,
-		    size_t *offset);
+		    size_t *offset,
+		    u32 ts);
 
 	/**
 	 * @oa_buffer_check: Check for OA buffer data + update tail
@@ -2093,6 +2129,36 @@  struct i915_oa_ops {
 	bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
 };
 
+/*
+ * i915_perf_cs_sample - Sample element to hold info about a single perf
+ * sample data associated with a particular GPU command stream.
+ */
+struct i915_perf_cs_sample {
+	/**
+	 * @link: Links the sample into ``&drm_i915_private->perf.cs_samples``
+	 */
+	struct list_head link;
+
+	/**
+	 * @request: Gem request associated with the sample. The commands to
+	 * capture the perf metrics are inserted into the command streamer in
+	 * context of this request.
+	 */
+	struct drm_i915_gem_request *request;
+
+	/**
+	 * @offset: Offset into ``&drm_i915_private->perf.command_stream_buf``
+	 * where the perf metrics will be collected, when the commands inserted
+	 * into the command stream are executed by GPU.
+	 */
+	u32 offset;
+
+	/**
+	 * @ctx_id: Context ID associated with this perf sample
+	 */
+	u32 ctx_id;
+};
+
 struct intel_cdclk_state {
 	unsigned int cdclk, vco, ref;
 };
@@ -2421,6 +2487,8 @@  struct drm_i915_private {
 		struct ctl_table_header *sysctl_header;
 
 		struct mutex lock;
+
+		struct mutex streams_lock;
 		struct list_head streams;
 
 		spinlock_t hook_lock;
@@ -2532,6 +2600,15 @@  struct drm_i915_private {
 			const struct i915_oa_format *oa_formats;
 			int n_builtin_sets;
 		} oa;
+
+		/* Command stream based perf data buffer */
+		struct {
+			struct i915_vma *vma;
+			u8 *vaddr;
+		} command_stream_buf;
+
+		struct list_head cs_samples;
+		spinlock_t sample_lock;
 	} perf;
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -3593,6 +3670,7 @@  void i915_oa_init_reg_state(struct intel_engine_cs *engine,
 void i915_oa_update_reg_state(struct intel_engine_cs *engine,
 			      struct i915_gem_context *ctx,
 			      uint32_t *reg_state);
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index aa75ea2..7af32c97 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1441,12 +1441,16 @@  static void eb_export_fence(struct drm_i915_gem_object *obj,
 	if (exec_len == 0)
 		exec_len = params->batch->size - params->args_batch_start_offset;
 
+	i915_perf_command_stream_hook(params->request);
+
 	ret = params->engine->emit_bb_start(params->request,
 					    exec_start, exec_len,
 					    params->dispatch_flags);
 	if (ret)
 		return ret;
 
+	i915_perf_command_stream_hook(params->request);
+
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 540c5b2..3321dad 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -285,6 +285,12 @@ 
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
 #define OAREPORT_REASON_CLK_RATIO      (1<<5)
 
+/* Data common to periodic and RCS based OA samples */
+struct oa_sample_data {
+	u32 source;
+	u32 ctx_id;
+	const u8 *report;
+};
 
 /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
  *
@@ -323,8 +329,19 @@ 
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+/* Duplicated from similar static enum in i915_gem_execbuffer.c */
+#define I915_USER_RINGS (4)
+static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
+	[I915_EXEC_DEFAULT]	= RCS,
+	[I915_EXEC_RENDER]	= RCS,
+	[I915_EXEC_BLT]		= BCS,
+	[I915_EXEC_BSD]		= VCS,
+	[I915_EXEC_VEBOX]	= VECS
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 #define SAMPLE_OA_SOURCE_INFO	(1<<1)
+#define SAMPLE_CTX_ID		(1<<2)
 
 /**
  * struct perf_open_properties - for validated properties given to open a stream
@@ -335,6 +352,9 @@ 
  * @oa_format: An OA unit HW report format
  * @oa_periodic: Whether to enable periodic OA unit sampling
  * @oa_period_exponent: The OA unit sampling period is derived from this
+ * @cs_mode: Whether the stream is configured to enable collection of metrics
+ * associated with command stream of a particular GPU engine
+ * @engine: The GPU engine associated with the stream in case cs_mode is enabled
  *
  * As read_properties_unlocked() enumerates and validates the properties given
  * to open a stream of metrics the configuration is built up in the structure
@@ -351,9 +371,218 @@  struct perf_open_properties {
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
+
+	/* Command stream mode */
+	bool cs_mode;
+	enum intel_engine_id engine;
 };
 
 /**
+ * i915_perf_command_stream_hook - Insert the commands to capture metrics into the
+ * command stream of a GPU engine.
+ * @request: request in whose context the metrics are being collected.
+ *
+ * The function provides a hook through which the commands to capture perf
+ * metrics, are inserted into the command stream of a GPU engine.
+ */
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_perf_stream *stream;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	mutex_lock(&dev_priv->perf.streams_lock);
+	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
+					stream->cs_mode)
+			stream->ops->command_stream_hook(stream, request);
+	}
+	mutex_unlock(&dev_priv->perf.streams_lock);
+}
+
+/**
+ * release_perf_samples - Release old perf samples to make space for new
+ * sample data.
+ * @dev_priv: i915 device private
+ * @target_size: Space required to be freed up.
+ *
+ * We also dereference the associated request before deleting the sample.
+ * Also, no need to check whether the commands associated with old samples
+ * have been completed. This is because these sample entries are anyways going
+ * to be replaced by a new sample, and gpu will eventually overwrite the buffer
+ * contents, when the request associated with new sample completes.
+ */
+static void release_perf_samples(struct drm_i915_private *dev_priv,
+					u32 target_size)
+{
+	struct i915_perf_cs_sample *sample, *next;
+	u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 size = 0;
+
+	list_for_each_entry_safe
+		(sample, next, &dev_priv->perf.cs_samples, link) {
+
+		size += sample_size;
+		i915_gem_request_put(sample->request);
+		list_del(&sample->link);
+		kfree(sample);
+
+		if (size >= target_size)
+			break;
+	}
+}
+
+/**
+ * insert_perf_sample - Insert a perf sample entry to the sample list.
+ * @dev_priv: i915 device private
+ * @sample: perf CS sample to be inserted into the list
+ *
+ * This function never fails, since it always manages to insert the sample.
+ * If the space is exhausted in the buffer, it will remove the older
+ * entries in order to make space.
+ */
+static void insert_perf_sample(struct drm_i915_private *dev_priv,
+				struct i915_perf_cs_sample *sample)
+{
+	struct i915_perf_cs_sample *first, *last;
+	int max_offset = dev_priv->perf.command_stream_buf.vma->obj->base.size;
+	u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
+
+	spin_lock(&dev_priv->perf.sample_lock);
+	if (list_empty(&dev_priv->perf.cs_samples)) {
+		sample->offset = 0;
+		list_add_tail(&sample->link, &dev_priv->perf.cs_samples);
+		spin_unlock(&dev_priv->perf.sample_lock);
+		return;
+	}
+
+	first = list_first_entry(&dev_priv->perf.cs_samples,typeof(*first),
+				link);
+	last = list_last_entry(&dev_priv->perf.cs_samples, typeof(*last),
+				link);
+
+	if (last->offset >= first->offset) {
+		/* Sufficient space available at the end of buffer? */
+		if (last->offset + 2*sample_size < max_offset)
+			sample->offset = last->offset + sample_size;
+		/*
+		 * Wraparound condition. Is sufficient space available at
+		 * beginning of buffer?
+		 */
+		else if (sample_size < first->offset)
+			sample->offset = 0;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = sample_size - first->offset;
+
+			release_perf_samples(dev_priv, target_size);
+			sample->offset = 0;
+		}
+	} else {
+		/* Sufficient space available? */
+		if (last->offset + 2*sample_size < first->offset)
+			sample->offset = last->offset + sample_size;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = sample_size -
+				(first->offset - last->offset -
+				sample_size);
+
+			release_perf_samples(dev_priv, target_size);
+			sample->offset = last->offset + sample_size;
+		}
+	}
+	list_add_tail(&sample->link, &dev_priv->perf.cs_samples);
+	spin_unlock(&dev_priv->perf.sample_lock);
+}
+
+/**
+ * i915_perf_command_stream_hook_oa - Insert the commands to capture OA reports
+ * metrics into the render command stream
+ * @stream: An i915-perf stream opened for OA metrics
+ * @request: request in whose context the metrics are being collected.
+ *
+ */
+static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream,
+					struct drm_i915_gem_request *request)
+{
+	struct drm_i915_private *dev_priv = request->i915;
+	struct i915_gem_context *ctx = request->ctx;
+	struct i915_perf_cs_sample *sample;
+	u32 addr = 0;
+	u32 cmd, *cs;
+
+	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
+	if (sample == NULL) {
+		DRM_ERROR("Perf sample alloc failed\n");
+		return;
+	}
+
+	cs = intel_ring_begin(request, 4);
+	if (IS_ERR(cs)) {
+		kfree(sample);
+		return;
+	}
+
+	sample->ctx_id = ctx->hw_id;
+	i915_gem_request_assign(&sample->request, request);
+
+	insert_perf_sample(dev_priv, sample);
+
+	addr = dev_priv->perf.command_stream_buf.vma->node.start +
+		sample->offset;
+
+	if (WARN_ON(addr & 0x3f)) {
+		DRM_ERROR("OA buffer address not aligned to 64 byte");
+		return;
+	}
+
+	cmd = MI_REPORT_PERF_COUNT | (1<<0);
+	if (INTEL_GEN(dev_priv) >= 8)
+		cmd |= (2<<0);
+
+	*cs++ = cmd;
+	*cs++ = addr | MI_REPORT_PERF_COUNT_GGTT;
+	*cs++ = request->global_seqno;
+
+	if (INTEL_GEN(dev_priv) >= 8)
+		*cs++ = 0;
+	else
+		*cs++ = MI_NOOP;
+
+	intel_ring_advance(request, cs);
+
+	i915_gem_active_set(&stream->last_request, request);
+	i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, request,
+					EXEC_OBJECT_WRITE);
+}
+
+/**
+ * i915_oa_rcs_release_samples - Release the perf command stream samples
+ * @dev_priv: i915 device private
+ *
+ * Note: The associated requests should be completed before releasing the
+ * references here.
+ */
+static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv)
+{
+	struct i915_perf_cs_sample *entry, *next;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->perf.cs_samples, link) {
+		i915_gem_request_put(entry->request);
+
+		spin_lock(&dev_priv->perf.sample_lock);
+		list_del(&entry->link);
+		spin_unlock(&dev_priv->perf.sample_lock);
+		kfree(entry);
+	}
+}
+
+/**
  * gen8_oa_buffer_check_unlocked - check for data and update tail ptr state
  * @dev_priv: i915 device instance
  *
@@ -626,7 +855,8 @@  static int append_oa_status(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
- * @report: A single OA report to (optionally) include as part of the sample
+ * @data: OA sample data, which contains (optionally) OA report with other
+ * sample metadata.
  *
  * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
  * properties when opening a stream, tracked as `stream->sample_flags`. This
@@ -641,7 +871,7 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 			    char __user *buf,
 			    size_t count,
 			    size_t *offset,
-			    const u8 *report)
+			    const struct oa_sample_data *data)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -661,17 +891,21 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 	buf += sizeof(header);
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
-		enum drm_i915_perf_oa_event_source source =
-				I915_PERF_OA_EVENT_SOURCE_PERIODIC;
+		if (copy_to_user(buf, &data->source, 4))
+			return -EFAULT;
+		buf += 4;
+	}
 
-		if (copy_to_user(buf, &source, 4))
+	if (sample_flags & SAMPLE_CTX_ID) {
+		if (copy_to_user(buf, &data->ctx_id, 4))
 			return -EFAULT;
 		buf += 4;
 	}
 
 	if (sample_flags & SAMPLE_OA_REPORT) {
-		if (copy_to_user(buf, report, report_size))
+		if (copy_to_user(buf, data->report, report_size))
 			return -EFAULT;
+		buf += report_size;
 	}
 
 	(*offset) += header.size;
@@ -680,11 +914,47 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 }
 
 /**
+ * append_oa_buffer_sample - Copies single periodic OA report into userspace
+ * read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @report: A single OA report to (optionally) include as part of the sample
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_oa_buffer_sample(struct i915_perf_stream *stream,
+				char __user *buf, size_t count,
+				size_t *offset,	const u8 *report)
+{
+	u32 sample_flags = stream->sample_flags;
+	struct oa_sample_data data = { 0 };
+
+	if (sample_flags & SAMPLE_OA_SOURCE_INFO)
+		data.source = I915_PERF_OA_EVENT_SOURCE_PERIODIC;
+
+	/*
+	 * FIXME: append_oa_buffer_sample: read ctx ID from report and map
+	 * that to a intel_context::hw_id"
+	 */
+	if (sample_flags & SAMPLE_CTX_ID)
+		data.ctx_id = 0;
+
+	if (sample_flags & SAMPLE_OA_REPORT)
+		data.report = report;
+
+	return append_oa_sample(stream, buf, count, offset, &data);
+}
+
+
+/**
  * Copies all buffered OA reports into userspace read() buffer.
  * @stream: An i915-perf stream opened for OA metrics
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -702,7 +972,8 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
-				  size_t *offset)
+				  size_t *offset,
+				  u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -716,7 +987,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	u32 taken;
 	int ret = 0;
 
-	if (WARN_ON(!stream->enabled))
+	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
 
 	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
@@ -759,6 +1030,11 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		u32 *report32 = (void *)report;
 		u32 ctx_id;
 		u32 reason;
+		u32 report_ts = report32[1];
+
+		/* Report timestamp should not exceed the given ts */
+		if (report_ts > ts)
+			break;
 
 		/* All the report sizes factor neatly into the buffer
 		 * size so we never expect to see a report split
@@ -846,8 +1122,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				report32[2] = INVALID_CTX_ID;
 			}
 
-			ret = append_oa_sample(stream, buf, count, offset,
-					       report);
+			ret = append_oa_buffer_sample(stream, buf, count,
+							offset, report);
 			if (ret)
 				break;
 
@@ -886,6 +1162,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Checks OA unit status registers and if necessary appends corresponding
  * status records for userspace (such as for a buffer full condition) and then
@@ -903,7 +1180,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 static int gen8_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset)
+			size_t *offset,
+			u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus;
@@ -953,7 +1231,7 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 		}
 	}
 
-	return gen8_append_oa_reports(stream, buf, count, offset);
+	return gen8_append_oa_reports(stream, buf, count, offset, ts);
 }
 
 /**
@@ -962,6 +1240,7 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -979,7 +1258,8 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
-				  size_t *offset)
+				  size_t *offset,
+				  u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -993,7 +1273,7 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	u32 taken;
 	int ret = 0;
 
-	if (WARN_ON(!stream->enabled))
+	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
 
 	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
@@ -1060,7 +1340,11 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 			continue;
 		}
 
-		ret = append_oa_sample(stream, buf, count, offset, report);
+		/* Report timestamp should not exceed the given ts */
+		if (report32[1] > ts)
+			break;
+
+		ret = append_oa_buffer_sample(stream, buf, count, offset, report);
 		if (ret)
 			break;
 
@@ -1099,6 +1383,7 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Checks Gen 7 specific OA unit status registers and if necessary appends
  * corresponding status records for userspace (such as for a buffer full
@@ -1112,7 +1397,8 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 static int gen7_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset)
+			size_t *offset,
+			u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus1;
@@ -1174,7 +1460,159 @@  static int gen7_oa_read(struct i915_perf_stream *stream,
 			GEN7_OASTATUS1_REPORT_LOST;
 	}
 
-	return gen7_append_oa_reports(stream, buf, count, offset);
+	return gen7_append_oa_reports(stream, buf, count, offset, ts);
+}
+
+/**
+ * append_oa_rcs_sample - Copies single RCS based OA report into userspace
+ * read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @node: Sample data associated with RCS based OA report
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_oa_rcs_sample(struct i915_perf_stream *stream,
+				char __user *buf,
+				size_t count,
+				size_t *offset,
+				struct i915_perf_cs_sample *node)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct oa_sample_data data = { 0 };
+	const u8 *report = dev_priv->perf.command_stream_buf.vaddr +
+				node->offset;
+	u32 sample_flags = stream->sample_flags;
+	u32 report_ts;
+	int ret;
+
+	/* First, append the periodic OA samples having lower timestamps */
+	report_ts = *(u32 *)(report + 4);
+	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, report_ts);
+	if (ret)
+		return ret;
+
+	if (sample_flags & SAMPLE_OA_SOURCE_INFO)
+		data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
+
+	if (sample_flags & SAMPLE_CTX_ID)
+		data.ctx_id = node->ctx_id;
+
+	if (sample_flags & SAMPLE_OA_REPORT)
+		data.report = report;
+
+	return append_oa_sample(stream, buf, count, offset, &data);
+}
+
+/**
+ * oa_rcs_append_reports - Copies all command stream based OA reports into
+ * userspace read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ *
+ * Notably any error condition resulting in a short read (-%ENOSPC or
+ * -%EFAULT) will be returned even though one or more records may
+ * have been successfully copied. In this case it's up to the caller
+ * to decide if the error should be squashed before returning to
+ * userspace.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int oa_rcs_append_reports(struct i915_perf_stream *stream,
+				char __user *buf,
+				size_t count,
+				size_t *offset)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_sample *entry, *next;
+	LIST_HEAD(free_list);
+	int ret = 0;
+
+	spin_lock(&dev_priv->perf.sample_lock);
+	if (list_empty(&dev_priv->perf.cs_samples)) {
+		spin_unlock(&dev_priv->perf.sample_lock);
+		return 0;
+	}
+	list_for_each_entry_safe(entry, next,
+				 &dev_priv->perf.cs_samples, link) {
+		if (!i915_gem_request_completed(entry->request))
+			break;
+		list_move_tail(&entry->link, &free_list);
+	}
+	spin_unlock(&dev_priv->perf.sample_lock);
+
+	if (list_empty(&free_list))
+		return 0;
+
+	list_for_each_entry_safe(entry, next, &free_list, link) {
+		ret = append_oa_rcs_sample(stream, buf, count, offset, entry);
+		if (ret)
+			break;
+
+		list_del(&entry->link);
+		i915_gem_request_put(entry->request);
+		kfree(entry);
+	}
+
+	/* Don't discard remaining entries, keep them for next read */
+	spin_lock(&dev_priv->perf.sample_lock);
+	list_splice(&free_list, &dev_priv->perf.cs_samples);
+	spin_unlock(&dev_priv->perf.sample_lock);
+
+	return ret;
+}
+
+/*
+ * command_stream_buf_is_empty - Checks whether the command stream buffer
+ * associated with the stream has data available.
+ * @stream: An i915-perf stream opened for OA metrics
+ *
+ * Returns: true if there is no completed sample available to read (i.e. the
+ * sample list is empty or the first request is not yet completed), else false.
+ */
+static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
+
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_sample *entry = NULL;
+	struct drm_i915_gem_request *request = NULL;
+
+	spin_lock(&dev_priv->perf.sample_lock);
+	entry = list_first_entry_or_null(&dev_priv->perf.cs_samples,
+			struct i915_perf_cs_sample, link);
+	if (entry)
+		request = entry->request;
+	spin_unlock(&dev_priv->perf.sample_lock);
+
+	if (!entry)
+		return true;
+	else if (!i915_gem_request_completed(request))
+		return true;
+	else
+		return false;
+}
+
+/**
+ * stream_have_data_unlocked - Checks whether the stream has data available
+ * @stream: An i915-perf stream opened for OA metrics
+ *
+ * For command stream based streams, check if the command stream buffer has
+ * at least one sample available; if not, return false irrespective of whether
+ * the periodic OA buffer has data or not.
+ */
+
+static bool stream_have_data_unlocked(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+
+	if (stream->cs_mode)
+		return !command_stream_buf_is_empty(stream);
+	else
+		return dev_priv->perf.oa.ops.oa_buffer_check(dev_priv);
 }
 
 /**
@@ -1199,8 +1637,22 @@  static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
 	if (!dev_priv->perf.oa.periodic)
 		return -EIO;
 
+	if (stream->cs_mode) {
+		int ret;
+
+		/*
+		 * Wait for the last submitted 'active' request.
+		 */
+		ret = i915_gem_active_wait(&stream->last_request,
+					   I915_WAIT_INTERRUPTIBLE);
+		if (ret) {
+			DRM_ERROR("Failed to wait for stream active request\n");
+			return ret;
+		}
+	}
+
 	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
-					dev_priv->perf.oa.ops.oa_buffer_check(dev_priv));
+					stream_have_data_unlocked(stream));
 }
 
 /**
@@ -1241,7 +1693,12 @@  static int i915_oa_read(struct i915_perf_stream *stream,
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
+
+	if (stream->cs_mode)
+		return oa_rcs_append_reports(stream, buf, count, offset);
+	else
+		return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+						U32_MAX);
 }
 
 /**
@@ -1315,6 +1772,21 @@  static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 }
 
 static void
+free_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+	mutex_lock(&dev_priv->drm.struct_mutex);
+
+	i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf.vma->obj);
+	i915_vma_unpin(dev_priv->perf.command_stream_buf.vma);
+	i915_gem_object_put(dev_priv->perf.command_stream_buf.vma->obj);
+
+	dev_priv->perf.command_stream_buf.vma = NULL;
+	dev_priv->perf.command_stream_buf.vaddr = NULL;
+
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+}
+
+static void
 free_oa_buffer(struct drm_i915_private *i915)
 {
 	mutex_lock(&i915->drm.struct_mutex);
@@ -1333,7 +1805,8 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
+	if (WARN_ON(stream != dev_priv->perf.oa.exclusive_stream))
+		return;
 
 	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
@@ -1345,6 +1818,9 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
+	if (stream->cs_mode)
+		free_command_stream_buf(dev_priv);
+
 	dev_priv->perf.oa.exclusive_stream = NULL;
 }
 
@@ -1446,25 +1922,24 @@  static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	dev_priv->perf.oa.pollin = false;
 }
 
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_obj(struct drm_i915_private *dev_priv,
+		     struct i915_vma **vma, u8 **vaddr)
 {
 	struct drm_i915_gem_object *bo;
-	struct i915_vma *vma;
 	int ret;
 
-	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
-		return -ENODEV;
+	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
-		return ret;
+		goto out;
 
 	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
 	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
 
 	bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
 	if (IS_ERR(bo)) {
-		DRM_ERROR("Failed to allocate OA buffer\n");
+		DRM_ERROR("Failed to allocate i915 perf obj\n");
 		ret = PTR_ERR(bo);
 		goto unlock;
 	}
@@ -1474,42 +1949,83 @@  static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 		goto err_unref;
 
 	/* PreHSW required 512K alignment, HSW requires 16M */
-	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
+	*vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
+	if (IS_ERR(*vma)) {
+		ret = PTR_ERR(*vma);
 		goto err_unref;
 	}
-	dev_priv->perf.oa.oa_buffer.vma = vma;
 
-	dev_priv->perf.oa.oa_buffer.vaddr =
-		i915_gem_object_pin_map(bo, I915_MAP_WB);
-	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
-		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
+	*vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(*vaddr)) {
+		ret = PTR_ERR(*vaddr);
 		goto err_unpin;
 	}
 
-	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
-
-	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
-			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
-			 dev_priv->perf.oa.oa_buffer.vaddr);
-
 	goto unlock;
 
 err_unpin:
-	__i915_vma_unpin(vma);
+	i915_vma_unpin(*vma);
 
 err_unref:
 	i915_gem_object_put(bo);
 
-	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
-	dev_priv->perf.oa.oa_buffer.vma = NULL;
-
 unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
+out:
+	intel_runtime_pm_put(dev_priv);
 	return ret;
 }
 
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	struct i915_vma *vma;
+	u8 *vaddr;
+	int ret;
+
+	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
+		return -ENODEV;
+
+	ret = alloc_obj(dev_priv, &vma, &vaddr);
+	if (ret)
+		return ret;
+
+	dev_priv->perf.oa.oa_buffer.vma = vma;
+	dev_priv->perf.oa.oa_buffer.vaddr = vaddr;
+
+	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
+
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
+			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
+			 dev_priv->perf.oa.oa_buffer.vaddr);
+	return 0;
+}
+
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+	struct i915_vma *vma;
+	u8 *vaddr;
+	int ret;
+
+	if (WARN_ON(dev_priv->perf.command_stream_buf.vma))
+		return -ENODEV;
+
+	ret = alloc_obj(dev_priv, &vma, &vaddr);
+	if (ret)
+		return ret;
+
+	dev_priv->perf.command_stream_buf.vma = vma;
+	dev_priv->perf.command_stream_buf.vaddr = vaddr;
+	if (WARN_ON(!list_empty(&dev_priv->perf.cs_samples)))
+		INIT_LIST_HEAD(&dev_priv->perf.cs_samples);
+
+	DRM_DEBUG_DRIVER(
+		"command stream buf initialized, gtt offset = 0x%x, vaddr = %p\n",
+		 i915_ggtt_offset(dev_priv->perf.command_stream_buf.vma),
+		 dev_priv->perf.command_stream_buf.vaddr);
+
+	return 0;
+}
+
 static void config_oa_regs(struct drm_i915_private *dev_priv,
 			   const struct i915_oa_reg *regs,
 			   int n_regs)
@@ -1848,7 +2364,8 @@  static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
 
-	if (dev_priv->perf.oa.exclusive_stream->enabled) {
+	if (dev_priv->perf.oa.exclusive_stream->state !=
+					I915_PERF_STREAM_DISABLED) {
 		struct i915_gem_context *ctx =
 			dev_priv->perf.oa.exclusive_stream->ctx;
 		u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
@@ -1954,10 +2471,20 @@  static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	dev_priv->perf.oa.ops.oa_disable(dev_priv);
-
 	if (dev_priv->perf.oa.periodic)
 		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+
+	if (stream->cs_mode) {
+		/*
+		 * Wait for the last submitted 'active' request, before freeing
+		 * the requests associated with the stream.
+		 */
+		i915_gem_active_wait(&stream->last_request,
+					   I915_WAIT_INTERRUPTIBLE);
+		i915_oa_rcs_release_samples(dev_priv);
+	}
+
+	dev_priv->perf.oa.ops.oa_disable(dev_priv);
 }
 
 static const struct i915_perf_stream_ops i915_oa_stream_ops = {
@@ -1967,6 +2494,7 @@  static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 	.wait_unlocked = i915_oa_wait_unlocked,
 	.poll_wait = i915_oa_poll_wait,
 	.read = i915_oa_read,
+	.command_stream_hook = i915_perf_command_stream_hook_oa,
 };
 
 /**
@@ -1992,28 +2520,11 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	int format_size;
+	bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
+						      SAMPLE_OA_SOURCE_INFO);
+	bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
 	int ret;
 
-	/* If the sysfs metrics/ directory wasn't registered for some
-	 * reason then don't let userspace try their luck with config
-	 * IDs
-	 */
-	if (!dev_priv->perf.metrics_kobj) {
-		DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
-		return -EINVAL;
-	}
-
-	if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
-		DRM_DEBUG("Only OA report sampling supported\n");
-		return -EINVAL;
-	}
-
-	if (!dev_priv->perf.oa.ops.init_oa_buffer) {
-		DRM_DEBUG("OA unit not supported\n");
-		return -ENODEV;
-	}
-
 	/* To avoid the complexity of having to accurately filter
 	 * counter reports and marshal to the appropriate client
 	 * we currently only allow exclusive access
@@ -2023,80 +2534,154 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		return -EBUSY;
 	}
 
-	if (!props->metrics_set) {
-		DRM_DEBUG("OA metric set not specified\n");
-		return -EINVAL;
-	}
-
-	if (!props->oa_format) {
-		DRM_DEBUG("OA report format not specified\n");
-		return -EINVAL;
+	if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
+		if (IS_HASWELL(dev_priv)) {
+			DRM_ERROR(
+				"On HSW, context ID sampling only supported via command stream\n");
+			return -EINVAL;
+		} else if (!i915.enable_execlists) {
+			DRM_ERROR(
+				"On Gen8+ without execlists, context ID sampling only supported via command stream\n");
+			return -EINVAL;
+		}
 	}
 
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
-	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+	if (require_oa_unit) {
+		int format_size;
 
-	stream->sample_flags |= SAMPLE_OA_REPORT;
-	stream->sample_size += format_size;
+		/* If the sysfs metrics/ directory wasn't registered for some
+		 * reason then don't let userspace try their luck with config
+		 * IDs
+		 */
+		if (!dev_priv->perf.metrics_kobj) {
+			DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
+			return -EINVAL;
+		}
 
-	if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
-		stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
-		stream->sample_size += 4;
-	}
+		if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+			DRM_DEBUG("OA unit not supported\n");
+			return -ENODEV;
+		}
 
-	dev_priv->perf.oa.oa_buffer.format_size = format_size;
-	if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
-		return -EINVAL;
+		if (!props->metrics_set) {
+			DRM_DEBUG("OA metric set not specified\n");
+			return -EINVAL;
+		}
 
-	dev_priv->perf.oa.oa_buffer.format =
-		dev_priv->perf.oa.oa_formats[props->oa_format].format;
+		if (!props->oa_format) {
+			DRM_DEBUG("OA report format not specified\n");
+			return -EINVAL;
+		}
 
-	dev_priv->perf.oa.metrics_set = props->metrics_set;
+		if (props->cs_mode && (props->engine != RCS)) {
+			DRM_ERROR(
+				  "Command stream OA metrics only available via Render CS\n");
+			return -EINVAL;
+		}
+		stream->engine = RCS;
 
-	dev_priv->perf.oa.periodic = props->oa_periodic;
-	if (dev_priv->perf.oa.periodic)
-		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+		format_size =
+			dev_priv->perf.oa.oa_formats[props->oa_format].size;
+
+		if (props->sample_flags & SAMPLE_OA_REPORT) {
+			stream->sample_flags |= SAMPLE_OA_REPORT;
+			stream->sample_size += format_size;
+		}
+
+		if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
+			if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+				DRM_ERROR(
+					  "OA source type can't be sampled without OA report\n");
+				return -EINVAL;
+			}
+			stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
+			stream->sample_size += 4;
+		}
+
+		dev_priv->perf.oa.oa_buffer.format_size = format_size;
+		if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
+			return -EINVAL;
+
+		dev_priv->perf.oa.oa_buffer.format =
+			dev_priv->perf.oa.oa_formats[props->oa_format].format;
+
+		dev_priv->perf.oa.metrics_set = props->metrics_set;
+
+		dev_priv->perf.oa.periodic = props->oa_periodic;
+		if (dev_priv->perf.oa.periodic)
+			dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+
+		if (stream->ctx) {
+			ret = oa_get_render_ctx_id(stream);
+			if (ret)
+				return ret;
+		}
 
-	if (stream->ctx) {
-		ret = oa_get_render_ctx_id(stream);
+		ret = alloc_oa_buffer(dev_priv);
 		if (ret)
-			return ret;
+			goto err_oa_buf_alloc;
+
+		/* PRM - observability performance counters:
+		 *
+		 *   OACONTROL, performance counter enable, note:
+		 *
+		 *   "When this bit is set, in order to have coherent counts,
+		 *   RC6 power state and trunk clock gating must be disabled.
+		 *   This can be achieved by programming MMIO registers as
+		 *   0xA094=0 and 0xA090[31]=1"
+		 *
+		 *   In our case we are expecting that taking pm + FORCEWAKE
+		 *   references will effectively disable RC6.
+		 */
+		intel_runtime_pm_get(dev_priv);
+		intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+		ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
+		if (ret)
+			goto err_enable;
+
 	}
 
-	ret = alloc_oa_buffer(dev_priv);
-	if (ret)
-		goto err_oa_buf_alloc;
+	if (props->sample_flags & SAMPLE_CTX_ID) {
+		stream->sample_flags |= SAMPLE_CTX_ID;
+		stream->sample_size += 4;
+	}
 
-	/* PRM - observability performance counters:
-	 *
-	 *   OACONTROL, performance counter enable, note:
-	 *
-	 *   "When this bit is set, in order to have coherent counts,
-	 *   RC6 power state and trunk clock gating must be disabled.
-	 *   This can be achieved by programming MMIO registers as
-	 *   0xA094=0 and 0xA090[31]=1"
-	 *
-	 *   In our case we are expecting that taking pm + FORCEWAKE
-	 *   references will effectively disable RC6.
-	 */
-	intel_runtime_pm_get(dev_priv);
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+	if (props->cs_mode) {
+		if (!cs_sample_data) {
+			DRM_ERROR(
+				"Stream engine given without requesting any CS data to sample\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
 
-	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
-	if (ret)
-		goto err_enable;
+		if (!(props->sample_flags & SAMPLE_CTX_ID)) {
+			DRM_ERROR(
+				"Stream engine given without requesting any CS specific property\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
+		stream->cs_mode = true;
+		ret = alloc_command_stream_buf(dev_priv);
+		if (ret)
+			goto err_enable;
 
-	stream->ops = &i915_oa_stream_ops;
+		init_request_active(&stream->last_request, NULL);
+	}
 
+	stream->ops = &i915_oa_stream_ops;
 	dev_priv->perf.oa.exclusive_stream = stream;
 
 	return 0;
 
 err_enable:
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-	intel_runtime_pm_put(dev_priv);
-	free_oa_buffer(dev_priv);
+	if (require_oa_unit) {
+		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+		intel_runtime_pm_put(dev_priv);
+		free_oa_buffer(dev_priv);
+	}
 
 err_oa_buf_alloc:
 	if (stream->ctx)
@@ -2265,7 +2850,7 @@  static ssize_t i915_perf_read(struct file *file,
 	 * disabled stream as an error. In particular it might otherwise lead
 	 * to a deadlock for blocking file descriptors...
 	 */
-	if (!stream->enabled)
+	if (stream->state == I915_PERF_STREAM_DISABLED)
 		return -EIO;
 
 	if (!(file->f_flags & O_NONBLOCK)) {
@@ -2312,13 +2897,21 @@  static ssize_t i915_perf_read(struct file *file,
 
 static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
 {
+	struct i915_perf_stream *stream;
 	struct drm_i915_private *dev_priv =
 		container_of(hrtimer, typeof(*dev_priv),
 			     perf.oa.poll_check_timer);
 
-	if (dev_priv->perf.oa.ops.oa_buffer_check(dev_priv)) {
-		dev_priv->perf.oa.pollin = true;
-		wake_up(&dev_priv->perf.oa.poll_wq);
+	/* No need to protect the streams list here, since the hrtimer is
+	 * disabled before the stream is removed from list, and currently a
+	 * single exclusive_stream is supported.
+	 * XXX: revisit this when multiple concurrent streams are supported.
+	 */
+	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+		if (stream_have_data_unlocked(stream)) {
+			dev_priv->perf.oa.pollin = true;
+			wake_up(&dev_priv->perf.oa.poll_wq);
+		}
 	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -2401,14 +2994,16 @@  static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
  */
 static void i915_perf_enable_locked(struct i915_perf_stream *stream)
 {
-	if (stream->enabled)
+	if (stream->state != I915_PERF_STREAM_DISABLED)
 		return;
 
 	/* Allow stream->ops->enable() to refer to this */
-	stream->enabled = true;
+	stream->state = I915_PERF_STREAM_ENABLE_IN_PROGRESS;
 
 	if (stream->ops->enable)
 		stream->ops->enable(stream);
+
+	stream->state = I915_PERF_STREAM_ENABLED;
 }
 
 /**
@@ -2427,11 +3022,11 @@  static void i915_perf_enable_locked(struct i915_perf_stream *stream)
  */
 static void i915_perf_disable_locked(struct i915_perf_stream *stream)
 {
-	if (!stream->enabled)
+	if (stream->state != I915_PERF_STREAM_ENABLED)
 		return;
 
 	/* Allow stream->ops->disable() to refer to this */
-	stream->enabled = false;
+	stream->state = I915_PERF_STREAM_DISABLED;
 
 	if (stream->ops->disable)
 		stream->ops->disable(stream);
@@ -2503,14 +3098,18 @@  static long i915_perf_ioctl(struct file *file,
  */
 static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 {
-	if (stream->enabled)
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+
+	mutex_lock(&dev_priv->perf.streams_lock);
+	list_del(&stream->link);
+	mutex_unlock(&dev_priv->perf.streams_lock);
+
+	if (stream->state == I915_PERF_STREAM_ENABLED)
 		i915_perf_disable_locked(stream);
 
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
 
-	list_del(&stream->link);
-
 	if (stream->ctx)
 		i915_gem_context_put_unlocked(stream->ctx);
 
@@ -2673,7 +3272,9 @@  static int i915_perf_release(struct inode *inode, struct file *file)
 		goto err_alloc;
 	}
 
+	mutex_lock(&dev_priv->perf.streams_lock);
 	list_add(&stream->link, &dev_priv->perf.streams);
+	mutex_unlock(&dev_priv->perf.streams_lock);
 
 	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
 		f_flags |= O_CLOEXEC;
@@ -2692,7 +3293,9 @@  static int i915_perf_release(struct inode *inode, struct file *file)
 	return stream_fd;
 
 err_open:
+	mutex_lock(&dev_priv->perf.streams_lock);
 	list_del(&stream->link);
+	mutex_unlock(&dev_priv->perf.streams_lock);
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
 err_alloc:
@@ -2832,6 +3435,30 @@  static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE:
 			props->sample_flags |= SAMPLE_OA_SOURCE_INFO;
 			break;
+		case DRM_I915_PERF_PROP_ENGINE: {
+				unsigned int user_ring_id =
+					value & I915_EXEC_RING_MASK;
+				enum intel_engine_id engine;
+
+				if (user_ring_id > I915_USER_RINGS)
+					return -EINVAL;
+
+				/* XXX: Currently only RCS is supported.
+				 * Remove this check when support for other
+				 * engines is added
+				 */
+				engine = user_ring_map[user_ring_id];
+				if (engine != RCS)
+					return -EINVAL;
+
+				props->cs_mode = true;
+				props->engine = engine;
+			}
+			break;
+		case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
+			props->sample_flags |= SAMPLE_CTX_ID;
+			break;
+
 		default:
 			MISSING_CASE(id);
 			DRM_DEBUG("Unknown i915 perf property ID\n");
@@ -3140,8 +3767,11 @@  void i915_perf_init(struct drm_i915_private *dev_priv)
 		init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
 
 		INIT_LIST_HEAD(&dev_priv->perf.streams);
+		INIT_LIST_HEAD(&dev_priv->perf.cs_samples);
 		mutex_init(&dev_priv->perf.lock);
+		mutex_init(&dev_priv->perf.streams_lock);
 		spin_lock_init(&dev_priv->perf.hook_lock);
+		spin_lock_init(&dev_priv->perf.sample_lock);
 		spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
 
 		dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index c597e36..4cbefc8 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1314,6 +1314,7 @@  enum drm_i915_oa_format {
 
 enum drm_i915_perf_oa_event_source {
 	I915_PERF_OA_EVENT_SOURCE_PERIODIC,
+	I915_PERF_OA_EVENT_SOURCE_RCS,
 
 	I915_PERF_OA_EVENT_SOURCE_MAX	/* non-ABI */
 };
@@ -1359,6 +1360,19 @@  enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE,
 
+	/**
+	 * The value of this property specifies the GPU engine for which
+	 * the samples need to be collected. Specifying this property also
+	 * implies the command stream based sample collection.
+	 */
+	DRM_I915_PERF_PROP_ENGINE,
+
+	/**
+	 * The value of this property set to 1 requests inclusion of context ID
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_CTX_ID,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1425,6 +1439,7 @@  enum drm_i915_perf_record_type {
 	 *     struct drm_i915_perf_record_header header;
 	 *
 	 *     { u32 source_info; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE
+	 *     { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */