[RFC,2/8] drm/i915: Introduce mode for asynchronous capture of OA counters
diff mbox

Message ID 1434966619-3979-3-git-send-email-sourab.gupta@intel.com
State New
Headers show

Commit Message

sourab.gupta@intel.com June 22, 2015, 9:50 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

The perf event framework supports periodic capture of OA counter snapshots. The
raw OA reports generated by HW are forwarded to userspace using perf apis. This
patch looks to extend the perf pmu introduced earlier to support the capture of
asynchronous OA snapshots (in addition to periodic snapshots). These
asynchronous snapshots will be forwarded by the perf pmu to userspace alongside
periodic snapshots in the same perf ringbuffer.
This patch introduces fields for enabling the asynchronous capture mode of OA
counter snapshots.

There may be usecases wherein we need more than periodic OA capture mode which
is supported by perf_event currently. We may need to insert the commands into
the ring to dump the OA counter snapshots at some asynchronous points during
workload execution.
This mode is primarily used for two usecases:
    - Ability to capture system wide metrics. The reports captured should be
      able to be mapped back to individual contexts.
    - Ability to inject tags for work, into the reports. This provides
      visibility into the multiple stages of work within single context.

The asynchronous reports generated in this way (using MI_REPORT_PERF_COUNT
commands), will be forwarded to userspace after appending a footer, which will
have this metadata information. This will enable the usecases mentioned above.

This may also be seen as a way to overcome a limitation of Haswell, which
doesn't write out a context ID with reports and handling this in the kernel
makes sense when we plan for compatibility with Broadwell which doesn't include
context id in reports.

This patch introduces an additional field in the oa attr structure
for supporting this type of capture. The data thus captured needs to be stored
in a separate buffer, which will be different from the buffer used otherwise for
periodic OA capture mode. Again this buffer address will not need to be mapped to
OA unit register addresses such as OASTATUS1, OASTATUS2 and OABUFFER.

The subsequent patches in the series introduce the data structures and mechanism
for forwarding reports to userspace, and mechanism for inserting corresponding
commands into the ringbuffer.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_oa_perf.c | 122 ++++++++++++++++++++++++++++++------
 include/uapi/drm/i915_drm.h         |   4 +-
 3 files changed, 118 insertions(+), 19 deletions(-)

Comments

Daniel Vetter June 22, 2015, 3:59 p.m. UTC | #1
On Mon, Jun 22, 2015 at 03:20:13PM +0530, sourab.gupta@intel.com wrote:
> From: Sourab Gupta <sourab.gupta@intel.com>
> 
> The perf event framework supports periodic capture of OA counter snapshots. The
> raw OA reports generated by HW are forwarded to userspace using perf apis. This
> patch looks to extend the perf pmu introduced earlier to support the capture of
> asynchronous OA snapshots (in addition to periodic snapshots). These
> asynchronous snapshots will be forwarded by the perf pmu to userspace alongside
> periodic snapshots in the same perf ringbuffer.
> This patch introduces fields for enabling the asynchronous capture mode of OA
> counter snapshots.
> 
> There may be usecases wherein we need more than periodic OA capture mode which
> is supported by perf_event currently. We may need to insert the commands into
> the ring to dump the OA counter snapshots at some asynchronous points during
> workload execution.
> This mode is primarily used for two usecases:
>     - Ability to capture system wide metrics. The reports captured should be
>       able to be mapped back to individual contexts.
>     - Ability to inject tags for work, into the reports. This provides
>       visibility into the multiple stages of work within single context.
> 
> The asynchronous reports generated in this way (using MI_REPORT_PERF_COUNT
> commands), will be forwarded to userspace after appending a footer, which will
> have this metadata information. This will enable the usecases mentioned above.
> 
> This may also be seen as a way to overcome a limitation of Haswell, which
> doesn't write out a context ID with reports and handling this in the kernel
> makes sense when we plan for compatibility with Broadwell which doesn't include
> context id in reports.
> 
> This patch introduces an additional field in the oa attr structure
> for supporting this type of capture. The data thus captured needs to be stored
> in a separate buffer, which will be different from the buffer used otherwise for
> periodic OA capture mode. Again this buffer address will not need to be mapped to
> OA unit register addresses such as OASTATUS1, OASTATUS2 and OABUFFER.
> 
> The subsequent patches in the series introduce the data structures and mechanism
> for forwarding reports to userspace, and mechanism for inserting corresponding
> commands into the ringbuffer.
> 
> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
>  drivers/gpu/drm/i915/i915_oa_perf.c | 122 ++++++++++++++++++++++++++++++------
>  include/uapi/drm/i915_drm.h         |   4 +-
>  3 files changed, 118 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index baa0234..ee4a5d3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1933,6 +1933,7 @@ struct drm_i915_private {
>  		u32 period_exponent;
>  
>  		u32 metrics_set;
> +		bool async_sample_mode;
>  
>  		struct {
>  			struct drm_i915_gem_object *obj;
> @@ -1944,6 +1945,16 @@ struct drm_i915_private {
>  			int format_size;
>  			spinlock_t flush_lock;
>  		} oa_buffer;
> +
> +		/* Fields for asynchronous OA snapshot capture */
> +		struct {
> +			struct drm_i915_gem_object *obj;
> +			u8 *addr;
> +			u32 head;
> +			u32 tail;
> +			int format;
> +			int format_size;
> +		} oa_async_buffer;
>  	} oa_pmu;
>  #endif
>  
> diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> index e7e0b2b..419b6a5 100644
> --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> @@ -166,6 +166,20 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
>  }
>  
>  static void
> +oa_async_buffer_destroy(struct drm_i915_private *i915)
> +{
> +	mutex_lock(&i915->dev->struct_mutex);
> +
> +	vunmap(i915->oa_pmu.oa_async_buffer.addr);
> +	i915_gem_object_ggtt_unpin(i915->oa_pmu.oa_async_buffer.obj);
> +	drm_gem_object_unreference(&i915->oa_pmu.oa_async_buffer.obj->base);
> +
> +	i915->oa_pmu.oa_async_buffer.obj = NULL;
> +	i915->oa_pmu.oa_async_buffer.addr = NULL;

Please don't reuse dev->struct_mutex to protect OA state, but instead
create a new lock. Yes you need dev->struct_mutex here for handling the
gem buffers, but that should only be taken right around the relevant
function calls.

dev->struct_mutex is _really_ complex and all over the place and therefore
one of the largest pieces of technical debt we have in i915. Any piece of
code that extends the coverage of dev->struct_mutex is pretty much
guaranteed to get rejected therefore.
-Daniel

> +	mutex_unlock(&i915->dev->struct_mutex);
> +}
> +
> +static void
>  oa_buffer_destroy(struct drm_i915_private *i915)
>  {
>  	mutex_lock(&i915->dev->struct_mutex);
> @@ -207,6 +221,9 @@ static void i915_oa_event_destroy(struct perf_event *event)
>  	I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
>  				      ~GT_NOA_ENABLE));
>  
> +	if (dev_priv->oa_pmu.async_sample_mode)
> +		oa_async_buffer_destroy(dev_priv);
> +
>  	oa_buffer_destroy(dev_priv);
>  
>  	BUG_ON(dev_priv->oa_pmu.exclusive_event != event);
> @@ -247,21 +264,11 @@ finish:
>  	return addr;
>  }
>  
> -static int init_oa_buffer(struct perf_event *event)
> +static int alloc_oa_obj(struct drm_i915_private *dev_priv,
> +				struct drm_i915_gem_object **obj)
>  {
> -	struct drm_i915_private *dev_priv =
> -		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
>  	struct drm_i915_gem_object *bo;
> -	int ret;
> -
> -	BUG_ON(!IS_HASWELL(dev_priv->dev));
> -	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
> -
> -	ret = i915_mutex_lock_interruptible(dev_priv->dev);
> -	if (ret)
> -		return ret;
> -
> -	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
> +	int ret = 0;
>  
>  	/* NB: We over allocate the OA buffer due to the way raw sample data
>  	 * gets copied from the gpu mapped circular buffer into the perf
> @@ -277,13 +284,13 @@ static int init_oa_buffer(struct perf_event *event)
>  	 * when a report is at the end of the gpu mapped buffer we need to
>  	 * read 4 bytes past the end of the buffer.
>  	 */
> +	intel_runtime_pm_get(dev_priv);
>  	bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE + PAGE_SIZE);
>  	if (bo == NULL) {
>  		DRM_ERROR("Failed to allocate OA buffer\n");
>  		ret = -ENOMEM;
> -		goto unlock;
> +		goto out;
>  	}
> -	dev_priv->oa_pmu.oa_buffer.obj = bo;
>  
>  	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
>  	if (ret)
> @@ -294,6 +301,38 @@ static int init_oa_buffer(struct perf_event *event)
>  	if (ret)
>  		goto err_unref;
>  
> +	*obj = bo;
> +	goto out;
> +
> +err_unref:
> +	drm_gem_object_unreference_unlocked(&bo->base);
> +out:
> +	intel_runtime_pm_put(dev_priv);
> +	return ret;
> +}
> +
> +static int init_oa_buffer(struct perf_event *event)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
> +	struct drm_i915_gem_object *bo;
> +	int ret;
> +
> +	BUG_ON(!IS_HASWELL(dev_priv->dev));
> +	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
> +
> +	ret = i915_mutex_lock_interruptible(dev_priv->dev);
> +	if (ret)
> +		return ret;
> +
> +	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
> +
> +	ret = alloc_oa_obj(dev_priv, &bo);
> +	if (ret)
> +		goto unlock;
> +
> +	dev_priv->oa_pmu.oa_buffer.obj = bo;
> +
>  	dev_priv->oa_pmu.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
>  	dev_priv->oa_pmu.oa_buffer.addr = vmap_oa_buffer(bo);
>  
> @@ -309,10 +348,35 @@ static int init_oa_buffer(struct perf_event *event)
>  			 dev_priv->oa_pmu.oa_buffer.gtt_offset,
>  			 dev_priv->oa_pmu.oa_buffer.addr);
>  
> -	goto unlock;
> +unlock:
> +	mutex_unlock(&dev_priv->dev->struct_mutex);
> +	return ret;
> +}
>  
> -err_unref:
> -	drm_gem_object_unreference(&bo->base);
> +static int init_async_oa_buffer(struct perf_event *event)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
> +	struct drm_i915_gem_object *bo;
> +	int ret;
> +
> +	BUG_ON(!IS_HASWELL(dev_priv->dev));
> +	BUG_ON(dev_priv->oa_pmu.oa_async_buffer.obj);
> +
> +	ret = i915_mutex_lock_interruptible(dev_priv->dev);
> +	if (ret)
> +		return ret;
> +
> +	ret = alloc_oa_obj(dev_priv, &bo);
> +	if (ret)
> +		goto unlock;
> +
> +	dev_priv->oa_pmu.oa_async_buffer.obj = bo;
> +
> +	dev_priv->oa_pmu.oa_async_buffer.addr = vmap_oa_buffer(bo);
> +
> +	DRM_DEBUG_DRIVER("OA Async Buffer initialized, vaddr = %p",
> +			 dev_priv->oa_pmu.oa_async_buffer.addr);
>  
>  unlock:
>  	mutex_unlock(&dev_priv->dev->struct_mutex);
> @@ -444,6 +508,9 @@ static int i915_oa_event_init(struct perf_event *event)
>  
>  	report_format = oa_attr.format;
>  	dev_priv->oa_pmu.oa_buffer.format = report_format;
> +	if (oa_attr.batch_buffer_sample)
> +		dev_priv->oa_pmu.oa_async_buffer.format = report_format;
> +
>  	dev_priv->oa_pmu.metrics_set = oa_attr.metrics_set;
>  
>  	if (IS_HASWELL(dev_priv->dev)) {
> @@ -457,6 +524,9 @@ static int i915_oa_event_init(struct perf_event *event)
>  			return -EINVAL;
>  
>  		dev_priv->oa_pmu.oa_buffer.format_size = snapshot_size;
> +		if (oa_attr.batch_buffer_sample)
> +			dev_priv->oa_pmu.oa_async_buffer.format_size =
> +				snapshot_size;
>  
>  		if (oa_attr.metrics_set > I915_OA_METRICS_SET_MAX)
>  			return -EINVAL;
> @@ -465,6 +535,16 @@ static int i915_oa_event_init(struct perf_event *event)
>  		return -ENODEV;
>  	}
>  
> +	/*
> +	 * In case of per batch buffer sampling, we need to check for
> +	 * CAP_SYS_ADMIN capability as we profile all the running contexts
> +	 */
> +	if (oa_attr.batch_buffer_sample) {
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EACCES;
> +		dev_priv->oa_pmu.async_sample_mode = true;
> +	}
> +
>  	/* Since we are limited to an exponential scale for
>  	 * programming the OA sampling period we don't allow userspace
>  	 * to pass a precise attr.sample_period. */
> @@ -528,6 +608,12 @@ static int i915_oa_event_init(struct perf_event *event)
>  	if (ret)
>  		return ret;
>  
> +	if (oa_attr.batch_buffer_sample) {
> +		ret = init_async_oa_buffer(event);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	BUG_ON(dev_priv->oa_pmu.exclusive_event);
>  	dev_priv->oa_pmu.exclusive_event = event;
>  
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 992e1e9..354dc3a 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -92,7 +92,9 @@ typedef struct _drm_i915_oa_attr {
>  	__u32 ctx_id;
>  
>  	__u64 single_context : 1,
> -	      __reserved_1 : 63;
> +	__reserved_1 : 31;
> +	__u32 batch_buffer_sample:1,
> +	__reserved_2:31;
>  } drm_i915_oa_attr_t;
>  
>  /* Header for PERF_RECORD_DEVICE type events */
> -- 
> 1.8.5.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index baa0234..ee4a5d3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1933,6 +1933,7 @@  struct drm_i915_private {
 		u32 period_exponent;
 
 		u32 metrics_set;
+		bool async_sample_mode;
 
 		struct {
 			struct drm_i915_gem_object *obj;
@@ -1944,6 +1945,16 @@  struct drm_i915_private {
 			int format_size;
 			spinlock_t flush_lock;
 		} oa_buffer;
+
+		/* Fields for asynchronous OA snapshot capture */
+		struct {
+			struct drm_i915_gem_object *obj;
+			u8 *addr;
+			u32 head;
+			u32 tail;
+			int format;
+			int format_size;
+		} oa_async_buffer;
 	} oa_pmu;
 #endif
 
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index e7e0b2b..419b6a5 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -166,6 +166,20 @@  static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
 }
 
 static void
+oa_async_buffer_destroy(struct drm_i915_private *i915)
+{
+	mutex_lock(&i915->dev->struct_mutex);
+
+	vunmap(i915->oa_pmu.oa_async_buffer.addr);
+	i915_gem_object_ggtt_unpin(i915->oa_pmu.oa_async_buffer.obj);
+	drm_gem_object_unreference(&i915->oa_pmu.oa_async_buffer.obj->base);
+
+	i915->oa_pmu.oa_async_buffer.obj = NULL;
+	i915->oa_pmu.oa_async_buffer.addr = NULL;
+	mutex_unlock(&i915->dev->struct_mutex);
+}
+
+static void
 oa_buffer_destroy(struct drm_i915_private *i915)
 {
 	mutex_lock(&i915->dev->struct_mutex);
@@ -207,6 +221,9 @@  static void i915_oa_event_destroy(struct perf_event *event)
 	I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
 				      ~GT_NOA_ENABLE));
 
+	if (dev_priv->oa_pmu.async_sample_mode)
+		oa_async_buffer_destroy(dev_priv);
+
 	oa_buffer_destroy(dev_priv);
 
 	BUG_ON(dev_priv->oa_pmu.exclusive_event != event);
@@ -247,21 +264,11 @@  finish:
 	return addr;
 }
 
-static int init_oa_buffer(struct perf_event *event)
+static int alloc_oa_obj(struct drm_i915_private *dev_priv,
+				struct drm_i915_gem_object **obj)
 {
-	struct drm_i915_private *dev_priv =
-		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
-
-	BUG_ON(!IS_HASWELL(dev_priv->dev));
-	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
-
-	ret = i915_mutex_lock_interruptible(dev_priv->dev);
-	if (ret)
-		return ret;
-
-	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
+	int ret = 0;
 
 	/* NB: We over allocate the OA buffer due to the way raw sample data
 	 * gets copied from the gpu mapped circular buffer into the perf
@@ -277,13 +284,13 @@  static int init_oa_buffer(struct perf_event *event)
 	 * when a report is at the end of the gpu mapped buffer we need to
 	 * read 4 bytes past the end of the buffer.
 	 */
+	intel_runtime_pm_get(dev_priv);
 	bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE + PAGE_SIZE);
 	if (bo == NULL) {
 		DRM_ERROR("Failed to allocate OA buffer\n");
 		ret = -ENOMEM;
-		goto unlock;
+		goto out;
 	}
-	dev_priv->oa_pmu.oa_buffer.obj = bo;
 
 	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
 	if (ret)
@@ -294,6 +301,38 @@  static int init_oa_buffer(struct perf_event *event)
 	if (ret)
 		goto err_unref;
 
+	*obj = bo;
+	goto out;
+
+err_unref:
+	drm_gem_object_unreference_unlocked(&bo->base);
+out:
+	intel_runtime_pm_put(dev_priv);
+	return ret;
+}
+
+static int init_oa_buffer(struct perf_event *event)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	struct drm_i915_gem_object *bo;
+	int ret;
+
+	BUG_ON(!IS_HASWELL(dev_priv->dev));
+	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
+
+	ret = i915_mutex_lock_interruptible(dev_priv->dev);
+	if (ret)
+		return ret;
+
+	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
+
+	ret = alloc_oa_obj(dev_priv, &bo);
+	if (ret)
+		goto unlock;
+
+	dev_priv->oa_pmu.oa_buffer.obj = bo;
+
 	dev_priv->oa_pmu.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
 	dev_priv->oa_pmu.oa_buffer.addr = vmap_oa_buffer(bo);
 
@@ -309,10 +348,35 @@  static int init_oa_buffer(struct perf_event *event)
 			 dev_priv->oa_pmu.oa_buffer.gtt_offset,
 			 dev_priv->oa_pmu.oa_buffer.addr);
 
-	goto unlock;
+unlock:
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+	return ret;
+}
 
-err_unref:
-	drm_gem_object_unreference(&bo->base);
+static int init_async_oa_buffer(struct perf_event *event)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	struct drm_i915_gem_object *bo;
+	int ret;
+
+	BUG_ON(!IS_HASWELL(dev_priv->dev));
+	BUG_ON(dev_priv->oa_pmu.oa_async_buffer.obj);
+
+	ret = i915_mutex_lock_interruptible(dev_priv->dev);
+	if (ret)
+		return ret;
+
+	ret = alloc_oa_obj(dev_priv, &bo);
+	if (ret)
+		goto unlock;
+
+	dev_priv->oa_pmu.oa_async_buffer.obj = bo;
+
+	dev_priv->oa_pmu.oa_async_buffer.addr = vmap_oa_buffer(bo);
+
+	DRM_DEBUG_DRIVER("OA Async Buffer initialized, vaddr = %p",
+			 dev_priv->oa_pmu.oa_async_buffer.addr);
 
 unlock:
 	mutex_unlock(&dev_priv->dev->struct_mutex);
@@ -444,6 +508,9 @@  static int i915_oa_event_init(struct perf_event *event)
 
 	report_format = oa_attr.format;
 	dev_priv->oa_pmu.oa_buffer.format = report_format;
+	if (oa_attr.batch_buffer_sample)
+		dev_priv->oa_pmu.oa_async_buffer.format = report_format;
+
 	dev_priv->oa_pmu.metrics_set = oa_attr.metrics_set;
 
 	if (IS_HASWELL(dev_priv->dev)) {
@@ -457,6 +524,9 @@  static int i915_oa_event_init(struct perf_event *event)
 			return -EINVAL;
 
 		dev_priv->oa_pmu.oa_buffer.format_size = snapshot_size;
+		if (oa_attr.batch_buffer_sample)
+			dev_priv->oa_pmu.oa_async_buffer.format_size =
+				snapshot_size;
 
 		if (oa_attr.metrics_set > I915_OA_METRICS_SET_MAX)
 			return -EINVAL;
@@ -465,6 +535,16 @@  static int i915_oa_event_init(struct perf_event *event)
 		return -ENODEV;
 	}
 
+	/*
+	 * In case of per batch buffer sampling, we need to check for
+	 * CAP_SYS_ADMIN capability as we profile all the running contexts
+	 */
+	if (oa_attr.batch_buffer_sample) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		dev_priv->oa_pmu.async_sample_mode = true;
+	}
+
 	/* Since we are limited to an exponential scale for
 	 * programming the OA sampling period we don't allow userspace
 	 * to pass a precise attr.sample_period. */
@@ -528,6 +608,12 @@  static int i915_oa_event_init(struct perf_event *event)
 	if (ret)
 		return ret;
 
+	if (oa_attr.batch_buffer_sample) {
+		ret = init_async_oa_buffer(event);
+		if (ret)
+			return ret;
+	}
+
 	BUG_ON(dev_priv->oa_pmu.exclusive_event);
 	dev_priv->oa_pmu.exclusive_event = event;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 992e1e9..354dc3a 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -92,7 +92,9 @@  typedef struct _drm_i915_oa_attr {
 	__u32 ctx_id;
 
 	__u64 single_context : 1,
-	      __reserved_1 : 63;
+	__reserved_1 : 31;
+	__u32 batch_buffer_sample:1,
+	__reserved_2:31;
 } drm_i915_oa_attr_t;
 
 /* Header for PERF_RECORD_DEVICE type events */