diff mbox

[1/2] lib: Add GPGPU fill

Message ID 1417521568-15820-1-git-send-email-zhenyuw@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhenyu Wang Dec. 2, 2014, 11:59 a.m. UTC
This adds a fill operation using the GPGPU pipeline, similar to the
current media fill. It can be used to simply verify the GPGPU pipeline
and to help enable it on newer HW. Currently it works on Gen7 only;
support for later platforms will be added.

For now this sets up a very simple thread group dispatch: one thread per
thread group, using SIMD16 dispatch. The fill shader therefore just uses
the thread group ID to compute the buffer offset.

Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 lib/gen7_media.h             |   2 +
 lib/intel_batchbuffer.c      |  19 +++++
 lib/intel_batchbuffer.h      |  25 +++++++
 lib/media_fill.h             |   7 ++
 lib/media_fill_gen7.c        | 161 +++++++++++++++++++++++++++++++++++++++++--
 shaders/gpgpu/README         |   4 ++
 shaders/gpgpu/gpgpu_fill.gxa |  51 ++++++++++++++
 7 files changed, 265 insertions(+), 4 deletions(-)
 create mode 100644 shaders/gpgpu/README
 create mode 100644 shaders/gpgpu/gpgpu_fill.gxa

Comments

Daniel Vetter Dec. 2, 2014, 10 a.m. UTC | #1
On Tue, Dec 02, 2014 at 07:59:27PM +0800, Zhenyu Wang wrote:
> This is to add fill operation using GPGPU pipeline which is similar to
> current media fill. This can be used to simply verify GPGPU pipeline
> and help to enable it on newer HW, currently it works on Gen7 only and
> will add support on later platform.
> 
> Now this sets very simply thread group dispatch for one thread per
> thread group on SIMD16 dispatch. So the fill shader just uses thread
> group ID for buffer offset.
> 
> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
> ---
>  lib/gen7_media.h             |   2 +
>  lib/intel_batchbuffer.c      |  19 +++++
>  lib/intel_batchbuffer.h      |  25 +++++++
>  lib/media_fill.h             |   7 ++
>  lib/media_fill_gen7.c        | 161 +++++++++++++++++++++++++++++++++++++++++--
>  shaders/gpgpu/README         |   4 ++
>  shaders/gpgpu/gpgpu_fill.gxa |  51 ++++++++++++++
>  7 files changed, 265 insertions(+), 4 deletions(-)
>  create mode 100644 shaders/gpgpu/README
>  create mode 100644 shaders/gpgpu/gpgpu_fill.gxa
> 
> diff --git a/lib/gen7_media.h b/lib/gen7_media.h
> index d5f9921..91294d2 100644
> --- a/lib/gen7_media.h
> +++ b/lib/gen7_media.h
> @@ -179,6 +179,7 @@
>  #define GEN7_PIPELINE_SELECT			GFXPIPE(1, 1, 4)
>  # define PIPELINE_SELECT_3D			(0 << 0)
>  # define PIPELINE_SELECT_MEDIA			(1 << 0)
> +# define PIPELINE_SELECT_GPGPU			(2 << 0)
>  
>  #define GEN7_STATE_BASE_ADDRESS			GFXPIPE(0, 1, 1)
>  # define BASE_ADDRESS_MODIFY			(1 << 0)
> @@ -187,6 +188,7 @@
>  #define GEN7_MEDIA_CURBE_LOAD			GFXPIPE(2, 0, 1)
>  #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD	GFXPIPE(2, 0, 2)
>  #define GEN7_MEDIA_OBJECT			GFXPIPE(2, 1, 0)
> +#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
>  
>  struct gen7_interface_descriptor_data
>  {
> diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
> index 30ef2cf..18b0ef3 100644
> --- a/lib/intel_batchbuffer.c
> +++ b/lib/intel_batchbuffer.c
> @@ -511,3 +511,22 @@ igt_media_fillfunc_t igt_get_media_fillfunc(int devid)
>  
>  	return fill;
>  }
> +
> +/**
> + * igt_get_gpgpu_fillfunc:
> + * @devid: pci device id
> + *
> + * Returns:
> + *
> + * The platform-specific media fill function pointer for the device specified
> + * with @devid. Will return NULL when no media fill function is implemented.
> + */
> +igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
> +{
> +	igt_gpgpu_fillfunc_t fill = NULL;
> +
> +	if (IS_GEN7(devid))
> +		fill = gen7_gpgpu_fillfunc;
> +
> +	return fill;
> +}
> diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
> index 0ec6601..b5d697f 100644
> --- a/lib/intel_batchbuffer.h
> +++ b/lib/intel_batchbuffer.h
> @@ -264,4 +264,29 @@ typedef void (*igt_media_fillfunc_t)(struct intel_batchbuffer *batch,
>  
>  igt_media_fillfunc_t igt_get_media_fillfunc(int devid);
>  
> +/**
> + * igt_gpgpu_fillfunc_t:
> + * @batch: batchbuffer object
> + * @dst: destination i-g-t buffer object
> + * @x: destination pixel x-coordination
> + * @y: destination pixel y-coordination
> + * @width: width of the filled rectangle
> + * @height: height of the filled rectangle
> + * @color: fill color to use
> + *
> + * This is the type of the per-platform media fill functions. The
> + * platform-specific implementation can be obtained by calling
> + * igt_get_gpgpu_fillfunc().
> + *
> + * A media fill function will emit a batchbuffer to the kernel which executes
> + * the specified blit fill operation using the media engine.
> + */
> +typedef void (*igt_gpgpu_fillfunc_t)(struct intel_batchbuffer *batch,
> +				     struct igt_buf *dst,
> +				     unsigned x, unsigned y,
> +				     unsigned width, unsigned height,
> +				     uint8_t color);
> +
> +igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid);

Please don't create a new typedef for this but reuse the media_fillfunc
typedef. If you want you could rename that to igt_fillfunc_t (plus adjust
the api doc to no longer mention media). Having the same interfaces allows
you to do fancy tests where we switch between different implementations
(e.g. for different subtests).

Otherwise lgtm.
-Daniel

> +
>  #endif
> diff --git a/lib/media_fill.h b/lib/media_fill.h
> index 226489c..2a30055 100644
> --- a/lib/media_fill.h
> +++ b/lib/media_fill.h
> @@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
>                  unsigned width, unsigned height,
>                  uint8_t color);
>  
> +void
> +gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
> +		    struct igt_buf *dst,
> +		    unsigned x, unsigned y,
> +		    unsigned width, unsigned height,
> +		    uint8_t color);
> +
>  #endif /* RENDE_MEDIA_FILL_H */
> diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
> index 5a23b7d..7113fda 100644
> --- a/lib/media_fill_gen7.c
> +++ b/lib/media_fill_gen7.c
> @@ -8,7 +8,6 @@
>  
>  #include <assert.h>
>  
> -
>  static const uint32_t media_kernel[][4] = {
>  	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
>  	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
> @@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
>  	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
>  };
>  
> +/* shaders/gpgpu/gpgpu_fill.gxa */
> +static const uint32_t gpgpu_kernel[][4] = {
> +	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
> +	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
> +	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
> +	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
> +	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
> +	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
> +	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
> +	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
> +	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
> +	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
> +	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
> +	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
> +	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
> +};
> +
>  static uint32_t
>  batch_used(struct intel_batchbuffer *batch)
>  {
> @@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
>  }
>  
>  static uint32_t
> -gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
> +gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
> +			       const uint32_t kernel[][4], size_t size)
>  {
>  	struct gen7_interface_descriptor_data *idd;
>  	uint32_t offset;
>  	uint32_t binding_table_offset, kernel_offset;
>  
>  	binding_table_offset = gen7_fill_binding_table(batch, dst);
> -	kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
> +	kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
>  
>  	idd = batch_alloc(batch, sizeof(*idd), 64);
>  	offset = batch_offset(batch, idd);
> @@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
>  	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
>  
>  	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
> -	interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
> +	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
> +							      media_kernel,
> +							      sizeof(media_kernel));
>  	igt_assert(batch->ptr < &batch->buffer[4095]);
>  
>  	/* media pipeline */
> @@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
>  	gen7_render_flush(batch, batch_end);
>  	intel_batchbuffer_reset(batch);
>  }
> +
> +static void
> +gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
> +{
> +	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
> +
> +	/* scratch buffer */
> +	OUT_BATCH(0);
> +
> +	/* number of threads & urb entries */
> +	OUT_BATCH(1 << 16 | /* max num of threads */
> +		  0 << 8 | /* num of URB entry */
> +		  1 << 2); /* GPGPU mode */
> +
> +	OUT_BATCH(0);
> +
> +	/* urb entry size & curbe size */
> +	OUT_BATCH(0 << 16 | 	/* URB entry size in 256 bits unit */
> +		  1);		/* CURBE entry size in 256 bits unit */
> +
> +	/* scoreboard */
> +	OUT_BATCH(0);
> +	OUT_BATCH(0);
> +	OUT_BATCH(0);
> +}
> +
> +static void
> +gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
> +		     unsigned x, unsigned y,
> +		     unsigned width, unsigned height)
> +{
> +	uint32_t x_dim, y_dim, tmp, right_mask;
> +
> +	/*
> +	 * Simply do SIMD16 based dispatch, so every thread uses
> +	 * SIMD16 channels.
> +	 *
> +	 * Define our own thread group size, e.g 16x1 for every group, then
> +	 * will have 1 thread each group in SIMD16 dispatch. So thread
> +	 * width/height/depth are all 1.
> +	 *
> +	 * Then thread group X = width / 16 (aligned to 16)
> +	 * thread group Y = height;
> +	 */
> +	x_dim = (width + 15) / 16;
> +	y_dim = height;
> +
> +	tmp = width & 15;
> +	if (tmp == 0)
> +		right_mask = (1 << 16) - 1;
> +	else
> +		right_mask = (1 << tmp) - 1;
> +
> +	OUT_BATCH(GEN7_GPGPU_WALKER | 9);
> +
> +	/* interface descriptor offset */
> +	OUT_BATCH(0);
> +
> +	/* SIMD size, thread w/h/d */
> +	OUT_BATCH(1 << 30 | /* SIMD16 */
> +		  0 << 16 | /* depth:1 */
> +		  0 << 8 | /* height:1 */
> +		  0); /* width:1 */
> +
> +	/* thread group X */
> +	OUT_BATCH(0);
> +	OUT_BATCH(x_dim);
> +
> +	/* thread group Y */
> +	OUT_BATCH(0);
> +	OUT_BATCH(y_dim);
> +
> +	/* thread group Z */
> +	OUT_BATCH(0);
> +	OUT_BATCH(1);
> +
> +	/* right mask */
> +	OUT_BATCH(right_mask);
> +
> +	/* bottom mask, height 1, always 0xffffffff */
> +	OUT_BATCH(0xffffffff);
> +}
> +
> +void
> +gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
> +		    struct igt_buf *dst,
> +		    unsigned x, unsigned y,
> +		    unsigned width, unsigned height,
> +		    uint8_t color)
> +{
> +	uint32_t curbe_buffer, interface_descriptor;
> +	uint32_t batch_end;
> +
> +	intel_batchbuffer_flush(batch);
> +
> +	/* setup states */
> +	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
> +
> +	/*
> +	 * const buffer needs to fill for every thread, but as we have just 1 thread
> +	 * per every group, so need only one curbe data.
> +	 *
> +	 * For each thread, just use thread group ID for buffer offset.
> +	 */
> +	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
> +
> +	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
> +							      gpgpu_kernel,
> +							      sizeof(gpgpu_kernel));
> +	igt_assert(batch->ptr < &batch->buffer[4095]);
> +
> +	batch->ptr = batch->buffer;
> +
> +	/* GPGPU pipeline */
> +	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
> +
> +	gen7_emit_state_base_address(batch);
> +
> +	gen7_emit_vfe_state_gpgpu(batch);
> +
> +	gen7_emit_curbe_load(batch, curbe_buffer);
> +
> +	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
> +
> +	gen7_emit_gpgpu_walk(batch, x, y, width, height);
> +
> +	OUT_BATCH(MI_BATCH_BUFFER_END);
> +
> +	batch_end = batch_align(batch, 8);
> +	igt_assert(batch_end < BATCH_STATE_SPLIT);
> +
> +	gen7_render_flush(batch, batch_end);
> +	intel_batchbuffer_reset(batch);
> +}
> diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
> new file mode 100644
> index 0000000..3bf328a
> --- /dev/null
> +++ b/shaders/gpgpu/README
> @@ -0,0 +1,4 @@
> +
> +Commands used to generate the shader on gen7
> +$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
> +$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
> diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
> new file mode 100644
> index 0000000..fc309f3
> --- /dev/null
> +++ b/shaders/gpgpu/gpgpu_fill.gxa
> @@ -0,0 +1,51 @@
> +/*
> + * Registers
> + * g0 -- header
> + * g1 -- constant
> + * g2 -- calculate X/Y offset
> + * g4-g12 payload for write message
> + */
> +define(`ORIG',          `g2.0<2,2,1>UD')
> +define(`ORIG_X',        `g2.0<1>UD')
> +define(`ORIG_Y',        `g2.4<1>UD')
> +define(`COLOR',         `g1.0')
> +define(`COLORUB',       `COLOR<0,1,0>UB')
> +define(`COLORUD',       `COLOR<0,1,0>UD')
> +define(`X',             `g0.4<0,1,0>UD')
> +define(`Y',             `g0.24<0,1,0>UD')
> +
> +mov(4)  COLOR<1>UB      COLORUB         {align1};
> +
> +/* WRITE */
> +/* count thread group ID for X/Y offset */
> +mul(1)  ORIG_X          X        0x10UD {align1};
> +mov(1)  ORIG_Y          Y               {align1};
> +mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
> +mov(2)  g4.0<1>UD       ORIG            {align1};
> +/* Normal mode: for block height 1 row and block width 16 bytes */
> +mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
> +
> +mov(16) g5.0<1>UD       COLORUD         {align1 compr};
> +mov(16) g7.0<1>UD       COLORUD         {align1 compr};
> +mov(16) g9.0<1>UD       COLORUD         {align1 compr};
> +mov(16) g11.0<1>UD      COLORUD         {align1 compr};
> +
> +/*
> + * comment out the following instruction on Gen7
> + * write(0, 0, 10, 12)
> + *   10: media_block_write
> + *   12: data cache data port 1
> + */
> +send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
> +
> +/*
> + * uncomment the following instruction on Gen7
> + * write(0, 0, 10, 0)
> + *   10: media_block_write
> + *    0: reander cache data port
> + */
> +/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
> +
> +/* EOT */
> +mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
> +send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
> -- 
> 2.1.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/lib/gen7_media.h b/lib/gen7_media.h
index d5f9921..91294d2 100644
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@ 
 #define GEN7_PIPELINE_SELECT			GFXPIPE(1, 1, 4)
 # define PIPELINE_SELECT_3D			(0 << 0)
 # define PIPELINE_SELECT_MEDIA			(1 << 0)
+# define PIPELINE_SELECT_GPGPU			(2 << 0)
 
 #define GEN7_STATE_BASE_ADDRESS			GFXPIPE(0, 1, 1)
 # define BASE_ADDRESS_MODIFY			(1 << 0)
@@ -187,6 +188,7 @@ 
 #define GEN7_MEDIA_CURBE_LOAD			GFXPIPE(2, 0, 1)
 #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD	GFXPIPE(2, 0, 2)
 #define GEN7_MEDIA_OBJECT			GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
 
 struct gen7_interface_descriptor_data
 {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 30ef2cf..18b0ef3 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@  igt_media_fillfunc_t igt_get_media_fillfunc(int devid)
 
 	return fill;
 }
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific GPGPU fill function pointer for the device specified
+ * with @devid. Will return NULL when no GPGPU fill function is implemented.
+ */
+igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+	igt_gpgpu_fillfunc_t fill = NULL;
+
+	if (IS_GEN7(devid))
+		fill = gen7_gpgpu_fillfunc;
+
+	return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index 0ec6601..b5d697f 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -264,4 +264,29 @@  typedef void (*igt_media_fillfunc_t)(struct intel_batchbuffer *batch,
 
 igt_media_fillfunc_t igt_get_media_fillfunc(int devid);
 
+/**
+ * igt_gpgpu_fillfunc_t:
+ * @batch: batchbuffer object
+ * @dst: destination i-g-t buffer object
+ * @x: destination pixel x-coordinate
+ * @y: destination pixel y-coordinate
+ * @width: width of the filled rectangle
+ * @height: height of the filled rectangle
+ * @color: fill color to use
+ *
+ * This is the type of the per-platform GPGPU fill functions. The
+ * platform-specific implementation can be obtained by calling
+ * igt_get_gpgpu_fillfunc().
+ *
+ * A GPGPU fill function will emit a batchbuffer to the kernel which executes
+ * the specified fill operation using the GPGPU pipeline.
+ */
+typedef void (*igt_gpgpu_fillfunc_t)(struct intel_batchbuffer *batch,
+				     struct igt_buf *dst,
+				     unsigned x, unsigned y,
+				     unsigned width, unsigned height,
+				     uint8_t color);
+
+igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
+
 #endif
diff --git a/lib/media_fill.h b/lib/media_fill.h
index 226489c..2a30055 100644
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@  gen9_media_fillfunc(struct intel_batchbuffer *batch,
                 unsigned width, unsigned height,
                 uint8_t color);
 
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color);
+
 #endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
index 5a23b7d..7113fda 100644
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@ 
 
 #include <assert.h>
 
-
 static const uint32_t media_kernel[][4] = {
 	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
 	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@  static const uint32_t media_kernel[][4] = {
 	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
 };
 
+/* shaders/gpgpu/gpgpu_fill.gxa */
+static const uint32_t gpgpu_kernel[][4] = {
+	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
 static uint32_t
 batch_used(struct intel_batchbuffer *batch)
 {
@@ -160,14 +176,15 @@  gen7_fill_media_kernel(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+			       const uint32_t kernel[][4], size_t size)
 {
 	struct gen7_interface_descriptor_data *idd;
 	uint32_t offset;
 	uint32_t binding_table_offset, kernel_offset;
 
 	binding_table_offset = gen7_fill_binding_table(batch, dst);
-	kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+	kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
 
 	idd = batch_alloc(batch, sizeof(*idd), 64);
 	offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@  gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
 
 	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
-	interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      media_kernel,
+							      sizeof(media_kernel));
 	igt_assert(batch->ptr < &batch->buffer[4095]);
 
 	/* media pipeline */
@@ -353,3 +372,137 @@  gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	gen7_render_flush(batch, batch_end);
 	intel_batchbuffer_reset(batch);
 }
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+	/* scratch buffer */
+	OUT_BATCH(0);
+
+	/* number of threads & urb entries */
+	OUT_BATCH(1 << 16 | /* max num of threads */
+		  0 << 8 | /* num of URB entry */
+		  1 << 2); /* GPGPU mode */
+
+	OUT_BATCH(0);
+
+	/* urb entry size & curbe size */
+	OUT_BATCH(0 << 16 | 	/* URB entry size in 256 bits unit */
+		  1);		/* CURBE entry size in 256 bits unit */
+
+	/* scoreboard */
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+		     unsigned x, unsigned y,
+		     unsigned width, unsigned height)
+{
+	uint32_t x_dim, y_dim, tmp, right_mask;
+
+	/*
+	 * Simply do SIMD16 based dispatch, so every thread uses
+	 * SIMD16 channels.
+	 *
+	 * Define our own thread group size, e.g. 16x1 for every group, so
+	 * each group has 1 thread in SIMD16 dispatch. Thread
+	 * width/height/depth are therefore all 1.
+	 *
+	 * Then thread group X = width / 16, rounded up,
+	 * and thread group Y = height.
+	 */
+	x_dim = (width + 15) / 16;
+	y_dim = height;
+
+	tmp = width & 15;
+	if (tmp == 0)
+		right_mask = (1 << 16) - 1;
+	else
+		right_mask = (1 << tmp) - 1;
+
+	OUT_BATCH(GEN7_GPGPU_WALKER | 9);
+
+	/* interface descriptor offset */
+	OUT_BATCH(0);
+
+	/* SIMD size, thread w/h/d */
+	OUT_BATCH(1 << 30 | /* SIMD16 */
+		  0 << 16 | /* depth:1 */
+		  0 << 8 | /* height:1 */
+		  0); /* width:1 */
+
+	/* thread group X */
+	OUT_BATCH(0);
+	OUT_BATCH(x_dim);
+
+	/* thread group Y */
+	OUT_BATCH(0);
+	OUT_BATCH(y_dim);
+
+	/* thread group Z */
+	OUT_BATCH(0);
+	OUT_BATCH(1);
+
+	/* right mask */
+	OUT_BATCH(right_mask);
+
+	/* bottom mask, height 1, always 0xffffffff */
+	OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color)
+{
+	uint32_t curbe_buffer, interface_descriptor;
+	uint32_t batch_end;
+
+	intel_batchbuffer_flush(batch);
+
+	/* setup states */
+	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+	/*
+	 * The const buffer needs to be filled for every thread, but as we have
+	 * just 1 thread per group, only one set of curbe data is needed.
+	 *
+	 * Each thread just uses its thread group ID for the buffer offset.
+	 */
+	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      gpgpu_kernel,
+							      sizeof(gpgpu_kernel));
+	igt_assert(batch->ptr < &batch->buffer[4095]);
+
+	batch->ptr = batch->buffer;
+
+	/* GPGPU pipeline */
+	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+	gen7_emit_state_base_address(batch);
+
+	gen7_emit_vfe_state_gpgpu(batch);
+
+	gen7_emit_curbe_load(batch, curbe_buffer);
+
+	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+	gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+
+	batch_end = batch_align(batch, 8);
+	igt_assert(batch_end < BATCH_STATE_SPLIT);
+
+	gen7_render_flush(batch, batch_end);
+	intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
new file mode 100644
index 0000000..3bf328a
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@ 
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
new file mode 100644
index 0000000..fc309f3
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@ 
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG',          `g2.0<2,2,1>UD')
+define(`ORIG_X',        `g2.0<1>UD')
+define(`ORIG_Y',        `g2.4<1>UD')
+define(`COLOR',         `g1.0')
+define(`COLORUB',       `COLOR<0,1,0>UB')
+define(`COLORUD',       `COLOR<0,1,0>UD')
+define(`X',             `g0.4<0,1,0>UD')
+define(`Y',             `g0.24<0,1,0>UD')
+
+mov(4)  COLOR<1>UB      COLORUB         {align1};
+
+/* WRITE */
+/* compute X/Y offset from the thread group ID */
+mul(1)  ORIG_X          X        0x10UD {align1};
+mov(1)  ORIG_Y          Y               {align1};
+mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
+mov(2)  g4.0<1>UD       ORIG            {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
+
+mov(16) g5.0<1>UD       COLORUD         {align1 compr};
+mov(16) g7.0<1>UD       COLORUD         {align1 compr};
+mov(16) g9.0<1>UD       COLORUD         {align1 compr};
+mov(16) g11.0<1>UD      COLORUD         {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ *   10: media_block_write
+ *   12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ *   10: media_block_write
+ *    0: render cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};