diff mbox

drm/i915: HSW GT3 Slices: exec flag to warn kernel that userspace is using predication

Message ID 1382389218-3388-1-git-send-email-rodrigo.vivi@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Rodrigo Vivi Oct. 21, 2013, 9 p.m. UTC
If Userspace isn't using MI_PREDICATE all slices must be enabled for
backward compatibility.

If I915_EXEC_USE_PREDICATE isn't set and default is set to half, kernel will force
all slices on.

v2: fix the inverted logic for backwards compatibility
    USE_PREDICATE unset forces gt_full when default is half
    instead of GT_FULL flag.

v3: Accepting Chris's suggestions: better variable names;
    better logic around state_default x legacy_userspace_busy;
    remove unnecessary mutex;

CC: Chris Wilson <chris@chris-wilson.co.uk>
CC: Eric Anholt <eric@anholt.net>
CC: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 69 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h            | 11 +++++
 drivers/gpu/drm/i915/i915_sysfs.c          |  6 ++-
 drivers/gpu/drm/i915/intel_display.c       |  6 +++
 drivers/gpu/drm/i915/intel_drv.h           |  1 +
 drivers/gpu/drm/i915/intel_pm.c            | 45 ++++++++++++++++++-
 include/uapi/drm/i915_drm.h                |  8 +++-
 8 files changed, 149 insertions(+), 5 deletions(-)

Comments

Chris Wilson Oct. 22, 2013, 7:47 a.m. UTC | #1
On Mon, Oct 21, 2013 at 07:00:18PM -0200, Rodrigo Vivi wrote:
>  static int
> +i915_legacy_userspace_busy(struct drm_device *dev,
> +			   struct intel_ring_buffer *ring)

s/i915_legacy_userspace_busy/gt_legacy_userspace_busy/

As that is a bit more distinctive.

> +{
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	if (!HAS_SLICE_SHUTDOWN(dev) || ring == &dev_priv->ring[BCS])
> +		return -ENODEV;
> +
> +	if (dev_priv->gt_slices.state_default == 1)
> +		return -EBADE;

This needs to be replaced.

if (dev_priv->gt_slices.config == 2)
  return 0;

> +
> +	ret = intel_ring_begin(ring, 3);

The dword count must be even, or else the hardware chokes.
However, since we cannot interrupt this sequence and leave the hw in an
inconsistent state, we need to emit this entire sequence in a single
block.

> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, HSW_GT_SLICE_INFO);
> +	intel_ring_emit(ring, SLICE_SEL_BOTH);
> +	intel_ring_advance(ring);
> +
> +	ret = intel_ring_begin(ring, 3);
> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, MI_PREDICATE_RESULT_2);
> +	intel_ring_emit(ring, LOWER_SLICE_ENABLED);
> +	intel_ring_advance(ring);
> +
> +	ret = intel_ring_begin(ring, 3);
> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, HSW_SLICESHUTDOWN);
> +	intel_ring_emit(ring, ~SLICE_SHUTDOWN);
> +	intel_ring_advance(ring);
> +
> +	ret = intel_ring_begin(ring, 3);
> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
> +	intel_ring_emit(ring, CS_IDLE_COUNT_1US);
> +	intel_ring_advance(ring);
> +	ret = intel_ring_begin(ring, 3);
> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, WAIT_FOR_RC6_EXIT);
> +	intel_ring_emit(ring, WAIT_RC6_EXIT | MASK_WAIT_RC6_EXIT);

_MASKED_BIT_ENABLE(WAIT_RC6_EXIT)

> +	intel_ring_advance(ring);
> +
> +	ret = intel_ring_begin(ring, 3);
> +	if (ret)
> +		return ret;
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
> +	intel_ring_emit(ring, CS_IDLE_COUNT_5US);
> +	intel_ring_advance(ring);
> +
    dev_priv->gt_slices.config = 2;
> +	return 0;
> +}
> +
> +static int
>  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		       struct drm_file *file,
>  		       struct drm_i915_gem_execbuffer2 *args,
> @@ -999,6 +1063,11 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		return -EINVAL;
>  	}
>  
> +	if ((args->flags & I915_EXEC_USE_PREDICATE) == 0 &&
> +	    !dev_priv->gt_slices.legacy_userspace_busy)
> +		if (i915_legacy_userspace_busy(dev, ring) == 0)
> +			dev_priv->gt_slices.legacy_userspace_busy = true;

Now here we cannot just ignore a failure to switch slice configuration.

 static bool gt_legacy_userspace(struct intel_ring_buffer *ring,
  		                 struct drm_i915_gem_execbuffer2 *args)
 {
    if (ring->id == BCS)
       return false;

    if (!HAS_SLICE_SHUTDOWN(ring->dev))
       return false;

    return (args->flags & I915_EXEC_USE_PREDICATE) == 0;
 }

 if (gt_legacy_userspace(ring, args)) {
    ret = gt_legacy_userspace_busy(ring);
    if (ret)
       return ret;

    dev_priv->gt_slices.legacy_userspace_busy = true;
 }


> +
>  	mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>  	mask = I915_EXEC_CONSTANTS_MASK;
>  	switch (mode) {
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 497c441..0146bef 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -277,6 +277,17 @@
>  #define   SLICE_STATUS_MAIN_ON	(2<<0)
>  #define   SLICE_STATUS_BOTH_ON	(3<<0)
>  
> +#define HSW_SLICESHUTDOWN	0xA190
> +#define   SLICE_SHUTDOWN	(1<<0)
> +
> +#define RC_IDLE_MAX_COUNT	0x2054
> +#define   CS_IDLE_COUNT_1US	(1<<1)
> +#define   CS_IDLE_COUNT_5US	(1<<3)
> +
> +#define WAIT_FOR_RC6_EXIT	0x20CC
> +#define   WAIT_RC6_EXIT		(1<<0)
> +#define   MASK_WAIT_RC6_EXIT	(1<<16)
> +
>  /*
>   * 3D instructions used by the kernel
>   */
> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> index 86ccd52..1dd57fb 100644
> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> @@ -125,8 +125,7 @@ static ssize_t gt_slice_config_show(struct device *kdev,
>  	struct drm_device *dev = minor->dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> -	return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) ==
> -		       LOWER_SLICE_ENABLED ? "full" : "half");
> +	return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) == LOWER_SLICE_ENABLED ? "full" : "half");
>  }
>  
>  static ssize_t gt_slice_config_store(struct device *kdev,
> @@ -135,16 +134,19 @@ static ssize_t gt_slice_config_store(struct device *kdev,
>  {
>  	struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
>  	struct drm_device *dev = minor->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;

This cannot change state whilst legacy_userspace_busy.
>  
>  	if (!strncmp(buf, "full", sizeof("full") - 1)) {
>  		ret = intel_set_gt_full(dev);
>  		if (ret)
>  			return ret;
> +		dev_priv->gt_slices.state_default = 1;
dev_priv->gt_slices.max_config = 2;
>  	} else if (!strncmp(buf, "half", sizeof("half") - 1)) {
>  		ret = intel_set_gt_half(dev);
>  		if (ret)
>  			return ret;
> +		dev_priv->gt_slices.state_default = 0;
dev_priv->gt_slices.max_config = 1;
>  	} else
>  		return -EINVAL;

(void)intel_gt_update_slice_config(dev);

where

  int intel_gt_update_slice_config(struct drm_device *dev)
  {
     int config;

     if (!HAS_SLICE_SHUTDOWN(dev))
       return -ENODEV;

     if (dev_priv->gt_slices.legacy_userspace_busy)
       return -EBUSY;
       
     config = min(dev_priv->gt_slices.config,
                  dev_priv->gt_slices.max_config);
     if (config == dev_priv->gt_slices.config)
       return 0;

     /* slice_set_config(dev, config); ? */
     switch (config) {
     case 1: gt_slice_set_half(dev); break;
     case 2: gt_slice_set_full(dev); break;
     default: return -EINVAL;
     }

     dev_priv->gt_slices.config = config;
     return 0;
  }

>  	return count;
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 4f1b636..3810ecf 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -7778,6 +7778,12 @@ void intel_mark_idle(struct drm_device *dev)
>  
>  	if (dev_priv->info->gen >= 6)
>  		gen6_rps_idle(dev->dev_private);
> +
> +	if (HAS_SLICE_SHUTDOWN(dev) &&
> +	    dev_priv->gt_slices.legacy_userspace_busy) {
> +		intel_set_gt_half_async(dev);
> +	}

Just if (dev_priv->gt_slices.legacy_userspace_busy) {
         dev_priv->gt_slices.legacy_userspace_busy = false;
         intel_gt_update_slice_config(dev);
     }
Rodrigo Vivi Oct. 22, 2013, 5:33 p.m. UTC | #2
On Tue, Oct 22, 2013 at 5:47 AM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Mon, Oct 21, 2013 at 07:00:18PM -0200, Rodrigo Vivi wrote:
>>  static int
>> +i915_legacy_userspace_busy(struct drm_device *dev,
>> +                        struct intel_ring_buffer *ring)
>
> s/i915_legacy_userspace_busy/gt_legacy_userspace_busy/
>
> As that is a bit more distinctive.

ok, makes sense.

>
>> +{
>> +     drm_i915_private_t *dev_priv = dev->dev_private;
>> +     int ret;
>> +
>> +     if (!HAS_SLICE_SHUTDOWN(dev) || ring == &dev_priv->ring[BCS])
>> +             return -ENODEV;
>> +
>> +     if (dev_priv->gt_slices.state_default == 1)
>> +             return -EBADE;
>
> This needs to be replaced.
>
> if (dev_priv->gt_slices.config == 2)
>   return 0;
>
>> +
>> +     ret = intel_ring_begin(ring, 3);
>
> The dword count must be even, or else the hardware chokes.
> However, since we cannot interrupt this sequence and leave the hw in an
> inconsistent state, we need to emit this entire sequence in a single
> block.

makes sense... changed and worked locally here.

>
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, HSW_GT_SLICE_INFO);
>> +     intel_ring_emit(ring, SLICE_SEL_BOTH);
>> +     intel_ring_advance(ring);
>> +
>> +     ret = intel_ring_begin(ring, 3);
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, MI_PREDICATE_RESULT_2);
>> +     intel_ring_emit(ring, LOWER_SLICE_ENABLED);
>> +     intel_ring_advance(ring);
>> +
>> +     ret = intel_ring_begin(ring, 3);
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, HSW_SLICESHUTDOWN);
>> +     intel_ring_emit(ring, ~SLICE_SHUTDOWN);
>> +     intel_ring_advance(ring);
>> +
>> +     ret = intel_ring_begin(ring, 3);
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
>> +     intel_ring_emit(ring, CS_IDLE_COUNT_1US);
>> +     intel_ring_advance(ring);
>> +     ret = intel_ring_begin(ring, 3);
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, WAIT_FOR_RC6_EXIT);
>> +     intel_ring_emit(ring, WAIT_RC6_EXIT | MASK_WAIT_RC6_EXIT);
>
> _MASKED_BIT_ENABLE(WAIT_RC6_EXIT)

thanks

>
>> +     intel_ring_advance(ring);
>> +
>> +     ret = intel_ring_begin(ring, 3);
>> +     if (ret)
>> +             return ret;
>> +     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +     intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
>> +     intel_ring_emit(ring, CS_IDLE_COUNT_5US);
>> +     intel_ring_advance(ring);
>> +
>     dev_priv->gt_slices.config = 2;
>> +     return 0;
>> +}
>> +
>> +static int
>>  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>                      struct drm_file *file,
>>                      struct drm_i915_gem_execbuffer2 *args,
>> @@ -999,6 +1063,11 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>               return -EINVAL;
>>       }
>>
>> +     if ((args->flags & I915_EXEC_USE_PREDICATE) == 0 &&
>> +         !dev_priv->gt_slices.legacy_userspace_busy)
>> +             if (i915_legacy_userspace_busy(dev, ring) == 0)
>> +                     dev_priv->gt_slices.legacy_userspace_busy = true;
>
> Now here we cannot just ignore a failure to switch slice configuration.
>
>  static bool gt_legacy_userspace(struct intel_ring_buffer *ring,
>                                  struct drm_i915_gem_execbuffer2 *args)
>  {
>     if (ring->id == BCS)
>        return false;
>
>     if (!HAS_SLICE_SHUTDOWN(ring->dev))
>        return false;
>
>     return (args->flags & I915_EXEC_USE_PREDICATE) == 0;
>  }
>
>  if (gt_legacy_userspace(ring, args)) {
>     ret = gt_legacy_userspace_busy(ring);
>     if (ret)
>        return ret;
>
>     dev_priv->gt_slices.legacy_userspace_busy = true;
>  }

I liked all this simplification. Thanks.
>
>
>> +
>>       mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>>       mask = I915_EXEC_CONSTANTS_MASK;
>>       switch (mode) {
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index 497c441..0146bef 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -277,6 +277,17 @@
>>  #define   SLICE_STATUS_MAIN_ON       (2<<0)
>>  #define   SLICE_STATUS_BOTH_ON       (3<<0)
>>
>> +#define HSW_SLICESHUTDOWN    0xA190
>> +#define   SLICE_SHUTDOWN     (1<<0)
>> +
>> +#define RC_IDLE_MAX_COUNT    0x2054
>> +#define   CS_IDLE_COUNT_1US  (1<<1)
>> +#define   CS_IDLE_COUNT_5US  (1<<3)
>> +
>> +#define WAIT_FOR_RC6_EXIT    0x20CC
>> +#define   WAIT_RC6_EXIT              (1<<0)
>> +#define   MASK_WAIT_RC6_EXIT (1<<16)
>> +
>>  /*
>>   * 3D instructions used by the kernel
>>   */
>> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
>> index 86ccd52..1dd57fb 100644
>> --- a/drivers/gpu/drm/i915/i915_sysfs.c
>> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
>> @@ -125,8 +125,7 @@ static ssize_t gt_slice_config_show(struct device *kdev,
>>       struct drm_device *dev = minor->dev;
>>       struct drm_i915_private *dev_priv = dev->dev_private;
>>
>> -     return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) ==
>> -                    LOWER_SLICE_ENABLED ? "full" : "half");
>> +     return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) == LOWER_SLICE_ENABLED ? "full" : "half");
>>  }
>>
>>  static ssize_t gt_slice_config_store(struct device *kdev,
>> @@ -135,16 +134,19 @@ static ssize_t gt_slice_config_store(struct device *kdev,
>>  {
>>       struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
>>       struct drm_device *dev = minor->dev;
>> +     struct drm_i915_private *dev_priv = dev->dev_private;
>>       int ret;
>
> This cannot change state whilst legacy_userspace_busy.

We cannot change to half while in legacy_userspace_busy and this check
was already inside
gt_slices_set_half(dev). So just the state_default was changed so on
next idleness it would come to half anyway.

>>
>>       if (!strncmp(buf, "full", sizeof("full") - 1)) {
>>               ret = intel_set_gt_full(dev);
>>               if (ret)
>>                       return ret;
>> +             dev_priv->gt_slices.state_default = 1;
> dev_priv->gt_slices.max_config = 2;
>>       } else if (!strncmp(buf, "half", sizeof("half") - 1)) {
>>               ret = intel_set_gt_half(dev);
>>               if (ret)
>>                       return ret;
>> +             dev_priv->gt_slices.state_default = 0;
> dev_priv->gt_slices.max_config = 1;
>>       } else
>>               return -EINVAL;
>
> (void)intel_gt_update_slice_config(dev);

This removes the synchrony you had asked for. Without it, a testcase for the sysfs
change is impossible, because we don't know if it is in _busy... so we
cannot simply check the state after requesting a change.

>
> where
>
>   int intel_gt_update_slice_config(struct drm_device *dev)
>   {
>      int config;
>
>      if (!HAS_SLICE_SHUTDOWN(dev))
>        return -ENODEV;
>
>      if (dev_priv->gt_slices.legacy_userspace_busy)
>        return -EBUSY;
>
>      config = min(dev_priv->gt_slices.config,
>                   dev_priv->gt_slices.max_config);
>      if (config == dev_priv->gt_slices.config)

I still think state_default is working, enough and simpler...
or do you really want me to use config and max_config instead?

>        return 0;
>
>      /* slice_set_config(dev, config); ? */
>      switch (config) {
>      case 1: gt_slice_set_half(dev); break;
>      case 2: gt_slice_set_full(dev); break;
>      default: return -EINVAL;
>      }
>
>      dev_priv->gt_slices.config = config;
>      return 0;
>   }
>
>>       return count;
>> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
>> index 4f1b636..3810ecf 100644
>> --- a/drivers/gpu/drm/i915/intel_display.c
>> +++ b/drivers/gpu/drm/i915/intel_display.c
>> @@ -7778,6 +7778,12 @@ void intel_mark_idle(struct drm_device *dev)
>>
>>       if (dev_priv->info->gen >= 6)
>>               gen6_rps_idle(dev->dev_private);
>> +
>> +     if (HAS_SLICE_SHUTDOWN(dev) &&
>> +         dev_priv->gt_slices.legacy_userspace_busy) {
>> +             intel_set_gt_half_async(dev);
>> +     }
>
> Just if (dev_priv->gt_slices.legacy_userspace_busy) {
>          dev_priv->gt_slices.legacy_userspace_busy = false;
>          intel_gt_update_slice_config(dev);
>      }
>
> --
> Chris Wilson, Intel Open Source Technology Centre

Thank you very much.
Chris Wilson Oct. 22, 2013, 5:40 p.m. UTC | #3
On Tue, Oct 22, 2013 at 03:33:16PM -0200, Rodrigo Vivi wrote:
> I still think state_default is working, enough and simpler...
> or do you really want me to use config and max_config instead?

I think keeping the hw config separate from the requested config helps a
lot when reviewing the code in 6 months time. See pc8 for a recent
example where we also need to track hw vs sw state. Similarly in trying
to split the code that implements the policy and that implements the
mechanism.

The devil is in the details, of course, and so this suggestion may
indeed suck when implemented. You are free to refute any suggestion with
a good reason. :)
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 685fb1d..67bbbce 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1219,6 +1219,12 @@  struct i915_package_c8 {
 	} regsave;
 };
 
+struct i915_gt_slices {
+	int state_default;
+	int legacy_userspace_busy;
+	struct mutex lock; /* locks access to this struct and slice registers */
+};
+
 typedef struct drm_i915_private {
 	struct drm_device *dev;
 	struct kmem_cache *slab;
@@ -1418,6 +1424,8 @@  typedef struct drm_i915_private {
 
 	struct i915_package_c8 pc8;
 
+	struct i915_gt_slices gt_slices;
+
 	/* Old dri1 support infrastructure, beware the dragons ya fools entering
 	 * here! */
 	struct i915_dri1_state dri1;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0ce0d47..1d8187a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -923,6 +923,70 @@  i915_reset_gen7_sol_offsets(struct drm_device *dev,
 }
 
 static int
+i915_legacy_userspace_busy(struct drm_device *dev,
+			   struct intel_ring_buffer *ring)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	int ret;
+
+	if (!HAS_SLICE_SHUTDOWN(dev) || ring == &dev_priv->ring[BCS])
+		return -ENODEV;
+
+	if (dev_priv->gt_slices.state_default == 1)
+		return -EBADE;
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, HSW_GT_SLICE_INFO);
+	intel_ring_emit(ring, SLICE_SEL_BOTH);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, MI_PREDICATE_RESULT_2);
+	intel_ring_emit(ring, LOWER_SLICE_ENABLED);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, HSW_SLICESHUTDOWN);
+	intel_ring_emit(ring, ~SLICE_SHUTDOWN);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
+	intel_ring_emit(ring, CS_IDLE_COUNT_1US);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, WAIT_FOR_RC6_EXIT);
+	intel_ring_emit(ring, WAIT_RC6_EXIT | MASK_WAIT_RC6_EXIT);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 3);
+	if (ret)
+		return ret;
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
+	intel_ring_emit(ring, CS_IDLE_COUNT_5US);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
+static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
 		       struct drm_i915_gem_execbuffer2 *args,
@@ -999,6 +1063,11 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	if ((args->flags & I915_EXEC_USE_PREDICATE) == 0 &&
+	    !dev_priv->gt_slices.legacy_userspace_busy)
+		if (i915_legacy_userspace_busy(dev, ring) == 0)
+			dev_priv->gt_slices.legacy_userspace_busy = true;
+
 	mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	mask = I915_EXEC_CONSTANTS_MASK;
 	switch (mode) {
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 497c441..0146bef 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -277,6 +277,17 @@ 
 #define   SLICE_STATUS_MAIN_ON	(2<<0)
 #define   SLICE_STATUS_BOTH_ON	(3<<0)
 
+#define HSW_SLICESHUTDOWN	0xA190
+#define   SLICE_SHUTDOWN	(1<<0)
+
+#define RC_IDLE_MAX_COUNT	0x2054
+#define   CS_IDLE_COUNT_1US	(1<<1)
+#define   CS_IDLE_COUNT_5US	(1<<3)
+
+#define WAIT_FOR_RC6_EXIT	0x20CC
+#define   WAIT_RC6_EXIT		(1<<0)
+#define   MASK_WAIT_RC6_EXIT	(1<<16)
+
 /*
  * 3D instructions used by the kernel
  */
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 86ccd52..1dd57fb 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -125,8 +125,7 @@  static ssize_t gt_slice_config_show(struct device *kdev,
 	struct drm_device *dev = minor->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) ==
-		       LOWER_SLICE_ENABLED ? "full" : "half");
+	return sprintf(buf, "%s\n", I915_READ(MI_PREDICATE_RESULT_2) == LOWER_SLICE_ENABLED ? "full" : "half");
 }
 
 static ssize_t gt_slice_config_store(struct device *kdev,
@@ -135,16 +134,19 @@  static ssize_t gt_slice_config_store(struct device *kdev,
 {
 	struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
 	struct drm_device *dev = minor->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
 	if (!strncmp(buf, "full", sizeof("full") - 1)) {
 		ret = intel_set_gt_full(dev);
 		if (ret)
 			return ret;
+		dev_priv->gt_slices.state_default = 1;
 	} else if (!strncmp(buf, "half", sizeof("half") - 1)) {
 		ret = intel_set_gt_half(dev);
 		if (ret)
 			return ret;
+		dev_priv->gt_slices.state_default = 0;
 	} else
 		return -EINVAL;
 	return count;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 4f1b636..3810ecf 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7778,6 +7778,12 @@  void intel_mark_idle(struct drm_device *dev)
 
 	if (dev_priv->info->gen >= 6)
 		gen6_rps_idle(dev->dev_private);
+
+	if (HAS_SLICE_SHUTDOWN(dev) &&
+	    dev_priv->gt_slices.legacy_userspace_busy) {
+		intel_set_gt_half_async(dev);
+		dev_priv->gt_slices.legacy_userspace_busy = false;
+	}
 }
 
 void intel_mark_fb_busy(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index a9abbb5..98cd63e 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -836,6 +836,7 @@  void intel_disable_gt_powersave(struct drm_device *dev);
 void ironlake_teardown_rc6(struct drm_device *dev);
 int intel_set_gt_full(struct drm_device *dev);
 int intel_set_gt_half(struct drm_device *dev);
+void intel_set_gt_half_async(struct drm_device *dev);
 void intel_init_gt_slices(struct drm_device *dev);
 void gen6_update_ring_freq(struct drm_device *dev);
 void gen6_rps_idle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 63af075..c1bde36 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3873,6 +3873,7 @@  int intel_set_gt_full(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return -ENODEV;
 
+	mutex_lock(&dev_priv->gt_slices.lock);
 	I915_WRITE(HSW_GT_SLICE_INFO, SLICE_SEL_BOTH);
 
 	/* Slices are enabled on RC6 exit */
@@ -3881,13 +3882,18 @@  int intel_set_gt_full(struct drm_device *dev)
 	if (wait_for(((I915_READ(HSW_GT_SLICE_INFO) & SLICE_STATUS_MASK) ==
 		      SLICE_STATUS_BOTH_ON), 2000)) {
 		DRM_ERROR("Timeout enabling full gt slices\n");
+
 		I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
 		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+
 		gen6_gt_force_wake_put(dev_priv);
+		mutex_unlock(&dev_priv->gt_slices.lock);
 		return -ETIMEDOUT;
 	}
+
 	I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_ENABLED);
 	gen6_gt_force_wake_put(dev_priv);
+	mutex_unlock(&dev_priv->gt_slices.lock);
 
 	return 0;
 }
@@ -3899,6 +3905,10 @@  int intel_set_gt_half(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return -ENODEV;
 
+	if (dev_priv->gt_slices.legacy_userspace_busy)
+		return 0;
+
+	mutex_lock(&dev_priv->gt_slices.lock);
 	I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
 
 	/* Slices are disabled on RC6 exit */
@@ -3907,16 +3917,43 @@  int intel_set_gt_half(struct drm_device *dev)
 	if (wait_for(((I915_READ(HSW_GT_SLICE_INFO) & SLICE_STATUS_MASK) ==
 		      SLICE_STATUS_MAIN_ON), 2000)) {
 		DRM_ERROR("Timed out disabling half gt slices\n");
+
 		I915_WRITE(HSW_GT_SLICE_INFO, SLICE_SEL_BOTH);
 		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_ENABLED);
+
 		gen6_gt_force_wake_put(dev_priv);
+		mutex_unlock(&dev_priv->gt_slices.lock);
 		return -ETIMEDOUT;
 	}
+
 	I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
 	gen6_gt_force_wake_put(dev_priv);
+
+	mutex_unlock(&dev_priv->gt_slices.lock);
 	return 0;
 }
 
+/**
+ * On Haswell, slices on/off transitions are done via RC6 sequence.
+ * This async function allows you to request slices shutdown without waiting.
+ * Slices will be disabled on next RC6 exit.
+ */
+void intel_set_gt_half_async(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!HAS_SLICE_SHUTDOWN(dev))
+		return;
+
+	if (dev_priv->gt_slices.state_default == 1)
+		return;
+
+	mutex_lock(&dev_priv->gt_slices.lock);
+	I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
+	I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+	mutex_unlock(&dev_priv->gt_slices.lock);
+}
+
 void intel_init_gt_slices(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3927,9 +3964,13 @@  void intel_init_gt_slices(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return;
 
+	dev_priv->gt_slices.state_default = 1;
+	dev_priv->gt_slices.legacy_userspace_busy = false;
+	mutex_init(&dev_priv->gt_slices.lock);
+
 	if (!i915_gt_slice_config) {
-		I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
-		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+		dev_priv->gt_slices.state_default = 0;
+		intel_set_gt_half_async(dev);
 	}
 }
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 3a4e97b..3fa3e24 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -731,7 +731,13 @@  struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_HANDLE_LUT		(1<<12)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_HANDLE_LUT<<1)
+/* If this flag is set, userspace is using predication and half of the slices
+ * can be left disabled for power saving. Otherwise use all slices, even when
+ * disabled by boot parameter or via the sysfs interface.
+ */
+#define I915_EXEC_USE_PREDICATE		(1<<13)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_USE_PREDICATE<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \