diff mbox

[2/2] drm/i915: Minimize the huge amount of unecessary fbc sw cache clean.

Message ID 1409950641-10625-2-git-send-email-rodrigo.vivi@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Rodrigo Vivi Sept. 5, 2014, 8:57 p.m. UTC
The sw cache clean on gen8 is a temporary workaround because we cannot
set cache clean on blt ring with risk of hangs. So we are doing the cache clean in sw.
However we are doing much more than needed. Not only when using the blt ring.
So, with this extra w/a we minimize the amount of cache cleans and call it only
in the same cases that it was being called on gen7.

fbc.need_sw_cache_clean works in the opposite information direction
of ring->fbc_dirty telling software on frontbuffer tracking to perform
the cache clean on sw side.

Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 8 ++++++++
 drivers/gpu/drm/i915/intel_display.c    | 4 +++-
 drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++++--
 3 files changed, 18 insertions(+), 3 deletions(-)

Comments

Rodrigo Vivi Sept. 5, 2014, 10:14 p.m. UTC | #1
Hey Daniel,

funny story: Remember that with idle_frames=1 on BDW it was working but it
was failing on HSW?
So, with these 2 patches applied now BDW PSR fails like HSW!!!

Ville, any thoughts on this?


On Fri, Sep 5, 2014 at 1:57 PM, Rodrigo Vivi <rodrigo.vivi@intel.com> wrote:

> The sw cache clean on gen8 is a tempoorary workaround because we cannot
> set cache clean on blt ring with risk of hungs. So we are doing the cache
> clean on sw.
> However we are doing much more than needed. Not only when using blt ring.
> So, with this extra w/a we minimize the ammount of cache cleans and call
> it only
> on same cases that it was being called on gen7.
>
> fbc.need_sw_cache_clean works in the opposite information direction
> of ring->fbc_dirty telling software on frontbuffer tracking to perform
> the cache clean on sw side.
>
> Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         | 8 ++++++++
>  drivers/gpu/drm/i915/intel_display.c    | 4 +++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++++--
>  3 files changed, 18 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h
> index 5706b8a..5acda40 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -657,6 +657,14 @@ struct i915_fbc {
>
>         bool false_color;
>
> +       /* On gen8 some rings cannont perform fbc clean operation so for
> now
> +        * we are doing this on SW with mmio.
> +        * This variable works in the opposite information direction
> +        * of ring->fbc_dirty telling software on frontbuffer tracking
> +        * to perform the cache clean on sw side.
> +        */
> +       bool need_sw_cache_clean;
> +
>         struct intel_fbc_work {
>                 struct delayed_work work;
>                 struct drm_crtc *crtc;
> diff --git a/drivers/gpu/drm/i915/intel_display.c
> b/drivers/gpu/drm/i915/intel_display.c
> index 965eb3c..731d925 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9137,8 +9137,10 @@ void intel_frontbuffer_flush(struct drm_device *dev,
>          * needs to be reworked into a proper frontbuffer tracking scheme
> like
>          * psr employs.
>          */
> -       if (IS_BROADWELL(dev))
> +       if (IS_BROADWELL(dev) && dev_priv->fbc.need_sw_cache_clean) {
> +               dev_priv->fbc.need_sw_cache_clean = false;
>                 gen8_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
> +       }
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 85fc2b1..02b43cd 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2227,6 +2227,7 @@ static int gen6_ring_flush(struct intel_engine_cs
> *ring,
>                            u32 invalidate, u32 flush)
>  {
>         struct drm_device *dev = ring->dev;
> +       struct drm_i915_private *dev_priv = dev->dev_private;
>         uint32_t cmd;
>         int ret;
>
> @@ -2257,8 +2258,12 @@ static int gen6_ring_flush(struct intel_engine_cs
> *ring,
>         }
>         intel_ring_advance(ring);
>
> -       if (IS_GEN7(dev) && !invalidate && flush)
> -               return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
> +       if (!invalidate && flush) {
> +               if (IS_GEN7(dev))
> +                       return gen7_ring_fbc_flush(ring,
> FBC_REND_CACHE_CLEAN);
> +               else if (IS_GEN8(dev))
> +                       dev_priv->fbc.need_sw_cache_clean = true;
> +       }
>
>         return 0;
>  }
> --
> 1.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
Rodrigo Vivi Sept. 17, 2014, 8:36 p.m. UTC | #2
Please just ignore my latest comment here...
I confirm this works properly with the PSR with my latest fixes...

On Fri, Sep 5, 2014 at 3:14 PM, Rodrigo Vivi <rodrigo.vivi@gmail.com> wrote:

> Hey Daniel,
>
> funny story: Remember that with idle_frames=1 on BDW it was working but it
> was faililng on HSW?
> So, with these 2 patches applied now BDW PSR fails like HSW!!!
>
> Ville, any thoughts on this?
>
>
> On Fri, Sep 5, 2014 at 1:57 PM, Rodrigo Vivi <rodrigo.vivi@intel.com>
> wrote:
>
>> The sw cache clean on gen8 is a tempoorary workaround because we cannot
>> set cache clean on blt ring with risk of hungs. So we are doing the cache
>> clean on sw.
>> However we are doing much more than needed. Not only when using blt ring.
>> So, with this extra w/a we minimize the ammount of cache cleans and call
>> it only
>> on same cases that it was being called on gen7.
>>
>> fbc.need_sw_cache_clean works in the opposite information direction
>> of ring->fbc_dirty telling software on frontbuffer tracking to perform
>> the cache clean on sw side.
>>
>> Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         | 8 ++++++++
>>  drivers/gpu/drm/i915/intel_display.c    | 4 +++-
>>  drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++++--
>>  3 files changed, 18 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 5706b8a..5acda40 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -657,6 +657,14 @@ struct i915_fbc {
>>
>>         bool false_color;
>>
>> +       /* On gen8 some rings cannont perform fbc clean operation so for
>> now
>> +        * we are doing this on SW with mmio.
>> +        * This variable works in the opposite information direction
>> +        * of ring->fbc_dirty telling software on frontbuffer tracking
>> +        * to perform the cache clean on sw side.
>> +        */
>> +       bool need_sw_cache_clean;
>> +
>>         struct intel_fbc_work {
>>                 struct delayed_work work;
>>                 struct drm_crtc *crtc;
>> diff --git a/drivers/gpu/drm/i915/intel_display.c
>> b/drivers/gpu/drm/i915/intel_display.c
>> index 965eb3c..731d925 100644
>> --- a/drivers/gpu/drm/i915/intel_display.c
>> +++ b/drivers/gpu/drm/i915/intel_display.c
>> @@ -9137,8 +9137,10 @@ void intel_frontbuffer_flush(struct drm_device
>> *dev,
>>          * needs to be reworked into a proper frontbuffer tracking scheme
>> like
>>          * psr employs.
>>          */
>> -       if (IS_BROADWELL(dev))
>> +       if (IS_BROADWELL(dev) && dev_priv->fbc.need_sw_cache_clean) {
>> +               dev_priv->fbc.need_sw_cache_clean = false;
>>                 gen8_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
>> +       }
>>  }
>>
>>  /**
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 85fc2b1..02b43cd 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -2227,6 +2227,7 @@ static int gen6_ring_flush(struct intel_engine_cs
>> *ring,
>>                            u32 invalidate, u32 flush)
>>  {
>>         struct drm_device *dev = ring->dev;
>> +       struct drm_i915_private *dev_priv = dev->dev_private;
>>         uint32_t cmd;
>>         int ret;
>>
>> @@ -2257,8 +2258,12 @@ static int gen6_ring_flush(struct intel_engine_cs
>> *ring,
>>         }
>>         intel_ring_advance(ring);
>>
>> -       if (IS_GEN7(dev) && !invalidate && flush)
>> -               return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
>> +       if (!invalidate && flush) {
>> +               if (IS_GEN7(dev))
>> +                       return gen7_ring_fbc_flush(ring,
>> FBC_REND_CACHE_CLEAN);
>> +               else if (IS_GEN8(dev))
>> +                       dev_priv->fbc.need_sw_cache_clean = true;
>> +       }
>>
>>         return 0;
>>  }
>> --
>> 1.9.3
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>>
>
>
>
> --
> Rodrigo Vivi
> Blog: http://blog.vivi.eng.br
>
>
Paulo Zanoni Sept. 17, 2014, 8:49 p.m. UTC | #3
2014-09-05 17:57 GMT-03:00 Rodrigo Vivi <rodrigo.vivi@intel.com>:
> The sw cache clean on gen8 is a tempoorary workaround because we cannot
> set cache clean on blt ring with risk of hungs. So we are doing the cache clean on sw.
> However we are doing much more than needed. Not only when using blt ring.
> So, with this extra w/a we minimize the ammount of cache cleans and call it only
> on same cases that it was being called on gen7.
>
> fbc.need_sw_cache_clean works in the opposite information direction
> of ring->fbc_dirty telling software on frontbuffer tracking to perform
> the cache clean on sw side.
>
> Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         | 8 ++++++++
>  drivers/gpu/drm/i915/intel_display.c    | 4 +++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++++--
>  3 files changed, 18 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5706b8a..5acda40 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -657,6 +657,14 @@ struct i915_fbc {
>
>         bool false_color;
>
> +       /* On gen8 some rings cannont perform fbc clean operation so for now
> +        * we are doing this on SW with mmio.
> +        * This variable works in the opposite information direction
> +        * of ring->fbc_dirty telling software on frontbuffer tracking
> +        * to perform the cache clean on sw side.
> +        */
> +       bool need_sw_cache_clean;


Ok, this is my first time trying to look at the buffer tracking code,
so I have some noob questions.

It seems need_sw_cache_clean is set every time we call
gen6_ring_flush(), just like gen7_ring_fbc_flush(). But the thing
about gen7_ring_fbc_flush() is that it becomes a noop when fbc_dirty
is false. So maybe on this code we're doing the sw cache clean every
time we do a BLT operation, even if this operation is not on the front
buffer? That would be an unnecessary overkill, right? Continuing the
same though, why can't we just reuse the ring->fbc_dirty variable
instead of using this new need_sw_cache_clean? Maybe we should only do
the sw cache clean when both fbc_dirty *and* need_sw_cache are true?

More below.


> +
>         struct intel_fbc_work {
>                 struct delayed_work work;
>                 struct drm_crtc *crtc;
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 965eb3c..731d925 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9137,8 +9137,10 @@ void intel_frontbuffer_flush(struct drm_device *dev,
>          * needs to be reworked into a proper frontbuffer tracking scheme like
>          * psr employs.
>          */
> -       if (IS_BROADWELL(dev))
> +       if (IS_BROADWELL(dev) && dev_priv->fbc.need_sw_cache_clean) {
> +               dev_priv->fbc.need_sw_cache_clean = false;
>                 gen8_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
> +       }


Bikeshed: since this is the only caller, I think it makes sense to put
the checks inside the function instead of the caller.


>  }
>
>  /**
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 85fc2b1..02b43cd 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2227,6 +2227,7 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,


The thing about gen6_render_ring_flush is that it is also a function
for the vebox ring. Is this a problem? What are the consequences of
doing the FBC thing on it?


>                            u32 invalidate, u32 flush)
>  {
>         struct drm_device *dev = ring->dev;
> +       struct drm_i915_private *dev_priv = dev->dev_private;
>         uint32_t cmd;
>         int ret;
>
> @@ -2257,8 +2258,12 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>         }
>         intel_ring_advance(ring);
>
> -       if (IS_GEN7(dev) && !invalidate && flush)
> -               return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
> +       if (!invalidate && flush) {
> +               if (IS_GEN7(dev))
> +                       return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
> +               else if (IS_GEN8(dev))
> +                       dev_priv->fbc.need_sw_cache_clean = true;


Bikeshed: since I like to keep the callers clean, can't we just create
a new intel_blt_fbc_flush (or intel_ring_fbc_flush(), or whatever)
that will be responsible for deciding if it needs to call
gen7_ring_fbc_flush or set need_sw_cache_clean to true? That would
make even more sense if we conclude we also need to check for
fbc_dirty before setting need_sw_cache_clean to true...


> +       }
>
>         return 0;
>  }
> --
> 1.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Rodrigo Vivi Sept. 17, 2014, 9:16 p.m. UTC | #4
On Wed, Sep 17, 2014 at 1:49 PM, Paulo Zanoni <przanoni@gmail.com> wrote:

> 2014-09-05 17:57 GMT-03:00 Rodrigo Vivi <rodrigo.vivi@intel.com>:
> > The sw cache clean on gen8 is a tempoorary workaround because we cannot
> > set cache clean on blt ring with risk of hungs. So we are doing the
> cache clean on sw.
> > However we are doing much more than needed. Not only when using blt ring.
> > So, with this extra w/a we minimize the ammount of cache cleans and call
> it only
> > on same cases that it was being called on gen7.
> >
> > fbc.need_sw_cache_clean works in the opposite information direction
> > of ring->fbc_dirty telling software on frontbuffer tracking to perform
> > the cache clean on sw side.
> >
> > Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
> > Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         | 8 ++++++++
> >  drivers/gpu/drm/i915/intel_display.c    | 4 +++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++++--
> >  3 files changed, 18 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h
> > index 5706b8a..5acda40 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -657,6 +657,14 @@ struct i915_fbc {
> >
> >         bool false_color;
> >
> > +       /* On gen8 some rings cannont perform fbc clean operation so for
> now
> > +        * we are doing this on SW with mmio.
> > +        * This variable works in the opposite information direction
> > +        * of ring->fbc_dirty telling software on frontbuffer tracking
> > +        * to perform the cache clean on sw side.
> > +        */
> > +       bool need_sw_cache_clean;
>
>
> Ok, this is my first time trying to look at the buffer tracking code,
> so I have some noob questions.
>
> It seems need_sw_cache_clean is set every time we call
> gen6_ring_flush(), just like gen7_ring_fbc_flush(). But the thing
> about gen7_ring_fbc_flush() is that it becomes a noop when fbc_dirty
> is false. So maybe on this code we're doing the sw cache clean every
> time we do a BLT operation, even if this operation is not on the front
> buffer? That would be an unnecessary overkill, right? Continuing the
> same though, why can't we just reuse the ring->fbc_dirty variable
> instead of using this new need_sw_cache_clean? Maybe we should only do
> the sw cache clean when both fbc_dirty *and* need_sw_cache are true?
>

need_sw_cache_clean is exactly the opposite information flow of fbc_dirty.
fbc_dirty was actually the first name that came to my mind but I noticed we
already had this name.

Well, the fbc cache needs to be cleaned when it is dirty ;) That means on ring
flush (other than the render one) and when the frontbuffer was touched.

When the frontbuffer was touched and there is a render flush we nuke FBC, and when
the frontbuffer was touched and a ring other than render is flushed we clean the
cache. (that is the hw logic that is implemented there). The fbc_dirty is
to tell that the frontbuffer was touched and the cache clean lri can be executed on
the ring.
need_sw_cache_clean is exactly the opposite: when the rings were flushed it tells
the next frontbuffer touch to clean the cache.

I know the order of the things is different. The hw one is frontbuffer
then ring flush with lri for cache clean, while this one is ring flush then the
next frontbuffer touch does the mmio to clean the cache. So I imagine there might
be cases where the cache needs to be cleaned and no frontbuffer touch occurs,
and fbc stays disabled, not compressing anything.... But this is exactly how
the hw behaved without the sw cache clean. And my tests here show that with
this, even not in the proper order, fbc just keeps working and compressing
properly.



>
> More below.
>
>
> > +
> >         struct intel_fbc_work {
> >                 struct delayed_work work;
> >                 struct drm_crtc *crtc;
> > diff --git a/drivers/gpu/drm/i915/intel_display.c
> b/drivers/gpu/drm/i915/intel_display.c
> > index 965eb3c..731d925 100644
> > --- a/drivers/gpu/drm/i915/intel_display.c
> > +++ b/drivers/gpu/drm/i915/intel_display.c
> > @@ -9137,8 +9137,10 @@ void intel_frontbuffer_flush(struct drm_device
> *dev,
> >          * needs to be reworked into a proper frontbuffer tracking
> scheme like
> >          * psr employs.
> >          */
> > -       if (IS_BROADWELL(dev))
> > +       if (IS_BROADWELL(dev) && dev_priv->fbc.need_sw_cache_clean) {
> > +               dev_priv->fbc.need_sw_cache_clean = false;
> >                 gen8_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
> > +       }
>
>
> Bikeshed: since this is the only caller, I think it makes sense to put
> the checks inside the function instead of the caller.
>

Agree.


>
>
> >  }
> >
> >  /**
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 85fc2b1..02b43cd 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -2227,6 +2227,7 @@ static int gen6_ring_flush(struct intel_engine_cs
> *ring,
>
>
> The thing about gen6_render_ring_flush is that it is also a function
> for the vebox ring. Is this a problem? What are the consequences of
> doing the FBC thing on it?
>

As far as I remember we should nuke on render and clean cache on all other
rings flush so no problem with that.


>
>
> >                            u32 invalidate, u32 flush)
> >  {
> >         struct drm_device *dev = ring->dev;
> > +       struct drm_i915_private *dev_priv = dev->dev_private;
> >         uint32_t cmd;
> >         int ret;
> >
> > @@ -2257,8 +2258,12 @@ static int gen6_ring_flush(struct intel_engine_cs
> *ring,
> >         }
> >         intel_ring_advance(ring);
> >
> > -       if (IS_GEN7(dev) && !invalidate && flush)
> > -               return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
> > +       if (!invalidate && flush) {
> > +               if (IS_GEN7(dev))
> > +                       return gen7_ring_fbc_flush(ring,
> FBC_REND_CACHE_CLEAN);
> > +               else if (IS_GEN8(dev))
> > +                       dev_priv->fbc.need_sw_cache_clean = true;
>
>
> Bikeshed: since I like to keep the callers clean, can't we just create
> a new intel_blt_fbc_flush (or intel_ring_fbc_flush(), or whatever)
> that will be responsible for deciding if it needs to call
> gen7_ring_fbc_flush or set need_sw_cache_clean to true?


Agree this can be cleaner.


> That would
> make even more sense if we conclude we also need to check for
> fbc_dirty before setting need_sw_cache_clean to true...
>

I don't think so. fbc_dirty is set on the frontbuffer touch, exactly when
the frontbuffer is dirtied.
This would only matter for a flow that requires we touch the frontbuffer before
flushing the ring... But I'm confident this is already happening and we
wouldn't be saving any case.



>
>
> > +       }
> >
> >         return 0;
> >  }
> > --
> > 1.9.3
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
>
>
> --
> Paulo Zanoni
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5706b8a..5acda40 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -657,6 +657,14 @@  struct i915_fbc {
 
 	bool false_color;
 
+	/* On gen8 some rings cannont perform fbc clean operation so for now
+	 * we are doing this on SW with mmio.
+	 * This variable works in the opposite information direction
+	 * of ring->fbc_dirty telling software on frontbuffer tracking
+	 * to perform the cache clean on sw side.
+	 */
+	bool need_sw_cache_clean;
+
 	struct intel_fbc_work {
 		struct delayed_work work;
 		struct drm_crtc *crtc;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 965eb3c..731d925 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -9137,8 +9137,10 @@  void intel_frontbuffer_flush(struct drm_device *dev,
 	 * needs to be reworked into a proper frontbuffer tracking scheme like
 	 * psr employs.
 	 */
-	if (IS_BROADWELL(dev))
+	if (IS_BROADWELL(dev) && dev_priv->fbc.need_sw_cache_clean) {
+		dev_priv->fbc.need_sw_cache_clean = false;
 		gen8_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
+	}
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 85fc2b1..02b43cd 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2227,6 +2227,7 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 			   u32 invalidate, u32 flush)
 {
 	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t cmd;
 	int ret;
 
@@ -2257,8 +2258,12 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 	}
 	intel_ring_advance(ring);
 
-	if (IS_GEN7(dev) && !invalidate && flush)
-		return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
+	if (!invalidate && flush) {
+		if (IS_GEN7(dev))
+			return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
+		else if (IS_GEN8(dev))
+			dev_priv->fbc.need_sw_cache_clean = true;
+	}
 
 	return 0;
 }