Message ID | 20180119152356.17025-2-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 19/01/2018 15:23, Chris Wilson wrote: > When we finally decide the gpu is idle, that is a good time to shrink > our kmem_caches. > > v3: Defer until an rcu grace period after we idle. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> > --- > drivers/gpu/drm/i915/i915_gem.c | 65 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 65 insertions(+) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 7f0684ccc724..6a8fbcae835b 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3341,12 +3341,59 @@ new_requests_since_last_retire(const struct drm_i915_private *i915) > work_pending(&i915->gt.idle_work.work)); > } > > +static void shrink_caches(struct drm_i915_private *i915) > +{ > + /* > + * kmem_cache_shrink() discards empty slabs and reorders partially > + * filled slabs to prioritise allocating from the mostly full slabs, > + * with the aim of reducing fragmentation. > + */ > + kmem_cache_shrink(i915->priorities); > + kmem_cache_shrink(i915->dependencies); > + kmem_cache_shrink(i915->requests); > + kmem_cache_shrink(i915->luts); > + kmem_cache_shrink(i915->vmas); > + kmem_cache_shrink(i915->objects); > +} > + > +struct sleep_rcu_work { > + struct drm_i915_private *i915; > + struct rcu_head rcu; > + struct work_struct work; > + u32 epoch; > +}; > + > +static void __sleep_work(struct work_struct *work) > +{ > + struct sleep_rcu_work *s = container_of(work, typeof(*s), work); > + struct drm_i915_private *i915 = s->i915; > + u32 epoch = s->epoch; > + > + kfree(s); > + if (epoch == READ_ONCE(i915->gt.epoch)) > + shrink_caches(i915); > +} > + > +static void __sleep_rcu(struct rcu_head *rcu) > +{ > + struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu); > + struct drm_i915_private *i915 = s->i915; > + > + if (s->epoch == READ_ONCE(i915->gt.epoch)) { > + INIT_WORK(&s->work, __sleep_work); > + queue_work(i915->wq, &s->work); > + } else { > + kfree(s); > + } > +} > + > static void > i915_gem_idle_work_handler(struct work_struct *work) > { > struct drm_i915_private *dev_priv = > container_of(work, typeof(*dev_priv), gt.idle_work.work); > bool rearm_hangcheck; > + u32 epoch = 0; > ktime_t end; > > if (!READ_ONCE(dev_priv->gt.awake)) > @@ -3406,6 +3453,7 @@ i915_gem_idle_work_handler(struct work_struct *work) > GEM_BUG_ON(!dev_priv->gt.awake); > dev_priv->gt.awake = false; > rearm_hangcheck = false; > + epoch = dev_priv->gt.epoch; > > if (INTEL_GEN(dev_priv) >= 6) > gen6_rps_idle(dev_priv); > @@ -3421,6 +3469,23 @@ i915_gem_idle_work_handler(struct work_struct *work) > GEM_BUG_ON(!dev_priv->gt.awake); > i915_queue_hangcheck(dev_priv); > } > + > + /* > + * When we are idle, it is an opportune time to reap our caches. > + * However, we have many objects that utilise RCU and the ordered > + * i915->wq that this work is executing on. To try and flush any > + * pending frees now we are idle, we first wait for an RCU grace > + * period, and then queue a task (that will run last on the wq) to > + * shrink and re-optimize the caches. > + */ > + if (epoch == READ_ONCE(dev_priv->gt.epoch)) { Theoretically this can be true on epoch wrap-around, when trylock failed. It's one in four billion busy transitions but it could be just worth handling it explicitly. Simplest probably to ensure gt.epoch is never zero when incrementing? > + struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL); > + if (s) { > + s->i915 = dev_priv; > + s->epoch = epoch; > + call_rcu(&s->rcu, __sleep_rcu); > + } > + } > } > > void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) > Otherwise it sounds believable and looks correct. Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-01-24 10:32:16) > > On 19/01/2018 15:23, Chris Wilson wrote: > > When we finally decide the gpu is idle, that is a good time to shrink > > our kmem_caches. > > > > v3: Defer until an rcu grace period after we idle. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> > > --- > > drivers/gpu/drm/i915/i915_gem.c | 65 +++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 65 insertions(+) > > > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > > index 7f0684ccc724..6a8fbcae835b 100644 > > --- a/drivers/gpu/drm/i915/i915_gem.c > > +++ b/drivers/gpu/drm/i915/i915_gem.c > > @@ -3341,12 +3341,59 @@ new_requests_since_last_retire(const struct drm_i915_private *i915) > > work_pending(&i915->gt.idle_work.work)); > > } > > > > +static void shrink_caches(struct drm_i915_private *i915) > > +{ > > + /* > > + * kmem_cache_shrink() discards empty slabs and reorders partially > > + * filled slabs to prioritise allocating from the mostly full slabs, > > + * with the aim of reducing fragmentation. > > + */ > > + kmem_cache_shrink(i915->priorities); > > + kmem_cache_shrink(i915->dependencies); > > + kmem_cache_shrink(i915->requests); > > + kmem_cache_shrink(i915->luts); > > + kmem_cache_shrink(i915->vmas); > > + kmem_cache_shrink(i915->objects); > > +} > > + > > +struct sleep_rcu_work { > > + struct drm_i915_private *i915; > > + struct rcu_head rcu; > > + struct work_struct work; > > + u32 epoch; > > +}; > > + > > +static void __sleep_work(struct work_struct *work) > > +{ > > + struct sleep_rcu_work *s = container_of(work, typeof(*s), work); > > + struct drm_i915_private *i915 = s->i915; > > + u32 epoch = s->epoch; > > + > > + kfree(s); > > + if (epoch == READ_ONCE(i915->gt.epoch)) > > + shrink_caches(i915); > > +} > > + > > +static void __sleep_rcu(struct rcu_head *rcu) > > +{ > > + struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu); > > + struct drm_i915_private *i915 = s->i915; > > + > > + if (s->epoch == READ_ONCE(i915->gt.epoch)) { > > + INIT_WORK(&s->work, __sleep_work); > > + queue_work(i915->wq, &s->work); > > + } else { > > + kfree(s); > > + } > > +} > > + > > static void > > i915_gem_idle_work_handler(struct work_struct *work) > > { > > struct drm_i915_private *dev_priv = > > container_of(work, typeof(*dev_priv), gt.idle_work.work); > > bool rearm_hangcheck; > > + u32 epoch = 0; > > ktime_t end; > > > > if (!READ_ONCE(dev_priv->gt.awake)) > > @@ -3406,6 +3453,7 @@ i915_gem_idle_work_handler(struct work_struct *work) > > GEM_BUG_ON(!dev_priv->gt.awake); > > dev_priv->gt.awake = false; > > rearm_hangcheck = false; > > + epoch = dev_priv->gt.epoch; > > > > if (INTEL_GEN(dev_priv) >= 6) > > gen6_rps_idle(dev_priv); > > @@ -3421,6 +3469,23 @@ i915_gem_idle_work_handler(struct work_struct *work) > > GEM_BUG_ON(!dev_priv->gt.awake); > > i915_queue_hangcheck(dev_priv); > > } > > + > > + /* > > + * When we are idle, it is an opportune time to reap our caches. > > + * However, we have many objects that utilise RCU and the ordered > > + * i915->wq that this work is executing on. To try and flush any > > + * pending frees now we are idle, we first wait for an RCU grace > > + * period, and then queue a task (that will run last on the wq) to > > + * shrink and re-optimize the caches. > > + */ > > + if (epoch == READ_ONCE(dev_priv->gt.epoch)) { > > Theoretically this can be true on epoch wrap-around, when trylock > failed. It's one in four billion busy transitions but it could be just > worth handling it explicitly. Simplest probably to ensure gt.epoch is > never zero when incrementing? I was working on the principle that a u32 wraparound takes at least 100ms * 2^32, or about 326 years. :) -Chris
On 24/01/2018 10:37, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-01-24 10:32:16) >> >> On 19/01/2018 15:23, Chris Wilson wrote: >>> When we finally decide the gpu is idle, that is a good time to shrink >>> our kmem_caches. >>> >>> v3: Defer until an rcu grace period after we idle. >>> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_gem.c | 65 +++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 65 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c >>> index 7f0684ccc724..6a8fbcae835b 100644 >>> --- a/drivers/gpu/drm/i915/i915_gem.c >>> +++ b/drivers/gpu/drm/i915/i915_gem.c >>> @@ -3341,12 +3341,59 @@ new_requests_since_last_retire(const struct drm_i915_private *i915) >>> work_pending(&i915->gt.idle_work.work)); >>> } >>> >>> +static void shrink_caches(struct drm_i915_private *i915) >>> +{ >>> + /* >>> + * kmem_cache_shrink() discards empty slabs and reorders partially >>> + * filled slabs to prioritise allocating from the mostly full slabs, >>> + * with the aim of reducing fragmentation. >>> + */ >>> + kmem_cache_shrink(i915->priorities); >>> + kmem_cache_shrink(i915->dependencies); >>> + kmem_cache_shrink(i915->requests); >>> + kmem_cache_shrink(i915->luts); >>> + kmem_cache_shrink(i915->vmas); >>> + kmem_cache_shrink(i915->objects); >>> +} >>> + >>> +struct sleep_rcu_work { >>> + struct drm_i915_private *i915; >>> + struct rcu_head rcu; >>> + struct work_struct work; >>> + u32 epoch; >>> +}; >>> + >>> +static void __sleep_work(struct work_struct *work) >>> +{ >>> + struct sleep_rcu_work *s = container_of(work, typeof(*s), work); >>> + struct drm_i915_private *i915 = s->i915; >>> + u32 epoch = s->epoch; >>> + >>> + kfree(s); >>> + if (epoch == READ_ONCE(i915->gt.epoch)) >>> + shrink_caches(i915); >>> +} >>> + >>> +static void __sleep_rcu(struct rcu_head *rcu) >>> +{ >>> + struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu); >>> + struct drm_i915_private *i915 = s->i915; >>> + >>> + if (s->epoch == READ_ONCE(i915->gt.epoch)) { >>> + INIT_WORK(&s->work, __sleep_work); >>> + queue_work(i915->wq, &s->work); >>> + } else { >>> + kfree(s); >>> + } >>> +} >>> + >>> static void >>> i915_gem_idle_work_handler(struct work_struct *work) >>> { >>> struct drm_i915_private *dev_priv = >>> container_of(work, typeof(*dev_priv), gt.idle_work.work); >>> bool rearm_hangcheck; >>> + u32 epoch = 0; >>> ktime_t end; >>> >>> if (!READ_ONCE(dev_priv->gt.awake)) >>> @@ -3406,6 +3453,7 @@ i915_gem_idle_work_handler(struct work_struct *work) >>> GEM_BUG_ON(!dev_priv->gt.awake); >>> dev_priv->gt.awake = false; >>> rearm_hangcheck = false; >>> + epoch = dev_priv->gt.epoch; >>> >>> if (INTEL_GEN(dev_priv) >= 6) >>> gen6_rps_idle(dev_priv); >>> @@ -3421,6 +3469,23 @@ i915_gem_idle_work_handler(struct work_struct *work) >>> GEM_BUG_ON(!dev_priv->gt.awake); >>> i915_queue_hangcheck(dev_priv); >>> } >>> + >>> + /* >>> + * When we are idle, it is an opportune time to reap our caches. >>> + * However, we have many objects that utilise RCU and the ordered >>> + * i915->wq that this work is executing on. To try and flush any >>> + * pending frees now we are idle, we first wait for an RCU grace >>> + * period, and then queue a task (that will run last on the wq) to >>> + * shrink and re-optimize the caches. >>> + */ >>> + if (epoch == READ_ONCE(dev_priv->gt.epoch)) { >> >> Theoretically this can be true on epoch wrap-around, when trylock >> failed. It's one in four billion busy transitions but it could be just >> worth handling it explicitly. Simplest probably to ensure gt.epoch is >> never zero when incrementing? > > I was working on the principle that a u32 wraparound takes at least > 100ms * 2^32, or about 326 years. :) Ah forgot there is a time delay.. okay then. :) Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> To be kept if/when you change to unsigned int. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7f0684ccc724..6a8fbcae835b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3341,12 +3341,59 @@ new_requests_since_last_retire(const struct drm_i915_private *i915) work_pending(&i915->gt.idle_work.work)); } +static void shrink_caches(struct drm_i915_private *i915) +{ + /* + * kmem_cache_shrink() discards empty slabs and reorders partially + * filled slabs to prioritise allocating from the mostly full slabs, + * with the aim of reducing fragmentation. + */ + kmem_cache_shrink(i915->priorities); + kmem_cache_shrink(i915->dependencies); + kmem_cache_shrink(i915->requests); + kmem_cache_shrink(i915->luts); + kmem_cache_shrink(i915->vmas); + kmem_cache_shrink(i915->objects); +} + +struct sleep_rcu_work { + struct drm_i915_private *i915; + struct rcu_head rcu; + struct work_struct work; + u32 epoch; +}; + +static void __sleep_work(struct work_struct *work) +{ + struct sleep_rcu_work *s = container_of(work, typeof(*s), work); + struct drm_i915_private *i915 = s->i915; + u32 epoch = s->epoch; + + kfree(s); + if (epoch == READ_ONCE(i915->gt.epoch)) + shrink_caches(i915); +} + +static void __sleep_rcu(struct rcu_head *rcu) +{ + struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu); + struct drm_i915_private *i915 = s->i915; + + if (s->epoch == READ_ONCE(i915->gt.epoch)) { + INIT_WORK(&s->work, __sleep_work); + queue_work(i915->wq, &s->work); + } else { + kfree(s); + } +} + static void i915_gem_idle_work_handler(struct work_struct *work) { struct drm_i915_private *dev_priv = container_of(work, typeof(*dev_priv), gt.idle_work.work); bool rearm_hangcheck; + u32 epoch = 0; ktime_t end; if (!READ_ONCE(dev_priv->gt.awake)) @@ -3406,6 +3453,7 @@ i915_gem_idle_work_handler(struct work_struct *work) GEM_BUG_ON(!dev_priv->gt.awake); dev_priv->gt.awake = false; rearm_hangcheck = false; + epoch = dev_priv->gt.epoch; if (INTEL_GEN(dev_priv) >= 6) gen6_rps_idle(dev_priv); @@ -3421,6 +3469,23 @@ i915_gem_idle_work_handler(struct work_struct *work) GEM_BUG_ON(!dev_priv->gt.awake); i915_queue_hangcheck(dev_priv); } + + /* + * When we are idle, it is an opportune time to reap our caches. + * However, we have many objects that utilise RCU and the ordered + * i915->wq that this work is executing on. To try and flush any + * pending frees now we are idle, we first wait for an RCU grace + * period, and then queue a task (that will run last on the wq) to + * shrink and re-optimize the caches. + */ + if (epoch == READ_ONCE(dev_priv->gt.epoch)) { + struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s) { + s->i915 = dev_priv; + s->epoch = epoch; + call_rcu(&s->rcu, __sleep_rcu); + } + } } void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
When we finally decide the gpu is idle, that is a good time to shrink our kmem_caches. v3: Defer until an rcu grace period after we idle. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> --- drivers/gpu/drm/i915/i915_gem.c | 65 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)