drm: Inject a cond_resched() into long drm_clflush_sg()

Message ID	20200115205245.2772800-1-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=nDDZ=3E=lists.freedesktop.org=dri-devel-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 45CE5222C3 From: Chris Wilson <chris@chris-wilson.co.uk> To: dri-devel@lists.freedesktop.org Subject: [PATCH] drm: Inject a cond_resched() into long drm_clflush_sg() Date: Wed, 15 Jan 2020 20:52:45 +0000 Message-Id: <20200115205245.2772800-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Precedence: list Cc: intel-gfx@lists.freedesktop.org, David Laight <David.Laight@ACULAB.COM> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: dri-devel-bounces@lists.freedesktop.org Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>
Series	drm: Inject a cond_resched() into long drm_clflush_sg() \| expand drm: Inject a cond_resched() into long drm_clflush_sg()

Chris Wilson Jan. 15, 2020, 8:52 p.m. UTC

Since we may try and flush the cachelines associated with large buffers
(an 8K framebuffer is about 128MiB, even before we try HDR), this leads
to unacceptably long latencies (when using a voluntary CONFIG_PREEMPT).
If we call cond_resched() between each sg chunk, that it about every 128
pages, we have a natural break point in which to check if the process
needs to be rescheduled. Naturally, this means that drm_clflush_sg() can
only be called from process context -- which is true at the moment. The
other clflush routines remain usable from atomic context.

Even though flushing large objects takes a demonstrable amount to time
to flush all the cachelines, clflush is still preferred over a
system-wide wbinvd as the latter has unpredictable latencies affecting
the whole system not just the local task.

Reported-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: David Laight <David.Laight@ACULAB.COM>
---
 drivers/gpu/drm/drm_cache.c | 49 ++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 4 deletions(-)

Daniel Vetter Jan. 16, 2020, 6:52 a.m. UTC | #1

On Wed, Jan 15, 2020 at 08:52:45PM +0000, Chris Wilson wrote:
> Since we may try and flush the cachelines associated with large buffers
> (an 8K framebuffer is about 128MiB, even before we try HDR), this leads
> to unacceptably long latencies (when using a voluntary CONFIG_PREEMPT).
> If we call cond_resched() between each sg chunk, that it about every 128
> pages, we have a natural break point in which to check if the process
> needs to be rescheduled. Naturally, this means that drm_clflush_sg() can
> only be called from process context -- which is true at the moment. The
> other clflush routines remain usable from atomic context.
> 
> Even though flushing large objects takes a demonstrable amount to time
> to flush all the cachelines, clflush is still preferred over a
> system-wide wbinvd as the latter has unpredictable latencies affecting
> the whole system not just the local task.
> 
> Reported-by: David Laight <David.Laight@ACULAB.COM>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: David Laight <David.Laight@ACULAB.COM>

The original bug report is complaining about latencies for SCHED_RT
threads, on a system that doesn't even use CONFIG_PREEMPT. I'm not sure
it's terribly valid to cater to that use-case - all the desktop distros
seem a lot more reasonable. So firmly *shrug* from my side ...

Patch itself looks correct, just not seeing the point.
-Daniel


> ---
>  drivers/gpu/drm/drm_cache.c | 49 ++++++++++++++++++++++++++++++++++---
>  1 file changed, 45 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 03e01b000f7a..fbd2bb644544 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -112,23 +112,64 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
>  }
>  EXPORT_SYMBOL(drm_clflush_pages);
>  
> +static __always_inline struct sgt_iter {
> +	struct scatterlist *sgp;
> +	unsigned long pfn;
> +	unsigned int curr;
> +	unsigned int max;
> +} __sgt_iter(struct scatterlist *sgl) {
> +	struct sgt_iter s = { .sgp = sgl };
> +
> +	if (s.sgp) {
> +		s.max = s.curr = s.sgp->offset;
> +		s.max += s.sgp->length;
> +		s.pfn = page_to_pfn(sg_page(s.sgp));
> +	}
> +
> +	return s;
> +}
> +
> +static inline struct scatterlist *__sg_next_resched(struct scatterlist *sg)
> +{
> +	if (sg_is_last(sg))
> +		return NULL;
> +
> +	++sg;
> +	if (unlikely(sg_is_chain(sg))) {
> +		sg = sg_chain_ptr(sg);
> +		cond_resched();
> +	}
> +	return sg;
> +}
> +
> +#define for_each_sgt_page(__pp, __iter, __sgt)				\
> +	for ((__iter) = __sgt_iter((__sgt)->sgl);			\
> +	     ((__pp) = (__iter).pfn == 0 ? NULL :			\
> +	      pfn_to_page((__iter).pfn + ((__iter).curr >> PAGE_SHIFT))); \
> +	     (((__iter).curr += PAGE_SIZE) >= (__iter).max) ?		\
> +	     (__iter) = __sgt_iter(__sg_next_resched((__iter).sgp)), 0 : 0)
> +
>  /**
>   * drm_clflush_sg - Flush dcache lines pointing to a scather-gather.
>   * @st: struct sg_table.
>   *
>   * Flush every data cache line entry that points to an address in the
> - * sg.
> + * sg. This may schedule between scatterlist chunks, in order to keep
> + * the system preemption-latency down for large buffers.
>   */
>  void
>  drm_clflush_sg(struct sg_table *st)
>  {
> +	might_sleep();
> +
>  #if defined(CONFIG_X86)
>  	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
> -		struct sg_page_iter sg_iter;
> +		struct sgt_iter sg_iter;
> +		struct page *page;
>  
>  		mb(); /*CLFLUSH is ordered only by using memory barriers*/
> -		for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> -			drm_clflush_page(sg_page_iter_page(&sg_iter));
> +		for_each_sgt_page(page, sg_iter, st)
> +			drm_clflush_page(page);
>  		mb(); /*Make sure that all cache line entry is flushed*/
>  
>  		return;
> -- 
> 2.25.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

Chris Wilson Jan. 16, 2020, 7:40 a.m. UTC | #2

Quoting Daniel Vetter (2020-01-16 06:52:42)
> On Wed, Jan 15, 2020 at 08:52:45PM +0000, Chris Wilson wrote:
> > Since we may try and flush the cachelines associated with large buffers
> > (an 8K framebuffer is about 128MiB, even before we try HDR), this leads
> > to unacceptably long latencies (when using a voluntary CONFIG_PREEMPT).
> > If we call cond_resched() between each sg chunk, that it about every 128
> > pages, we have a natural break point in which to check if the process
> > needs to be rescheduled. Naturally, this means that drm_clflush_sg() can
> > only be called from process context -- which is true at the moment. The
> > other clflush routines remain usable from atomic context.
> > 
> > Even though flushing large objects takes a demonstrable amount to time
> > to flush all the cachelines, clflush is still preferred over a
> > system-wide wbinvd as the latter has unpredictable latencies affecting
> > the whole system not just the local task.
> > 
> > Reported-by: David Laight <David.Laight@ACULAB.COM>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: David Laight <David.Laight@ACULAB.COM>
> 
> The original bug report is complaining about latencies for SCHED_RT
> threads, on a system that doesn't even use CONFIG_PREEMPT. I'm not sure
> it's terribly valid to cater to that use-case - all the desktop distros
> seem a lot more reasonable. So firmly *shrug* from my side ...

Yeah, I had the same immediate response to the complaint), but otoh we've
inserted cond_resched() before when it looks like may consume entire
jiffies inside a loop. At the very minimum, we should have a
might_sleep() here and a reminder that this can be very slow (remember
byt?).
-Chris

David Laight Jan. 16, 2020, 12:26 p.m. UTC | #3

From: Chris Wilson <chris@chris-wilson.co.uk>
> Sent: 16 January 2020 07:40
> Quoting Daniel Vetter (2020-01-16 06:52:42)
> > On Wed, Jan 15, 2020 at 08:52:45PM +0000, Chris Wilson wrote:
> > > Since we may try and flush the cachelines associated with large buffers
> > > (an 8K framebuffer is about 128MiB, even before we try HDR), this leads
> > > to unacceptably long latencies (when using a voluntary CONFIG_PREEMPT).
> > > If we call cond_resched() between each sg chunk, that it about every 128
> > > pages, we have a natural break point in which to check if the process
> > > needs to be rescheduled. Naturally, this means that drm_clflush_sg() can
> > > only be called from process context -- which is true at the moment. The
> > > other clflush routines remain usable from atomic context.
> > >
> > > Even though flushing large objects takes a demonstrable amount to time
> > > to flush all the cachelines, clflush is still preferred over a
> > > system-wide wbinvd as the latter has unpredictable latencies affecting
> > > the whole system not just the local task.
> > >
> > > Reported-by: David Laight <David.Laight@ACULAB.COM>
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: David Laight <David.Laight@ACULAB.COM>
> >
> > The original bug report is complaining about latencies for SCHED_RT
> > threads, on a system that doesn't even use CONFIG_PREEMPT. I'm not sure
> > it's terribly valid to cater to that use-case - all the desktop distros
> > seem a lot more reasonable. So firmly *shrug* from my side ...
> 
> Yeah, I had the same immediate response to the complaint), but otoh we've
> inserted cond_resched() before when it looks like may consume entire
> jiffies inside a loop. At the very minimum, we should have a
> might_sleep() here and a reminder that this can be very slow (remember
> byt?).

I'm using RT to get more deterministic scheduling to look for long
scheduling delays, rather than because we need very tight scheduling.
Delays of several 100us aren't a real problem.

The problem with CONFIG_PREEMPT is that the distros don't
enable it and it isn't a command line option.
So it is really useless unless you are able/willing to build your
own kernel.

I could run the code under the normal scheduler with 'nice -19'.
I stlll wouldn't expect to have all but one cpu idle when I've just
done a cv_broadcast() to wake up a lot of threads.

I've added 'if (!(++i & 31)) cond_resched();' after the drm_clfulsh_page()
call in drm_cflush_sg().
In my case that it 3600/32 reschedules in 3.3ms - plenty.

However there is a call from __i915_gem_objet_set_pages() that
is preceded by a lockdep_assert_held() check - so mustn't sleep.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

Chris Wilson Jan. 16, 2020, 12:28 p.m. UTC | #4

Quoting David Laight (2020-01-16 12:26:45)
> However there is a call from __i915_gem_objet_set_pages() that
> is preceded by a lockdep_assert_held() check - so mustn't sleep.

That is a mutex; it's allowed to sleep. The might_sleep() here should
help assuage your fears.
-Chris

David Laight Jan. 16, 2020, 1:58 p.m. UTC | #5

From: Chris Wilson <chris@chris-wilson.co.uk>
> Sent: 16 January 2020 12:29
> 
> Quoting David Laight (2020-01-16 12:26:45)
> > However there is a call from __i915_gem_objet_set_pages() that
> > is preceded by a lockdep_assert_held() check - so mustn't sleep.
> 
> That is a mutex; it's allowed to sleep. The might_sleep() here should
> help assuage your fears.

Gah. Having a mutex called mm.lock isn't entirely obvious when quickly
reading code.

From what I was reading earlier it seems that clflush() (and cflushopt)
are fast (well not stupidly slow) for buffers under 4k.
I suspect then can do a 'mark pending', but for larger buffers have to
wait for earlier requests to complete (although the graph carries on
increasing to the 16k RH margin.

If may well be that adding a cond_resched() after every 4k page
will have ~0 performance impact because the first few clflush
will be a bit faster (less slow).

I'll do some measurements later this afternoon.

FWIW does this code need to actually invalidate the cache lines?
Or is it just forcing a writeback?

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

Chris Wilson Jan. 16, 2020, 2:01 p.m. UTC | #6

Quoting David Laight (2020-01-16 13:58:44)
> From: Chris Wilson <chris@chris-wilson.co.uk>
> > Sent: 16 January 2020 12:29
> > 
> > Quoting David Laight (2020-01-16 12:26:45)
> > > However there is a call from __i915_gem_objet_set_pages() that
> > > is preceded by a lockdep_assert_held() check - so mustn't sleep.
> > 
> > That is a mutex; it's allowed to sleep. The might_sleep() here should
> > help assuage your fears.
> 
> Gah. Having a mutex called mm.lock isn't entirely obvious when quickly
> reading code.
> 
> From what I was reading earlier it seems that clflush() (and cflushopt)
> are fast (well not stupidly slow) for buffers under 4k.
> I suspect then can do a 'mark pending', but for larger buffers have to
> wait for earlier requests to complete (although the graph carries on
> increasing to the 16k RH margin.
> 
> If may well be that adding a cond_resched() after every 4k page
> will have ~0 performance impact because the first few clflush
> will be a bit faster (less slow).
> 
> I'll do some measurements later this afternoon.
> 
> FWIW does this code need to actually invalidate the cache lines?
> Or is it just forcing a writeback?

It is used for both.
-Chris

David Laight Jan. 16, 2020, 2:40 p.m. UTC | #7

> I'll do some measurements later this afternoon.

This is an Ivy bridge cpu, so clflush (not clflushopt).
With a cond_resched for every page I get:
(Note these calls are every 10 seconds....)

# tracer: function_graph
#
# CPU  DURATION                  FUNCTION CALLS
# |     |   |                     |   |   |   |
 0) # 3155.126 us |  drm_clflush_sg [drm]();
 1) # 3067.382 us |  drm_clflush_sg [drm]();
 2) # 3063.766 us |  drm_clflush_sg [drm]();
 3) # 3092.302 us |  drm_clflush_sg [drm]();
 0) # 3209.486 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 0)   kworker-7    =>  kworker-319
 ------------------------------------------

 0) # 3633.803 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 1)   kworker-7    =>  kworker-319
 ------------------------------------------

 1) # 3090.278 us |  drm_clflush_sg [drm]();
 2) # 3828.108 us |  drm_clflush_sg [drm]();
 3) # 3049.836 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 3)  kworker-319   =>   kworker-7
 ------------------------------------------

 3) # 3295.017 us |  drm_clflush_sg [drm]();
 3) # 3064.077 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 0)  kworker-319   =>   kworker-7
 ------------------------------------------

 0) # 3182.034 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 3)   kworker-7    =>  kworker-319
 ------------------------------------------

 3) # 3065.754 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 2)  kworker-319   =>   kworker-7
 ------------------------------------------

 2) # 3562.513 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 3)  kworker-319   =>   kworker-7
 ------------------------------------------

 3) # 3048.914 us |  drm_clflush_sg [drm]();
 3) # 3062.469 us |  drm_clflush_sg [drm]();
 3) # 3055.727 us |  drm_clflush_sg [drm]();
 ------------------------------------------
 0)   kworker-7    =>  kworker-319
 ------------------------------------------

Without the cond_sched I suspect more of them are 3.0ms.
Not really a significant difference.
I guess the longer times are the scheduler looking for work?
I don't understand the extra traces - I'm guessing they are process switch related.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

David Laight Jan. 16, 2020, 3:04 p.m. UTC | #8

From: David Laight
> Sent: 16 January 2020 14:41
> > I'll do some measurements later this afternoon.
> 
> This is an Ivy bridge cpu, so clflush (not clflushopt).
> With a cond_resched for every page I get:
> (Note these calls are every 10 seconds....)

For comparison some times booted with the original drm.ko

 1) # 3125.116 us |  drm_clflush_sg [drm]();
 0) # 3181.705 us |  drm_clflush_sg [drm]();
 1) # 3108.863 us |  drm_clflush_sg [drm]();
 1) # 3051.926 us |  drm_clflush_sg [drm]();
 2) # 3088.468 us |  drm_clflush_sg [drm]();
 2) # 3012.729 us |  drm_clflush_sg [drm]();
 2) # 3191.268 us |  drm_clflush_sg [drm]();
 3) # 3044.294 us |  drm_clflush_sg [drm]();
 0) # 3163.916 us |  drm_clflush_sg [drm]();
 2) # 3029.307 us |  drm_clflush_sg [drm]();
 2) # 3116.360 us |  drm_clflush_sg [drm]();
 2) # 3031.620 us |  drm_clflush_sg [drm]();
 0) # 3349.706 us |  drm_clflush_sg [drm]();

Probably nothing really significant.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

David Laight Jan. 23, 2020, 2:36 p.m. UTC | #9

From: Chris Wilson <chris@chris-wilson.co.uk>
> Sent: 15 January 2020 20:53
> 
> Since we may try and flush the cachelines associated with large buffers
> (an 8K framebuffer is about 128MiB, even before we try HDR), this leads
> to unacceptably long latencies (when using a voluntary CONFIG_PREEMPT).
> If we call cond_resched() between each sg chunk, that it about every 128
> pages, we have a natural break point in which to check if the process
> needs to be rescheduled. Naturally, this means that drm_clflush_sg() can
> only be called from process context -- which is true at the moment. The
> other clflush routines remain usable from atomic context.
> 
> Even though flushing large objects takes a demonstrable amount to time
> to flush all the cachelines, clflush is still preferred over a
> system-wide wbinvd as the latter has unpredictable latencies affecting
> the whole system not just the local task.

Any progress on this.
I think the patch itself has it's whitespace messed up.

I've just done a measurement on a newer system that supports clflushopt.
drm_clflush_sg() took 400us for a 1920x1080 display.
No idea how fast the cpus were running, somewhere between 800MHz and 4GHz
depending on the whim of the hardware (probable at the low end).
Considerably faster, and enough that calling cond_resched() every 4k
is probably noticable.
So every 128 pages is probably a reasonable compromise.

	David


> Reported-by: David Laight <David.Laight@ACULAB.COM>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: David Laight <David.Laight@ACULAB.COM>
> ---
>  drivers/gpu/drm/drm_cache.c | 49 ++++++++++++++++++++++++++++++++++---
>  1 file changed, 45 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 03e01b000f7a..fbd2bb644544 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -112,23 +112,64 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
>  }
>  EXPORT_SYMBOL(drm_clflush_pages);
> 
> +static __always_inline struct sgt_iter {
> +struct scatterlist *sgp;
> +unsigned long pfn;
> +unsigned int curr;
> +unsigned int max;
> +} __sgt_iter(struct scatterlist *sgl) {
> +struct sgt_iter s = { .sgp = sgl };
> +
> +if (s.sgp) {
> +s.max = s.curr = s.sgp->offset;
> +s.max += s.sgp->length;
> +s.pfn = page_to_pfn(sg_page(s.sgp));
> +}
> +
> +return s;
> +}
> +
> +static inline struct scatterlist *__sg_next_resched(struct scatterlist *sg)
> +{
> +if (sg_is_last(sg))
> +return NULL;
> +
> +++sg;
> +if (unlikely(sg_is_chain(sg))) {
> +sg = sg_chain_ptr(sg);
> +cond_resched();
> +}
> +return sg;
> +}
> +
> +#define for_each_sgt_page(__pp, __iter, __sgt)\
> +for ((__iter) = __sgt_iter((__sgt)->sgl);\
> +     ((__pp) = (__iter).pfn == 0 ? NULL :\
> +      pfn_to_page((__iter).pfn + ((__iter).curr >> PAGE_SHIFT))); \
> +     (((__iter).curr += PAGE_SIZE) >= (__iter).max) ?\
> +     (__iter) = __sgt_iter(__sg_next_resched((__iter).sgp)), 0 : 0)
> +
>  /**
>   * drm_clflush_sg - Flush dcache lines pointing to a scather-gather.
>   * @st: struct sg_table.
>   *
>   * Flush every data cache line entry that points to an address in the
> - * sg.
> + * sg. This may schedule between scatterlist chunks, in order to keep
> + * the system preemption-latency down for large buffers.
>   */
>  void
>  drm_clflush_sg(struct sg_table *st)
>  {
> +might_sleep();
> +
>  #if defined(CONFIG_X86)
>  if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
> -struct sg_page_iter sg_iter;
> +struct sgt_iter sg_iter;
> +struct page *page;
> 
>  mb(); /*CLFLUSH is ordered only by using memory barriers*/
> -for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> -drm_clflush_page(sg_page_iter_page(&sg_iter));
> +for_each_sgt_page(page, sg_iter, st)
> +drm_clflush_page(page);
>  mb(); /*Make sure that all cache line entry is flushed*/
> 
>  return;
> --
> 2.25.0
> 

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

drm: Inject a cond_resched() into long drm_clflush_sg()

Commit Message

Comments

Patch