
[09/20] drm/i915/gem: Assign context id for async work

Message ID 20200706061926.6687-10-chris@chris-wilson.co.uk
State New, archived
Series [01/20] drm/i915: Preallocate stashes for vma page-directories

Commit Message

Chris Wilson July 6, 2020, 6:19 a.m. UTC
Allocate a few dma fence context ids that we can use to associate async work
[for the CPU] launched on behalf of this context. For extra fun, we allow
a configurable concurrency width.

A current example would be that we spawn an unbound worker for every
userptr get_pages. In the future, we wish to charge this work to the
context that initiated the async work and to impose concurrency limits
based on the context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
 drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
 3 files changed, 16 insertions(+)

Comments

Tvrtko Ursulin July 8, 2020, 12:26 p.m. UTC | #1
On 06/07/2020 07:19, Chris Wilson wrote:
> Allocate a few dma fence context ids that we can use to associate async work
> [for the CPU] launched on behalf of this context. For extra fun, we allow
> a configurable concurrency width.
> 
> A current example would be that we spawn an unbound worker for every
> userptr get_pages. In the future, we wish to charge this work to the
> context that initiated the async work and to impose concurrency limits
> based on the context.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
>   drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
>   drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
>   3 files changed, 16 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 41784df51e58..bd68746327b3 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
>   	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>   	mutex_init(&ctx->mutex);
>   
> +	ctx->async.width = rounddown_pow_of_two(num_online_cpus());
> +	ctx->async.context = dma_fence_context_alloc(ctx->async.width);
> +	ctx->async.width--;

Hey I had a tri-core CPU back in the day.. :) Really, I can only assume 
you are doing some tricks with masks which maybe only work with power of
2 num cpus? Hard to say.. please explain in a comment.

I don't even understand what the context will be for yet and why it 
needs a separate context id.

> +
>   	spin_lock_init(&ctx->stale.lock);
>   	INIT_LIST_HEAD(&ctx->stale.engines);
>   
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index 3702b2fb27ab..e104ff0ae740 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
>   				       struct drm_file *file);
>   
> +static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
> +{
> +	return (ctx->async.context +
> +		(atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
> +}
> +
>   static inline struct i915_gem_context *
>   i915_gem_context_get(struct i915_gem_context *ctx)
>   {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index ae14ca24a11f..52561f98000f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -85,6 +85,12 @@ struct i915_gem_context {
>   
>   	struct intel_timeline *timeline;
>   
> +	struct {
> +		u64 context;
> +		atomic_t cur;

What is cur? In which patch does it get used? (Can't see it.)

> +		unsigned int width;
> +	} async;
> +
>   	/**
>   	 * @vm: unique address space (GTT)
>   	 *
> 

Regards,

Tvrtko
Chris Wilson July 8, 2020, 12:42 p.m. UTC | #2
Quoting Tvrtko Ursulin (2020-07-08 13:26:24)
> 
> On 06/07/2020 07:19, Chris Wilson wrote:
> > Allocate a few dma fence context ids that we can use to associate async work
> > [for the CPU] launched on behalf of this context. For extra fun, we allow
> > a configurable concurrency width.
> > 
> > A current example would be that we spawn an unbound worker for every
> > userptr get_pages. In the future, we wish to charge this work to the
> > context that initiated the async work and to impose concurrency limits
> > based on the context.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
> >   drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
> >   drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
> >   3 files changed, 16 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > index 41784df51e58..bd68746327b3 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
> >       ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
> >       mutex_init(&ctx->mutex);
> >   
> > +     ctx->async.width = rounddown_pow_of_two(num_online_cpus());
> > +     ctx->async.context = dma_fence_context_alloc(ctx->async.width);
> > +     ctx->async.width--;
> 
> Hey I had a tri-core CPU back in the day.. :) Really, I can only assume 
> you are doing some tricks with masks which maybe only work with power of
> 2 num cpus? Hard to say.. please explain in a comment.

Just a pot mask, that fits in the currently available set of CPUs.
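
(A "pot" mask = a power-of-two mask.) To make the arithmetic concrete, here
is a minimal standalone sketch of the id dispenser in plain userspace C;
rounddown_pow_of_two() is reimplemented here and the
dma_fence_context_alloc() return value is made up:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t rounddown_pow_of_two(uint64_t n)
{
	while (n & (n - 1))	/* clear low bits until a single bit remains */
		n &= n - 1;
	return n;
}

int main(void)
{
	unsigned int num_online_cpus = 6;	/* e.g. a hex-core machine */
	uint64_t context = 1000;	/* stand-in for dma_fence_context_alloc() */
	unsigned int width = rounddown_pow_of_two(num_online_cpus); /* 4 */
	atomic_uint cur = 0;

	width--;	/* 3: now a power-of-two mask, hence the rounddown */

	/* ids cycle 1000, 1001, 1002, 1003, 1000, ... */
	for (int i = 0; i < 8; i++)
		printf("async id %llu\n", (unsigned long long)
		       (context + (atomic_fetch_add(&cur, 1) & width)));
	return 0;
}

So a tri-core machine simply gets two timelines rather than three; only the
mask trick depends on the power of two, not correctness.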
 
> I don't even understand what the context will be for yet and why it 
> needs a separate context id.

The longer term view is that I want to pull the various async tasks we
use into a CPU scheduling kthread[s], that shares the same priority
inheritance of tasks. The issue at the moment is that as we use the
system_wq, that imposes an implicit FIFO ordering on our tasks upsetting
our context priorities. This is a step towards that to start looking at
how we might limit concurrency in various stages by using a bunch of
timelines for each stage, and queuing our work along each timeline before
submitting to an unbound system_wq. [The immediate goal is to limit how
much of the CPU one client can hog by submitting deferred work that would
run in parallel, with a view to making that configurable per-context.]

> >       spin_lock_init(&ctx->stale.lock);
> >       INIT_LIST_HEAD(&ctx->stale.engines);
> >   
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> > index 3702b2fb27ab..e104ff0ae740 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> > @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >   int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
> >                                      struct drm_file *file);
> >   
> > +static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
> > +{
> > +     return (ctx->async.context +
> > +             (atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
> > +}
> > +
> >   static inline struct i915_gem_context *
> >   i915_gem_context_get(struct i915_gem_context *ctx)
> >   {
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > index ae14ca24a11f..52561f98000f 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > @@ -85,6 +85,12 @@ struct i915_gem_context {
> >   
> >       struct intel_timeline *timeline;
> >   
> > +     struct {
> > +             u64 context;
> > +             atomic_t cur;
> 
> What is cur? In which patch does it get used? (Can't see it.)

See i915_gem_context_async_id() above.
-Chris
Tvrtko Ursulin July 8, 2020, 12:45 p.m. UTC | #3
On 08/07/2020 13:26, Tvrtko Ursulin wrote:
> 
> On 06/07/2020 07:19, Chris Wilson wrote:
>> Allocate a few dma fence context ids that we can use to associate async
>> work
>> [for the CPU] launched on behalf of this context. For extra fun, we allow
>> a configurable concurrency width.
>>
>> A current example would be that we spawn an unbound worker for every
>> userptr get_pages. In the future, we wish to charge this work to the
>> context that initiated the async work and to impose concurrency limits
>> based on the context.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
>>   drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
>>   drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
>>   3 files changed, 16 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> index 41784df51e58..bd68746327b3 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
>>       ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>>       mutex_init(&ctx->mutex);
>> +    ctx->async.width = rounddown_pow_of_two(num_online_cpus());
>> +    ctx->async.context = dma_fence_context_alloc(ctx->async.width);
>> +    ctx->async.width--;
> 
> Hey I had a tri-core CPU back in the day.. :) Really, I can only assume 
> you are doing some tricks with masks which maybe only work with power of
> 2 num cpus? Hard to say.. please explain in a comment.

Doh missed rounddown_pow_of_two..

> I don't even understand what the context will be for yet and why it 
> needs a separate context id.
> 
>> +
>>       spin_lock_init(&ctx->stale.lock);
>>       INIT_LIST_HEAD(&ctx->stale.engines);
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h 
>> b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>> index 3702b2fb27ab..e104ff0ae740 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>> @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct 
>> drm_device *dev, void *data,
>>   int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void 
>> *data,
>>                          struct drm_file *file);
>> +static inline u64 i915_gem_context_async_id(struct i915_gem_context 
>> *ctx)
>> +{
>> +    return (ctx->async.context +
>> +        (atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
>> +}
>> +
>>   static inline struct i915_gem_context *
>>   i915_gem_context_get(struct i915_gem_context *ctx)
>>   {
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h 
>> b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>> index ae14ca24a11f..52561f98000f 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>> @@ -85,6 +85,12 @@ struct i915_gem_context {
>>       struct intel_timeline *timeline;
>> +    struct {
>> +        u64 context;
>> +        atomic_t cur;
> 
> What is cur? In which patch does it get used? (Can't see it.)

Found it. Just more explanation why it is needed is required.

Regards,

Tvrtko

> 
>> +        unsigned int width;
>> +    } async;
>> +
>>       /**
>>        * @vm: unique address space (GTT)
>>        *
>>
> 
> Regards,
> 
> Tvrtko
Tvrtko Ursulin July 8, 2020, 2:24 p.m. UTC | #4
On 08/07/2020 13:42, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-07-08 13:26:24)
>>
>> On 06/07/2020 07:19, Chris Wilson wrote:
>>> Allocate a few dma fence context ids that we can use to associate async work
>>> [for the CPU] launched on behalf of this context. For extra fun, we allow
>>> a configurable concurrency width.
>>>
>>> A current example would be that we spawn an unbound worker for every
>>> userptr get_pages. In the future, we wish to charge this work to the
>>> context that initiated the async work and to impose concurrency limits
>>> based on the context.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
>>>    drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
>>>    drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
>>>    3 files changed, 16 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> index 41784df51e58..bd68746327b3 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
>>>        ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>>>        mutex_init(&ctx->mutex);
>>>    
>>> +     ctx->async.width = rounddown_pow_of_two(num_online_cpus());
>>> +     ctx->async.context = dma_fence_context_alloc(ctx->async.width);
>>> +     ctx->async.width--;
>>
>> Hey I had a tri-core CPU back in the day.. :) Really, I can only assume
>> you are doing some tricks with masks which maybe only work with power of
>> 2 num cpus? Hard to say.. please explain in a comment.
> 
> Just a pot mask, that fits in the currently available set of CPUs.
>   
>> I don't even understand what the context will be for yet and why it
>> needs a separate context id.
> 
> The longer term view is that I want to pull the various async tasks we
> use into a CPU scheduling kthread[s], that shares the same priority
> inheritance of tasks. The issue at the moment is that as we use the
> system_wq, that imposes an implicit FIFO ordering on our tasks upsetting
> our context priorities. This is a step towards that to start looking at
> how we might limit concurrency in various stages by using a bunch of
> timelines for each stage, and queuing our work along each timeline before
> submitting to an unbound system_wq. [The immediate goal is to limit how
> much of the CPU one client can hog by submitting deferred work that would
> run in parallel, with a view to making that configurable per-context.]

You are thinking of connecting the GEM context priority with task 
priority? Or creating the async kthreads with the same task priority as 
the task that owns the GEM context? Will that be too many kthreads? I 
suppose they would be created and destroyed on demand so maybe not.

> 
>>>        spin_lock_init(&ctx->stale.lock);
>>>        INIT_LIST_HEAD(&ctx->stale.engines);
>>>    
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>> index 3702b2fb27ab..e104ff0ae740 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>> @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>    int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
>>>                                       struct drm_file *file);
>>>    
>>> +static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
>>> +{
>>> +     return (ctx->async.context +
>>> +             (atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
>>> +}
>>> +
>>>    static inline struct i915_gem_context *
>>>    i915_gem_context_get(struct i915_gem_context *ctx)
>>>    {
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> index ae14ca24a11f..52561f98000f 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>> @@ -85,6 +85,12 @@ struct i915_gem_context {
>>>    
>>>        struct intel_timeline *timeline;
>>>    
>>> +     struct {
>>> +             u64 context;
>>> +             atomic_t cur;
>>
>> What is cur? In which patch does it get used? (Can't see it.)
> 
> See i915_gem_context_async_id() above.

Yeah found it later.

So in the patch where you use it, could you explain the significance of 
the number of fence contexts vs the number of CPUs. What logic drives the 
choice of CPU concurrency per GEM context?

And what is the effective behaviour you get with N contexts - emit N 
concurrent operations and for N + 1 block in execbuf?

Regards,

Tvrtko
Chris Wilson July 8, 2020, 3:36 p.m. UTC | #5
Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
> 
> On 08/07/2020 13:42, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-07-08 13:26:24)
> >>
> >> On 06/07/2020 07:19, Chris Wilson wrote:
> >>> Allocate a few dma fence context ids that we can use to associate async work
> >>> [for the CPU] launched on behalf of this context. For extra fun, we allow
> >>> a configurable concurrency width.
> >>>
> >>> A current example would be that we spawn an unbound worker for every
> >>> userptr get_pages. In the future, we wish to charge this work to the
> >>> context that initiated the async work and to impose concurrency limits
> >>> based on the context.
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
> >>>    drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
> >>>    drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
> >>>    3 files changed, 16 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> index 41784df51e58..bd68746327b3 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> >>> @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
> >>>        ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
> >>>        mutex_init(&ctx->mutex);
> >>>    
> >>> +     ctx->async.width = rounddown_pow_of_two(num_online_cpus());
> >>> +     ctx->async.context = dma_fence_context_alloc(ctx->async.width);
> >>> +     ctx->async.width--;
> >>
> >> Hey I had a tri-core CPU back in the day.. :) Really, I can only assume
> >> you are doing some tricks with masks which maybe only work with power of
> >> 2 num cpus? Hard to say.. please explain in a comment.
> > 
> > Just a pot mask, that fits in the currently available set of CPUs.
> >   
> >> I don't even understand what the context will be for yet and why it
> >> needs a separate context id.
> > 
> > The longer term view is that I want to pull the various async tasks we
> > use into a CPU scheduling kthread[s], that shares the same priority
> > inheritance of tasks. The issue at the moment is that as we use the
> > system_wq, that imposes an implicit FIFO ordering on our tasks upsetting
> > our context priorities. This is a step towards that to start looking at
> > how we might limit concurrency in various stages by using a bunch of
> > timelines for each stage, and queuing our work along each timeline before
> > submitting to an unbound system_wq. [The immediate goal is to limit how
> > much of the CPU one client can hog by submitting deferred work that would
> > run in parallel, with a view to making that configurable per-context.]
> 
> You are thinking of connecting the GEM context priority with task 
> priority? Or creating the async kthreads with the same task priority as 
> the task that owns the GEM context? Will that be too many kthreads? I 
> suppose they would be created and destroyed on demand so maybe not.

I'm thinking of having dedicated kthread task runners. Maybe adjusting
between midRT-prio and normal-prio depending on workload. The essence is
to simply replace the FIFO workqueue with our own priolists. (Running
the first task in the queue, hopefully each task is short enough so that
we really don't have to start thinking about making the tasks
preemptible.)

Then world domination.

But first something that works with/like kthread_worker.
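
For reference, a hypothetical sketch of the kthread_worker shape alluded to
above -- nothing like this is in the series yet, it only shows the queueing
pattern that would replace queue_work() on the shared FIFO workqueue:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/kthread.h>

struct async_task {
	struct kthread_work work;	/* one short, run-to-completion task */
};

static void async_task_fn(struct kthread_work *work)
{
	struct async_task *task = container_of(work, typeof(*task), work);

	/* the deferred work itself, e.g. a userptr get_pages */
	(void)task;
}

static int example(void)
{
	struct kthread_worker *worker;
	struct async_task task;

	/* a dedicated runner whose task priority could later be adjusted */
	worker = kthread_create_worker(0, "i915-async");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	kthread_init_work(&task.work, async_task_fn);
	kthread_queue_work(worker, &task.work);
	kthread_flush_work(&task.work);

	kthread_destroy_worker(worker);
	return 0;
}

The missing (and interesting) part is replacing the worker's internal FIFO
list with priority lists.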

> >>>        spin_lock_init(&ctx->stale.lock);
> >>>        INIT_LIST_HEAD(&ctx->stale.engines);
> >>>    
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> >>> index 3702b2fb27ab..e104ff0ae740 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> >>> @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >>>    int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
> >>>                                       struct drm_file *file);
> >>>    
> >>> +static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
> >>> +{
> >>> +     return (ctx->async.context +
> >>> +             (atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
> >>> +}
> >>> +
> >>>    static inline struct i915_gem_context *
> >>>    i915_gem_context_get(struct i915_gem_context *ctx)
> >>>    {
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> index ae14ca24a11f..52561f98000f 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> >>> @@ -85,6 +85,12 @@ struct i915_gem_context {
> >>>    
> >>>        struct intel_timeline *timeline;
> >>>    
> >>> +     struct {
> >>> +             u64 context;
> >>> +             atomic_t cur;
> >>
> >> What is cur? In which patch does it get used? (Can't see it.)
> > 
> > See i915_gem_context_async_id() above.
> 
> Yeah found it later.
> 
> So in the patch where you use it, could you explain the significance of 
> the number of fence contexts vs the number of CPUs. What logic drives the 
> choice of CPU concurrency per GEM context?

Logic? Pick a number out of a hat.

> And what is the effective behaviour you get with N contexts - emit N 
> concurrent operations and for N + 1 block in execbuf?

Each context defines a timeline. A task is not ready to run until the
task before it in its timeline is completed. So we don't block in
execbuf, the scheduler waits until the request is ready before putting
it into the HW queues -- i.e. the normal chain of fences with everything
that entails about ensuring it runs to completion [whether successfully
or not, if not we then rely on the error propagation to limit the damage
and report it back to the user if they kept a fence around to inspect].
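
To sketch the assumed shape of the user (hypothetical and compressed; the
real user lands later in the series): each piece of async work takes an id
from the context's block and becomes a point on that timeline,

#include <linux/dma-fence.h>
#include <linux/spinlock.h>

#include "i915_gem_context.h"

struct async_work {
	struct dma_fence dma;
	spinlock_t lock;
};

static void
queue_async_work(struct i915_gem_context *ctx, struct async_work *work,
		 const struct dma_fence_ops *ops, u64 seqno)
{
	spin_lock_init(&work->lock);
	dma_fence_init(&work->dma, ops, &work->lock,
		       i915_gem_context_async_id(ctx), seqno);
	/*
	 * ...and then awaits the previous fence on the same timeline
	 * before running, so work on one timeline executes in order.
	 */
}

With async.width holding the mask, a context owns async.width + 1 timelines
and so can have at most that many pieces of work runnable at once, however
many execbufs it issues.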
-Chris
Tvrtko Ursulin July 9, 2020, 11:01 a.m. UTC | #6
On 08/07/2020 16:36, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
>>
>> On 08/07/2020 13:42, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-07-08 13:26:24)
>>>>
>>>> On 06/07/2020 07:19, Chris Wilson wrote:
>>>>> Allocate a few dma fence context ids that we can use to associate async work
>>>>> [for the CPU] launched on behalf of this context. For extra fun, we allow
>>>>> a configurable concurrency width.
>>>>>
>>>>> A current example would be that we spawn an unbound worker for every
>>>>> userptr get_pages. In the future, we wish to charge this work to the
>>>>> context that initiated the async work and to impose concurrency limits
>>>>> based on the context.
>>>>>
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/gem/i915_gem_context.c       | 4 ++++
>>>>>     drivers/gpu/drm/i915/gem/i915_gem_context.h       | 6 ++++++
>>>>>     drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 6 ++++++
>>>>>     3 files changed, 16 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> index 41784df51e58..bd68746327b3 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>>>> @@ -714,6 +714,10 @@ __create_context(struct drm_i915_private *i915)
>>>>>         ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>>>>>         mutex_init(&ctx->mutex);
>>>>>     
>>>>> +     ctx->async.width = rounddown_pow_of_two(num_online_cpus());
>>>>> +     ctx->async.context = dma_fence_context_alloc(ctx->async.width);
>>>>> +     ctx->async.width--;
>>>>
>>>> Hey I had a tri-core CPU back in the day.. :) Really, I can only assume
>>>> you are doing some tricks with masks which maybe only work with power of
>>>> 2 num cpus? Hard to say.. please explain in a comment.
>>>
>>> Just a pot mask, that fits in the currently available set of CPUs.
>>>    
>>>> I don't even understand what the context will be for yet and why it
>>>> needs a separate context id.
>>>
>>> The longer term view is that I want to pull the various async tasks we
>>> use into a CPU scheduling kthread[s], that shares the same priority
>>> inheritance of tasks. The issue at the moment is that as we use the
>>> system_wq, that imposes an implicit FIFO ordering on our tasks upsetting
>>> our context priorities. This is a step towards that to start looking at
>>> how we might limit concurrency in various stages by using a bunch of
>>> timelines for each stage, and queuing our work along each timeline before
>>> submitting to an unbound system_wq. [The immediate goal is to limit how
>>> much of the CPU one client can hog by submitting deferred work that would
>>> run in parallel, with a view to making that configurable per-context.]
>>
>> You are thinking of connecting the GEM context priority with task
>> priority? Or creating the async kthreads with the same task priority as
>> the task that owns the GEM context? Will that be too many kthreads? I
>> suppose they would be created and destroyed on demand so maybe not.
> 
> I'm thinking of having dedicated kthread task runners. Maybe adjusting
> between midRT-prio and normal-prio depending on workload. The essence is
> to simply replace the FIFO workqueue with our own priolists. (Running
> the first task in the queue, hopefully each task is short enough so that
> we really don't have to start thinking about making the tasks
> preemptible.)
> 
> Then world domination.
> 
> But first something that works with/like kthread_worker.
> 
>>>>>         spin_lock_init(&ctx->stale.lock);
>>>>>         INIT_LIST_HEAD(&ctx->stale.engines);
>>>>>     
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>>>> index 3702b2fb27ab..e104ff0ae740 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
>>>>> @@ -134,6 +134,12 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>>>     int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
>>>>>                                        struct drm_file *file);
>>>>>     
>>>>> +static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
>>>>> +{
>>>>> +     return (ctx->async.context +
>>>>> +             (atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
>>>>> +}
>>>>> +
>>>>>     static inline struct i915_gem_context *
>>>>>     i915_gem_context_get(struct i915_gem_context *ctx)
>>>>>     {
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> index ae14ca24a11f..52561f98000f 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
>>>>> @@ -85,6 +85,12 @@ struct i915_gem_context {
>>>>>     
>>>>>         struct intel_timeline *timeline;
>>>>>     
>>>>> +     struct {
>>>>> +             u64 context;
>>>>> +             atomic_t cur;
>>>>
>>>> What is cur? In which patch does it get used? (Can't see it.)
>>>
>>> See i915_gem_context_async_id() above.
>>
>> Yeah found it later.
>>
>> So in the patch where you use it, could you explain the significance of
>> the number of fence contexts vs the number of CPUs. What logic drives the
>> choice of CPU concurrency per GEM context?
> 
> Logic? Pick a number out of a hat.
> 
>> And what is the effective behaviour you get with N contexts - emit N
>> concurrent operations and for N + 1 block in execbuf?
> 
> Each context defines a timeline. A task is not ready to run until the
> task before it in its timeline is completed. So we don't block in
> execbuf, the scheduler waits until the request is ready before putting
> it into the HW queues -- i.e. the normal chain of fences with everything
> that entails about ensuring it runs to completion [whether successfully
> or not, if not we then rely on the error propagation to limit the damage
> and report it back to the user if they kept a fence around to inspect].

Okay but what is the benefit of N contexts in this series, before the 
work is actually spread over ctx async width CPUs? Is there any? If not 
I would prefer this patch is delayed until the time some actual 
parallelism is ready to be added.

Regards,

Tvrtko
Chris Wilson July 9, 2020, 11:07 a.m. UTC | #7
Quoting Tvrtko Ursulin (2020-07-09 12:01:29)
> 
> On 08/07/2020 16:36, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
> >> And what is the effective behaviour you get with N contexts - emit N
> >> concurrent operations and for N + 1 block in execbuf?
> > 
> > Each context defines a timeline. A task is not ready to run until the
> > task before it in its timeline is completed. So we don't block in
> > execbuf, the scheduler waits until the request is ready before putting
> > it into the HW queues -- i.e. the normal chain of fences with everything
> > that entails about ensuring it runs to completion [whether successfully
> > or not, if not we then rely on the error propagation to limit the damage
> > and report it back to the user if they kept a fence around to inspect].
> 
> Okay but what is the benefit of N contexts in this series, before the 
> work is actually spread over ctx async width CPUs? Is there any? If not 
> I would prefer this patch is delayed until the time some actual 
> parallelism is ready to be added.

We currently submit an unbounded amount of work. This patch is added
along with its user to restrict the amount of work allowed to run in
parallel, and also is used to [crudely] serialise the multiple threads
attempting to allocate space in the vm when we completely exhaust that
address space. We need at least one fence-context id for each user; this
took the opportunity to generalise that to N ids for each user.
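
The [crude] serialisation falls out of the same block of ids: if the
contending threads all take a fixed slot instead of round-robining, their
work lands on one timeline and is strictly ordered. A hypothetical
illustration (not the actual helper in the series):

/* force all eviction work from this context onto a single timeline */
static u64 evict_timeline_id(const struct i915_gem_context *ctx)
{
	return ctx->async.context;	/* slot 0, skipping the round-robin */
}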
-Chris
Tvrtko Ursulin July 9, 2020, 11:59 a.m. UTC | #8
On 09/07/2020 12:07, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-07-09 12:01:29)
>>
>> On 08/07/2020 16:36, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
>>>> And what is the effective behaviour you get with N contexts - emit N
>>>> concurrent operations and for N + 1 block in execbuf?
>>>
>>> Each context defines a timeline. A task is not ready to run until the
>>> task before it in its timeline is completed. So we don't block in
>>> execbuf, the scheduler waits until the request is ready before putting
>>> it into the HW queues -- i.e. the normal chain of fences with everything
>>> that entails about ensuring it runs to completion [whether successfully
>>> or not, if not we then rely on the error propagation to limit the damage
>>> and report it back to the user if they kept a fence around to inspect].
>>
>> Okay but what is the benefit of N contexts in this series, before the
>> work is actually spread over ctx async width CPUs? Is there any? If not
>> I would prefer this patch is delayed until the time some actual
>> parallelism is ready to be added.
> 
> We currently submit an unbounded amount of work. This patch is added
> along with its user to restrict the amount of work allowed to run in
> parallel, and also is used to [crudely] serialise the multiple threads
> attempting to allocate space in the vm when we completely exhaust that
> address space. We need at least one fence-context id for each user; this
> took the opportunity to generalise that to N ids for each user.

Right, this is what I asked at the beginning - restricting the amount of 
work run in parallel - does that mean there is some "blocking"/serialisation 
during execbuf? Or is it all async, but then what is restricted?

Regards,

Tvrtko
Chris Wilson July 9, 2020, 12:07 p.m. UTC | #9
Quoting Tvrtko Ursulin (2020-07-09 12:59:51)
> 
> On 09/07/2020 12:07, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-07-09 12:01:29)
> >>
> >> On 08/07/2020 16:36, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
> >>>> And what is the effective behaviour you get with N contexts - emit N
> >>>> concurrent operations and for N + 1 block in execbuf?
> >>>
> >>> Each context defines a timeline. A task is not ready to run until the
> >>> task before it in its timeline is completed. So we don't block in
> >>> execbuf, the scheduler waits until the request is ready before putting
> >>> it into the HW queues -- i.e. the normal chain of fences with everything
> >>> that entails about ensuring it runs to completion [whether successfully
> >>> or not, if not we then rely on the error propagation to limit the damage
> >>> and report it back to the user if they kept a fence around to inspect].
> >>
> >> Okay but what is the benefit of N contexts in this series, before the
> >> work is actually spread over ctx async width CPUs? Is there any? If not
> >> I would prefer this patch is delayed until the time some actual
> >> parallelism is ready to be added.
> > 
> > We currently submit an unbounded amount of work. This patch is added
> > along with its user to restrict the amount of work allowed to run in
> > parallel, and also is used to [crudely] serialise the multiple threads
> > attempting to allocate space in the vm when we completely exhaust that
> > address space. We need at least one fence-context id for each user; this
> > took the opportunity to generalise that to N ids for each user.
> 
> Right, this is what I asked at the beginning - restricting the amount of 
> work run in parallel - does that mean there is some "blocking"/serialisation 
> during execbuf? Or is it all async, but then what is restricted?

It's all* async, so the number of workqueues we utilise is restricted,
and that limits the number of CPUs we allow the one context to spread
across with multiple execbufs.

*fsvo all.
-Chris
Tvrtko Ursulin July 13, 2020, 12:22 p.m. UTC | #10
On 09/07/2020 13:07, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-07-09 12:59:51)
>>
>> On 09/07/2020 12:07, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-07-09 12:01:29)
>>>>
>>>> On 08/07/2020 16:36, Chris Wilson wrote:
>>>>> Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
>>>>>> And what is the effective behaviour you get with N contexts - emit N
>>>>>> concurrent operations and for N + 1 block in execbuf?
>>>>>
>>>>> Each context defines a timeline. A task is not ready to run until the
>>>>> task before it in its timeline is completed. So we don't block in
>>>>> execbuf, the scheduler waits until the request is ready before putting
>>>>> it into the HW queues -- i.e. the normal chain of fences with everything
>>>>> that entails about ensuring it runs to completion [whether successfully
>>>>> or not, if not we then rely on the error propagation to limit the damage
>>>>> and report it back to the user if they kept a fence around to inspect].
>>>>
>>>> Okay but what is the benefit of N contexts in this series, before the
>>>> work is actually spread over ctx async width CPUs? Is there any? If not
>>>> I would prefer this patch is delayed until the time some actual
>>>> parallelism is ready to be added.
>>>
>>> We currently submit an unbounded amount of work. This patch is added
>>> along with its user to restrict the amount of work allowed to run in
>>> parallel, and also is used to [crudely] serialise the multiple threads
>>> attempting to allocate space in the vm when we completely exhaust that
>>> address space. We need at least one fence-context id for each user; this
>>> took the opportunity to generalise that to N ids for each user.
>>
>> Right, this is what I asked at the beginning - restricting the amount of
>> work run in parallel - does that mean there is some "blocking"/serialisation
>> during execbuf? Or is it all async, but then what is restricted?
> 
> It's all* async, so the number of workqueues we utilise is restricted,
> and that limits the number of CPUs we allow the one context to spread
> across with multiple execbufs.
> 
> *fsvo all.

Okay.

Related topic - have we ever thought about what happens when fence 
context id wraps? I know it's 64-bit, and even with this patch giving 
out num_cpus blocks, it still feels impossible that it would wrap in 
normal use. But I wonder if a malicious client could create/destroy 
contexts to cause a wrap and then how well we handle it. I am probably 
just underestimating today how big 64-bit is and how many ioctls that 
would require..

Regards,

Tvrtko
Chris Wilson July 14, 2020, 2:01 p.m. UTC | #11
Quoting Tvrtko Ursulin (2020-07-13 13:22:19)
> 
> On 09/07/2020 13:07, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-07-09 12:59:51)
> >>
> >> On 09/07/2020 12:07, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2020-07-09 12:01:29)
> >>>>
> >>>> On 08/07/2020 16:36, Chris Wilson wrote:
> >>>>> Quoting Tvrtko Ursulin (2020-07-08 15:24:20)
> >>>>>> And what is the effective behaviour you get with N contexts - emit N
> >>>>>> concurrent operations and for N + 1 block in execbuf?
> >>>>>
> >>>>> Each context defines a timeline. A task is not ready to run until the
> >>>>> task before it in its timeline is completed. So we don't block in
> >>>>> execbuf, the scheduler waits until the request is ready before putting
> >>>>> it into the HW queues -- i.e. the normal chain of fences with everything
> >>>>> that entails about ensuring it runs to completion [whether successfully
> >>>>> or not, if not we then rely on the error propagation to limit the damage
> >>>>> and report it back to the user if they kept a fence around to inspect].
> >>>>
> >>>> Okay but what is the benefit of N contexts in this series, before the
> >>>> work is actually spread over ctx async width CPUs? Is there any? If not
> >>>> I would prefer this patch is delayed until the time some actual
> >>>> parallelism is ready to be added.
> >>>
> >>> We currently submit an unbounded amount of work. This patch is added
> >>> along with its user to restrict the amount of work allowed to run in
> >>> parallel, and also is used to [crudely] serialise the multiple threads
> >>> attempting to allocate space in the vm when we completely exhaust that
> >>> address space. We need at least one fence-context id for each user; this
> >>> took the opportunity to generalise that to N ids for each user.
> >>
> >> Right, this is what I asked at the beginning - restricting the amount of
> >> work run in parallel - does that mean there is some "blocking"/serialisation
> >> during execbuf? Or is it all async, but then what is restricted?
> > 
> > It's all* async, so the number of workqueues we utilise is restricted,
> > and so limits the number of CPUs we allow the one context to spread
> > across with multiple execbufs.
> > 
> > *fsvo all.
> 
> Okay.
> 
> Related topic - have we ever thought about what happens when fence 
> context id wraps? I know it's 64-bit, and even with this patch giving 
> out num_cpus blocks, it still feels impossible that it would wrap in 
> normal use. But I wonder if a malicious client could create/destroy
> contexts to cause a wrap and then how well we handle it. I am probably 
> just underestimating today how big 64-bit is and how many ioctls that 
> would require..

I've had cold sweats. We will get silent glitches. I *don't* think we
will corrupt kernel data and oops, but we will corrupt user data.
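
For scale, a back-of-envelope estimate (numbers assumed, not measured): even
a client completing on the order of 10^6 create/destroy cycles per second,
each consuming a block of 64 ids, needs

	2^64 / (64 * 10^6 per second) ~= 2.9 * 10^11 seconds ~= 9,000 years

to wrap the 64-bit counter, so forcing a wrap through this path looks
impractical; the worry is correctness on paper rather than a practical
attack.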
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 41784df51e58..bd68746327b3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -714,6 +714,10 @@  __create_context(struct drm_i915_private *i915)
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 	mutex_init(&ctx->mutex);
 
+	ctx->async.width = rounddown_pow_of_two(num_online_cpus());
+	ctx->async.context = dma_fence_context_alloc(ctx->async.width);
+	ctx->async.width--;
+
 	spin_lock_init(&ctx->stale.lock);
 	INIT_LIST_HEAD(&ctx->stale.engines);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
index 3702b2fb27ab..e104ff0ae740 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
@@ -134,6 +134,12 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
 				       struct drm_file *file);
 
+static inline u64 i915_gem_context_async_id(struct i915_gem_context *ctx)
+{
+	return (ctx->async.context +
+		(atomic_fetch_inc(&ctx->async.cur) & ctx->async.width));
+}
+
 static inline struct i915_gem_context *
 i915_gem_context_get(struct i915_gem_context *ctx)
 {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index ae14ca24a11f..52561f98000f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -85,6 +85,12 @@  struct i915_gem_context {
 
 	struct intel_timeline *timeline;
 
+	struct {
+		u64 context;
+		atomic_t cur;
+		unsigned int width;
+	} async;
+
 	/**
 	 * @vm: unique address space (GTT)
 	 *