diff mbox series

[v4,1/5] drm/i915: Add engine reset count in get-reset-stats ioctl

Message ID 20190221025820.28447-2-carlos.santa@intel.com (mailing list archive)
State New, archived
Headers show
Series GEN8+ GPU Watchdog Reset Support | expand

Commit Message

Santa, Carlos Feb. 21, 2019, 2:58 a.m. UTC
From: Michel Thierry <michel.thierry@intel.com>

Users/tests relying on the total reset count will start seeing a smaller
number since most of the hangs can be handled by engine reset.
Note that if reset engine x, context a running on engine y will be unaware
and unaffected.

To start the discussion, include just a total engine reset count. If it
is deemed useful, it can be extended to report each engine separately.

Our igt's gem_reset_stats test will need changes to ignore the pad field,
since it can now return reset_engine_count.

v2: s/engine_reset/reset_engine/, use union in uapi to not break compatibility.
v3: Keep rejecting attempts to use pad as input (Antonio)
v4: Rebased.
v5: Rebased.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++--
 include/uapi/drm/i915_drm.h             |  6 +++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin Feb. 25, 2019, 1:34 p.m. UTC | #1
On 21/02/2019 02:58, Carlos Santa wrote:
> From: Michel Thierry <michel.thierry@intel.com>
> 
> Users/tests relying on the total reset count will start seeing a smaller
> number since most of the hangs can be handled by engine reset.
> Note that if reset engine x, context a running on engine y will be unaware
> and unaffected.
> 
> To start the discussion, include just a total engine reset count. If it
> is deemed useful, it can be extended to report each engine separately.
> 
> Our igt's gem_reset_stats test will need changes to ignore the pad field,
> since it can now return reset_engine_count.
> 
> v2: s/engine_reset/reset_engine/, use union in uapi to not break compatibility.
> v3: Keep rejecting attempts to use pad as input (Antonio)
> v4: Rebased.
> v5: Rebased.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++--
>   include/uapi/drm/i915_drm.h             |  6 +++++-
>   2 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 459f8eae1c39..cbfe8f2eb3f2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1889,6 +1889,8 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
>   	struct drm_i915_private *dev_priv = to_i915(dev);
>   	struct drm_i915_reset_stats *args = data;
>   	struct i915_gem_context *ctx;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
>   	int ret;
>   
>   	if (args->flags || args->pad)
> @@ -1907,10 +1909,16 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
>   	 * we should wrap the hangstats with a seqlock.
>   	 */
>   
> -	if (capable(CAP_SYS_ADMIN))
> +	if (capable(CAP_SYS_ADMIN)) {
>   		args->reset_count = i915_reset_count(&dev_priv->gpu_error);
> -	else
> +		for_each_engine(engine, dev_priv, id)
> +			args->reset_engine_count +=
> +				i915_reset_engine_count(&dev_priv->gpu_error,
> +							engine);

If access to global GPU reset count is privileged, why is access to 
global engine reset count not? It seems to be fundamentally same level 
of data leakage.

If we wanted to provide some numbers to unprivileged users I think we 
would need to store some counters per file_priv/context and return those 
when !CAP_SYS_ADMIN.

> +	} else {
>   		args->reset_count = 0;
> +		args->reset_engine_count = 0;
> +	}
>   
>   	args->batch_active = atomic_read(&ctx->guilty_count);
>   	args->batch_pending = atomic_read(&ctx->active_count);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index cc03ef9f885f..3f2c89740b0e 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1642,7 +1642,11 @@ struct drm_i915_reset_stats {
>   	/* Number of batches lost pending for execution, for this context */
>   	__u32 batch_pending;
>   
> -	__u32 pad;
> +	union {
> +		__u32 pad;
> +		/* Engine resets since boot/module reload, for all contexts */
> +		__u32 reset_engine_count;
> +	};

Chris pointed out in some other review that anonymous unions are not 
friendly towards C++ compilers.

Not sure what is the best option here. Renaming the field could break 
old userspace building against newer headers. Is that acceptable?

>   };
>   
>   struct drm_i915_gem_userptr {
> 

Regards,

Tvrtko
Santa, Carlos March 6, 2019, 11:08 p.m. UTC | #2
On Mon, 2019-02-25 at 13:34 +0000, Tvrtko Ursulin wrote:
> On 21/02/2019 02:58, Carlos Santa wrote:
> > From: Michel Thierry <michel.thierry@intel.com>
> > 
> > Users/tests relying on the total reset count will start seeing a
> > smaller
> > number since most of the hangs can be handled by engine reset.
> > Note that if reset engine x, context a running on engine y will be
> > unaware
> > and unaffected.
> > 
> > To start the discussion, include just a total engine reset count.
> > If it
> > is deemed useful, it can be extended to report each engine
> > separately.
> > 
> > Our igt's gem_reset_stats test will need changes to ignore the pad
> > field,
> > since it can now return reset_engine_count.
> > 
> > v2: s/engine_reset/reset_engine/, use union in uapi to not break
> > compatibility.
> > v3: Keep rejecting attempts to use pad as input (Antonio)
> > v4: Rebased.
> > v5: Rebased.
> > 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> > Cc: Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> > Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++--
> >   include/uapi/drm/i915_drm.h             |  6 +++++-
> >   2 files changed, 15 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 459f8eae1c39..cbfe8f2eb3f2 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -1889,6 +1889,8 @@ int i915_gem_context_reset_stats_ioctl(struct
> > drm_device *dev,
> >   	struct drm_i915_private *dev_priv = to_i915(dev);
> >   	struct drm_i915_reset_stats *args = data;
> >   	struct i915_gem_context *ctx;
> > +	struct intel_engine_cs *engine;
> > +	enum intel_engine_id id;
> >   	int ret;
> >   
> >   	if (args->flags || args->pad)
> > @@ -1907,10 +1909,16 @@ int
> > i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
> >   	 * we should wrap the hangstats with a seqlock.
> >   	 */
> >   
> > -	if (capable(CAP_SYS_ADMIN))
> > +	if (capable(CAP_SYS_ADMIN)) {
> >   		args->reset_count = i915_reset_count(&dev_priv-
> > >gpu_error);
> > -	else
> > +		for_each_engine(engine, dev_priv, id)
> > +			args->reset_engine_count +=
> > +				i915_reset_engine_count(&dev_priv-
> > >gpu_error,
> > +							engine);
> 
> If access to global GPU reset count is privileged, why is access to 
> global engine reset count not? It seems to be fundamentally same
> level 
> of data leakage.

But access to global engine reset count (i915_reset_engine_count) is
indeed priviledged. They both are inside if(CAP_SYS_ADMIN){...}, or
maybe I am missing something?

> 
> If we wanted to provide some numbers to unprivileged users I think
> we 
> would need to store some counters per file_priv/context and return
> those 
> when !CAP_SYS_ADMIN.

The question would be why access to global GPU reset count is
priviledged then? I can't think of a reason why it should be
priviledged. I think the new counter (per engine) should fall in the
same category as the global GPU reset one, right? So, can we make them
both unpriviledged? 


> 
> > +	} else {
> >   		args->reset_count = 0;
> > +		args->reset_engine_count = 0;
> > +	}
> >   
> >   	args->batch_active = atomic_read(&ctx->guilty_count);
> >   	args->batch_pending = atomic_read(&ctx->active_count);
> > diff --git a/include/uapi/drm/i915_drm.h
> > b/include/uapi/drm/i915_drm.h
> > index cc03ef9f885f..3f2c89740b0e 100644
> > --- a/include/uapi/drm/i915_drm.h
> > +++ b/include/uapi/drm/i915_drm.h
> > @@ -1642,7 +1642,11 @@ struct drm_i915_reset_stats {
> >   	/* Number of batches lost pending for execution, for this
> > context */
> >   	__u32 batch_pending;
> >   
> > -	__u32 pad;
> > +	union {
> > +		__u32 pad;
> > +		/* Engine resets since boot/module reload, for all
> > contexts */
> > +		__u32 reset_engine_count;
> > +	};
> 
> Chris pointed out in some other review that anonymous unions are not 
> friendly towards C++ compilers.
> 
> Not sure what is the best option here. Renaming the field could
> break 
> old userspace building against newer headers. Is that acceptable?
> 

I dug up some old comments from Chris and he stated that recycling the
union like that would be a bad idea since that would make the pad field
an output only parameter thus invalidating gem_reset_stats...

Why can't we simply add a new field __u32 reset_engine_count; as part
of the drm_i915_reset_stats struct?

Regards,
Carlos

> >   };
> >   
> >   struct drm_i915_gem_userptr {
> > 
> 
> Regards,
> 
> Tvrtko
Tvrtko Ursulin March 7, 2019, 7:27 a.m. UTC | #3
On 06/03/2019 23:08, Carlos Santa wrote:
> On Mon, 2019-02-25 at 13:34 +0000, Tvrtko Ursulin wrote:
>> On 21/02/2019 02:58, Carlos Santa wrote:
>>> From: Michel Thierry <michel.thierry@intel.com>
>>>
>>> Users/tests relying on the total reset count will start seeing a
>>> smaller
>>> number since most of the hangs can be handled by engine reset.
>>> Note that if reset engine x, context a running on engine y will be
>>> unaware
>>> and unaffected.
>>>
>>> To start the discussion, include just a total engine reset count.
>>> If it
>>> is deemed useful, it can be extended to report each engine
>>> separately.
>>>
>>> Our igt's gem_reset_stats test will need changes to ignore the pad
>>> field,
>>> since it can now return reset_engine_count.
>>>
>>> v2: s/engine_reset/reset_engine/, use union in uapi to not break
>>> compatibility.
>>> v3: Keep rejecting attempts to use pad as input (Antonio)
>>> v4: Rebased.
>>> v5: Rebased.
>>>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
>>> Cc: Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++--
>>>    include/uapi/drm/i915_drm.h             |  6 +++++-
>>>    2 files changed, 15 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>>> b/drivers/gpu/drm/i915/i915_gem_context.c
>>> index 459f8eae1c39..cbfe8f2eb3f2 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>>> @@ -1889,6 +1889,8 @@ int i915_gem_context_reset_stats_ioctl(struct
>>> drm_device *dev,
>>>    	struct drm_i915_private *dev_priv = to_i915(dev);
>>>    	struct drm_i915_reset_stats *args = data;
>>>    	struct i915_gem_context *ctx;
>>> +	struct intel_engine_cs *engine;
>>> +	enum intel_engine_id id;
>>>    	int ret;
>>>    
>>>    	if (args->flags || args->pad)
>>> @@ -1907,10 +1909,16 @@ int
>>> i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
>>>    	 * we should wrap the hangstats with a seqlock.
>>>    	 */
>>>    
>>> -	if (capable(CAP_SYS_ADMIN))
>>> +	if (capable(CAP_SYS_ADMIN)) {
>>>    		args->reset_count = i915_reset_count(&dev_priv-
>>>> gpu_error);
>>> -	else
>>> +		for_each_engine(engine, dev_priv, id)
>>> +			args->reset_engine_count +=
>>> +				i915_reset_engine_count(&dev_priv-
>>>> gpu_error,
>>> +							engine);
>>
>> If access to global GPU reset count is privileged, why is access to
>> global engine reset count not? It seems to be fundamentally same
>> level
>> of data leakage.
> 
> But access to global engine reset count (i915_reset_engine_count) is
> indeed priviledged. They both are inside if(CAP_SYS_ADMIN){...}, or
> maybe I am missing something?

Looks like I misread the diff, sorry. Been processing a lot of patches 
lately.

Regards,

Tvrtko

>>
>> If we wanted to provide some numbers to unprivileged users I think
>> we
>> would need to store some counters per file_priv/context and return
>> those
>> when !CAP_SYS_ADMIN.
> 
> The question would be why access to global GPU reset count is
> priviledged then? I can't think of a reason why it should be
> priviledged. I think the new counter (per engine) should fall in the
> same category as the global GPU reset one, right? So, can we make them
> both unpriviledged?
> 
> 
>>
>>> +	} else {
>>>    		args->reset_count = 0;
>>> +		args->reset_engine_count = 0;
>>> +	}
>>>    
>>>    	args->batch_active = atomic_read(&ctx->guilty_count);
>>>    	args->batch_pending = atomic_read(&ctx->active_count);
>>> diff --git a/include/uapi/drm/i915_drm.h
>>> b/include/uapi/drm/i915_drm.h
>>> index cc03ef9f885f..3f2c89740b0e 100644
>>> --- a/include/uapi/drm/i915_drm.h
>>> +++ b/include/uapi/drm/i915_drm.h
>>> @@ -1642,7 +1642,11 @@ struct drm_i915_reset_stats {
>>>    	/* Number of batches lost pending for execution, for this
>>> context */
>>>    	__u32 batch_pending;
>>>    
>>> -	__u32 pad;
>>> +	union {
>>> +		__u32 pad;
>>> +		/* Engine resets since boot/module reload, for all
>>> contexts */
>>> +		__u32 reset_engine_count;
>>> +	};
>>
>> Chris pointed out in some other review that anonymous unions are not
>> friendly towards C++ compilers.
>>
>> Not sure what is the best option here. Renaming the field could
>> break
>> old userspace building against newer headers. Is that acceptable?
>>
> 
> I dug up some old comments from Chris and he stated that recycling the
> union like that would be a bad idea since that would make the pad field
> an output only parameter thus invalidating gem_reset_stats...
> 
> Why can't we simply add a new field __u32 reset_engine_count; as part
> of the drm_i915_reset_stats struct?
> 
> Regards,
> Carlos
> 
>>>    };
>>>    
>>>    struct drm_i915_gem_userptr {
>>>
>>
>> Regards,
>>
>> Tvrtko
> 
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 459f8eae1c39..cbfe8f2eb3f2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1889,6 +1889,8 @@  int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct drm_i915_reset_stats *args = data;
 	struct i915_gem_context *ctx;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	int ret;
 
 	if (args->flags || args->pad)
@@ -1907,10 +1909,16 @@  int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 	 * we should wrap the hangstats with a seqlock.
 	 */
 
-	if (capable(CAP_SYS_ADMIN))
+	if (capable(CAP_SYS_ADMIN)) {
 		args->reset_count = i915_reset_count(&dev_priv->gpu_error);
-	else
+		for_each_engine(engine, dev_priv, id)
+			args->reset_engine_count +=
+				i915_reset_engine_count(&dev_priv->gpu_error,
+							engine);
+	} else {
 		args->reset_count = 0;
+		args->reset_engine_count = 0;
+	}
 
 	args->batch_active = atomic_read(&ctx->guilty_count);
 	args->batch_pending = atomic_read(&ctx->active_count);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index cc03ef9f885f..3f2c89740b0e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1642,7 +1642,11 @@  struct drm_i915_reset_stats {
 	/* Number of batches lost pending for execution, for this context */
 	__u32 batch_pending;
 
-	__u32 pad;
+	union {
+		__u32 pad;
+		/* Engine resets since boot/module reload, for all contexts */
+		__u32 reset_engine_count;
+	};
 };
 
 struct drm_i915_gem_userptr {