diff mbox

drm/i915: Add process identifier to requests

Message ID 1423666214-6145-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Feb. 11, 2015, 2:50 p.m. UTC
We use the pid of the process which opened our device when
we track which was the culprit of the gpu hang. But as that
file descriptor might get inherited, we might blame the
wrong process when we record the error state.

Track process identifiers in requests to always find
the correct offender.

Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       | 3 +++
 drivers/gpu/drm/i915/i915_gem.c       | 3 +++
 drivers/gpu/drm/i915/i915_gpu_error.c | 5 ++---
 3 files changed, 8 insertions(+), 3 deletions(-)

Comments

Chris Wilson Feb. 11, 2015, 3:29 p.m. UTC | #1
On Wed, Feb 11, 2015 at 04:50:14PM +0200, Mika Kuoppala wrote:
> We use the pid of the process which opened our device when
> we track which was the culprit of the gpu hang. But as that
> file descriptor might get inherited, we might blame the
> wrong process when we record the error state.
> 
> Track process identifiers in requests to always find
> the correct offender.
> 
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       | 3 +++
>  drivers/gpu/drm/i915/i915_gem.c       | 3 +++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 5 ++---
>  3 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c0b8644..9093654 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2153,6 +2153,9 @@ struct drm_i915_gem_request {
>  	/** file_priv list entry for this request */
>  	struct list_head client_list;
>  
> +	/** process identifier submitting this request */
> +	struct pid *pid;
> +
>  	uint32_t uniq;
>  
>  	/**
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c26d36c..47affaf 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2483,6 +2483,7 @@ int __i915_add_request(struct intel_engine_cs *ring,
>  	request->emitted_jiffies = jiffies;
>  	list_add_tail(&request->list, &ring->request_list);
>  	request->file_priv = NULL;
> +	request->pid = get_pid(task_pid(current));
>  
>  	if (file) {

I would suggest you only track processes for requests submitted by
userspace. Then if there is no associated pid, we know that the kernel
was in control (and not stuck figuring out if kworker was acting on
behalf of the user or the kernel).
-Chris
Shuang He Feb. 12, 2015, 6:49 a.m. UTC | #2
Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 5760
-------------------------------------Summary-------------------------------------
Platform          Delta          drm-intel-nightly          Series Applied
PNV                 -1              282/282              281/282
ILK                                  313/313              313/313
SNB                                  309/323              309/323
IVB                                  380/380              380/380
BYT                                  296/296              296/296
HSW                 -1              425/425              424/425
BDW                 -1              318/318              317/318
-------------------------------------Detailed-------------------------------------
Platform  Test                                drm-intel-nightly          Series Applied
*PNV  igt_gen3_render_linear_blits      PASS(2, M23)      CRASH(1, M23)PASS(1, M23)
 HSW  igt_kms_flip_plain-flip-fb-recreate-interruptible      TIMEOUT(2, M20)PASS(1, M20)      TIMEOUT(1, M20)PASS(1, M20)
*BDW  igt_gem_gtt_hog      PASS(4, M30)      DMESG_WARN(1, M30)PASS(1, M30)
Note: You need to pay more attention to lines starting with '*'
John Harrison Feb. 13, 2015, 4:24 p.m. UTC | #3
On 11/02/2015 15:29, Chris Wilson wrote:
> On Wed, Feb 11, 2015 at 04:50:14PM +0200, Mika Kuoppala wrote:
>> We use the pid of the process which opened our device when
>> we track which was the culprit of the gpu hang. But as that
>> file descriptor might get inherited, we might blame the
>> wrong process when we record the error state.
>>
>> Track process identifiers in requests to always find
>> the correct offender.
>>
>> Cc: Kenneth Graunke <kenneth@whitecape.org>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h       | 3 +++
>>   drivers/gpu/drm/i915/i915_gem.c       | 3 +++
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 5 ++---
>>   3 files changed, 8 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index c0b8644..9093654 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2153,6 +2153,9 @@ struct drm_i915_gem_request {
>>   	/** file_priv list entry for this request */
>>   	struct list_head client_list;
>>   
>> +	/** process identifier submitting this request */
>> +	struct pid *pid;
>> +
>>   	uint32_t uniq;
>>   
>>   	/**
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index c26d36c..47affaf 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2483,6 +2483,7 @@ int __i915_add_request(struct intel_engine_cs *ring,
>>   	request->emitted_jiffies = jiffies;
>>   	list_add_tail(&request->list, &ring->request_list);
>>   	request->file_priv = NULL;
>> +	request->pid = get_pid(task_pid(current));
>>   
>>   	if (file) {
> I would suggest you only track processes for requests submitted by
> userspace. Then if there is no associated pid, we know that the kernel
> was in control (and not stuck figuring out if kworker was acting on
> behalf of the user or the kernel).
> -Chris
>

With the GPU scheduler, the actual batch buffer submission via 
i915_add_request() could be disconnected from the original IOCTL call 
into the driver. Thus the recorded pid would be the kernel worker thread 
not the user land application. Is there any particular reason why the 
pid could not be recorded when the request is first created rather than 
when it is submitted?
Chris Wilson Feb. 13, 2015, 4:54 p.m. UTC | #4
On Fri, Feb 13, 2015 at 04:24:36PM +0000, John Harrison wrote:
> On 11/02/2015 15:29, Chris Wilson wrote:
> >On Wed, Feb 11, 2015 at 04:50:14PM +0200, Mika Kuoppala wrote:
> >>We use the pid of the process which opened our device when
> >>we track which was the culprit of the gpu hang. But as that
> >>file descriptor might get inherited, we might blame the
> >>wrong process when we record the error state.
> >>
> >>Track process identifiers in requests to always find
> >>the correct offender.
> >>
> >>Cc: Kenneth Graunke <kenneth@whitecape.org>
> >>Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >>---
> >>  drivers/gpu/drm/i915/i915_drv.h       | 3 +++
> >>  drivers/gpu/drm/i915/i915_gem.c       | 3 +++
> >>  drivers/gpu/drm/i915/i915_gpu_error.c | 5 ++---
> >>  3 files changed, 8 insertions(+), 3 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >>index c0b8644..9093654 100644
> >>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>@@ -2153,6 +2153,9 @@ struct drm_i915_gem_request {
> >>  	/** file_priv list entry for this request */
> >>  	struct list_head client_list;
> >>+	/** process identifier submitting this request */
> >>+	struct pid *pid;
> >>+
> >>  	uint32_t uniq;
> >>  	/**
> >>diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>index c26d36c..47affaf 100644
> >>--- a/drivers/gpu/drm/i915/i915_gem.c
> >>+++ b/drivers/gpu/drm/i915/i915_gem.c
> >>@@ -2483,6 +2483,7 @@ int __i915_add_request(struct intel_engine_cs *ring,
> >>  	request->emitted_jiffies = jiffies;
> >>  	list_add_tail(&request->list, &ring->request_list);
> >>  	request->file_priv = NULL;
> >>+	request->pid = get_pid(task_pid(current));
> >>  	if (file) {
> >I would suggest you only track processes for requests submitted by
> >userspace. Then if there is no associated pid, we know that the kernel
> >was in control (and not stuck figuring out if kworker was acting on
> >behalf of the user or the kernel).
> >-Chris
> >
> 
> With the GPU scheduler, the actual batch buffer submission via
> i915_add_request() could be disconnected from the original IOCTL
> call into the driver.

However, with requests i915_add_request() should have been replaced by an
engine/scheduler vfunc. The request is constructed from the execbuffer,
thus can be associated with the user process. The request is then
submitted intact to the scheduler for queueing for submission to the
hardware.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c0b8644..9093654 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2153,6 +2153,9 @@  struct drm_i915_gem_request {
 	/** file_priv list entry for this request */
 	struct list_head client_list;
 
+	/** process identifier submitting this request */
+	struct pid *pid;
+
 	uint32_t uniq;
 
 	/**
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c26d36c..47affaf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2483,6 +2483,7 @@  int __i915_add_request(struct intel_engine_cs *ring,
 	request->emitted_jiffies = jiffies;
 	list_add_tail(&request->list, &ring->request_list);
 	request->file_priv = NULL;
+	request->pid = get_pid(task_pid(current));
 
 	if (file) {
 		struct drm_i915_file_private *file_priv = file->driver_priv;
@@ -2572,6 +2573,8 @@  static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	list_del(&request->list);
 	i915_gem_request_remove_from_client(request);
 
+	put_pid(request->pid);
+
 	i915_gem_request_unreference(request);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 48ddbf4..a982849 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -994,12 +994,11 @@  static void i915_gem_record_rings(struct drm_device *dev,
 					i915_error_ggtt_object_create(dev_priv,
 							     ring->scratch.obj);
 
-			if (request->file_priv) {
+			if (request->pid) {
 				struct task_struct *task;
 
 				rcu_read_lock();
-				task = pid_task(request->file_priv->file->pid,
-						PIDTYPE_PID);
+				task = pid_task(request->pid, PIDTYPE_PID);
 				if (task) {
 					strcpy(error->ring[i].comm, task->comm);
 					error->ring[i].pid = task->pid;