diff mbox series

drm/i915: Skip the ERR_PTR error state

Message ID 20181207091640.709-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series drm/i915: Skip the ERR_PTR error state | expand

Commit Message

Chris Wilson Dec. 7, 2018, 9:16 a.m. UTC
Although commit fb6f0b64e455 ("drm/i915: Prevent machine hang from
Broxton's vtd w/a and error capture") applied cleanly after a 24 month
hiatus, the code had moved on with new methods for peeking and fetching
the captured gpu info. Make sure we catch all uses of the stashed error
state and avoid dereferencing the error pointer.

v2: Move error pointer determination into i915_gpu_capture_state

Fixes: fb6f0b64e455 ("drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 12 +++++++++---
 drivers/gpu/drm/i915/i915_gpu_error.c | 26 ++++++++++++++------------
 drivers/gpu/drm/i915/i915_sysfs.c     |  4 +++-
 3 files changed, 26 insertions(+), 16 deletions(-)

Comments

Tvrtko Ursulin Dec. 7, 2018, 10:45 a.m. UTC | #1
On 07/12/2018 09:16, Chris Wilson wrote:
> Although commit fb6f0b64e455 ("drm/i915: Prevent machine hang from
> Broxton's vtd w/a and error capture") applied cleanly after a 24 month
> hiatus, the code had moved on with new methods for peeking and fetching
> the captured gpu info. Make sure we catch all uses of the stashed error
> state and avoid dereferencing the error pointer.
> 
> v2: Move error pointer determination into i915_gpu_capture_state
> 
> Fixes: fb6f0b64e455 ("drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c   | 12 +++++++++---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 26 ++++++++++++++------------
>   drivers/gpu/drm/i915/i915_sysfs.c     |  4 +++-
>   3 files changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 38dcee1ca062..40a61ef9aac1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -984,8 +984,8 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
>   	intel_runtime_pm_get(i915);
>   	gpu = i915_capture_gpu_state(i915);
>   	intel_runtime_pm_put(i915);
> -	if (!gpu)
> -		return -ENOMEM;
> +	if (IS_ERR(gpu))
> +		return PTR_ERR(gpu);
>   
>   	file->private_data = gpu;
>   	return 0;
> @@ -1018,7 +1018,13 @@ i915_error_state_write(struct file *filp,
>   
>   static int i915_error_state_open(struct inode *inode, struct file *file)
>   {
> -	file->private_data = i915_first_error_state(inode->i_private);
> +	struct i915_gpu_state *error;
> +
> +	error = i915_first_error_state(inode->i_private);
> +	if (IS_ERR(error))
> +		return PTR_ERR(error);
> +
> +	file->private_data  = error;
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 07465123c166..9dc6600544c5 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1907,9 +1907,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
>   {
>   	struct i915_gpu_state *error;
>   
> +	/* Check if GPU capture has been disabled */
> +	error = READ_ONCE(i915->gpu_error.first_error);
> +	if (IS_ERR(error))
> +		return error;
> +
>   	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> -	if (!error)
> -		return NULL;
> +	if (!error) {
> +		i915_disable_error_state(i915, -ENOMEM);
> +		return ERR_PTR(-ENOMEM);
> +	}
>   
>   	kref_init(&error->ref);
>   	error->i915 = i915;
> @@ -1941,15 +1948,9 @@ void i915_capture_error_state(struct drm_i915_private *i915,
>   	if (!i915_modparams.error_capture)
>   		return;
>   
> -	if (READ_ONCE(i915->gpu_error.first_error))
> -		return;
> -

Will it now overwrite the first error on subsequent hangs? 
i915_capture_gpu_state only does early exit on stored error.

Regards,

Tvrtko

>   	error = i915_capture_gpu_state(i915);
> -	if (!error) {
> -		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
> -		i915_disable_error_state(i915, -ENOMEM);
> +	if (IS_ERR(error))
>   		return;
> -	}
>   
>   	i915_error_capture_msg(i915, error, engine_mask, error_msg);
>   	DRM_INFO("%s\n", error->error_msg);
> @@ -1987,7 +1988,7 @@ i915_first_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	if (error)
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_get(error);
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> @@ -2000,10 +2001,11 @@ void i915_reset_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	i915->gpu_error.first_error = NULL;
> +	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
> +		i915->gpu_error.first_error = NULL;
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> -	if (!IS_ERR(error))
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_put(error);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> index 535caebd9813..c0cfe7ae2ba5 100644
> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> @@ -521,7 +521,9 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
>   	ssize_t ret;
>   
>   	gpu = i915_first_error_state(i915);
> -	if (gpu) {
> +	if (IS_ERR(gpu)) {
> +		ret = PTR_ERR(gpu);
> +	} else if (gpu) {
>   		ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
>   		i915_gpu_state_put(gpu);
>   	} else {
>
Chris Wilson Dec. 7, 2018, 11:03 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-12-07 10:45:17)
> > @@ -1941,15 +1948,9 @@ void i915_capture_error_state(struct drm_i915_private *i915,
> >       if (!i915_modparams.error_capture)
> >               return;
> >   
> > -     if (READ_ONCE(i915->gpu_error.first_error))
> > -             return;
> > -
> 
> Will it now overwrite the first error on subsequent hangs? 
> i915_capture_gpu_state only does early exit on stored error.

No, after we capture the state, we do another check under the spinlock
to make sure we don't overwrite existing state.

However, that early check is still useful to avoid capturing subsequent
errors and throwing them away.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 38dcee1ca062..40a61ef9aac1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -984,8 +984,8 @@  static int i915_gpu_info_open(struct inode *inode, struct file *file)
 	intel_runtime_pm_get(i915);
 	gpu = i915_capture_gpu_state(i915);
 	intel_runtime_pm_put(i915);
-	if (!gpu)
-		return -ENOMEM;
+	if (IS_ERR(gpu))
+		return PTR_ERR(gpu);
 
 	file->private_data = gpu;
 	return 0;
@@ -1018,7 +1018,13 @@  i915_error_state_write(struct file *filp,
 
 static int i915_error_state_open(struct inode *inode, struct file *file)
 {
-	file->private_data = i915_first_error_state(inode->i_private);
+	struct i915_gpu_state *error;
+
+	error = i915_first_error_state(inode->i_private);
+	if (IS_ERR(error))
+		return PTR_ERR(error);
+
+	file->private_data  = error;
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 07465123c166..9dc6600544c5 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1907,9 +1907,16 @@  i915_capture_gpu_state(struct drm_i915_private *i915)
 {
 	struct i915_gpu_state *error;
 
+	/* Check if GPU capture has been disabled */
+	error = READ_ONCE(i915->gpu_error.first_error);
+	if (IS_ERR(error))
+		return error;
+
 	error = kzalloc(sizeof(*error), GFP_ATOMIC);
-	if (!error)
-		return NULL;
+	if (!error) {
+		i915_disable_error_state(i915, -ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	kref_init(&error->ref);
 	error->i915 = i915;
@@ -1941,15 +1948,9 @@  void i915_capture_error_state(struct drm_i915_private *i915,
 	if (!i915_modparams.error_capture)
 		return;
 
-	if (READ_ONCE(i915->gpu_error.first_error))
-		return;
-
 	error = i915_capture_gpu_state(i915);
-	if (!error) {
-		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
-		i915_disable_error_state(i915, -ENOMEM);
+	if (IS_ERR(error))
 		return;
-	}
 
 	i915_error_capture_msg(i915, error, engine_mask, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
@@ -1987,7 +1988,7 @@  i915_first_error_state(struct drm_i915_private *i915)
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
-	if (error)
+	if (!IS_ERR_OR_NULL(error))
 		i915_gpu_state_get(error);
 	spin_unlock_irq(&i915->gpu_error.lock);
 
@@ -2000,10 +2001,11 @@  void i915_reset_error_state(struct drm_i915_private *i915)
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
-	i915->gpu_error.first_error = NULL;
+	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
+		i915->gpu_error.first_error = NULL;
 	spin_unlock_irq(&i915->gpu_error.lock);
 
-	if (!IS_ERR(error))
+	if (!IS_ERR_OR_NULL(error))
 		i915_gpu_state_put(error);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 535caebd9813..c0cfe7ae2ba5 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -521,7 +521,9 @@  static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
 	ssize_t ret;
 
 	gpu = i915_first_error_state(i915);
-	if (gpu) {
+	if (IS_ERR(gpu)) {
+		ret = PTR_ERR(gpu);
+	} else if (gpu) {
 		ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
 		i915_gpu_state_put(gpu);
 	} else {