diff mbox

[2/7] drm/msm: add hang_debug module param

Message ID 1394470062-27442-3-git-send-email-robdclark@gmail.com (mailing list archive)
State Accepted
Headers show

Commit Message

Rob Clark March 10, 2014, 4:47 p.m. UTC
msm.hang_debug=y will dump out current register values if the gpu locks
up, for easier debugging.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c   | 34 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 18 +++++++++++++++++
 drivers/gpu/drm/msm/adreno/adreno_gpu.h |  1 +
 3 files changed, 52 insertions(+), 1 deletion(-)

Comments

Jordan Crouse March 10, 2014, 8:17 p.m. UTC | #1
On 03/10/2014 10:47 AM, Rob Clark wrote:
> msm.hang_debug=y will dump out current register values if the gpu locks
> up, for easier debugging.
>
> Signed-off-by: Rob Clark <robdclark@gmail.com>
> ---
>   drivers/gpu/drm/msm/adreno/a3xx_gpu.c   | 34 ++++++++++++++++++++++++++++++++-
>   drivers/gpu/drm/msm/adreno/adreno_gpu.c | 18 +++++++++++++++++
>   drivers/gpu/drm/msm/adreno/adreno_gpu.h |  1 +
>   3 files changed, 52 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> index 461df93..8b6fb84 100644
> --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> @@ -35,6 +35,12 @@
>   	 A3XX_INT0_CP_AHB_ERROR_HALT |     \
>   	 A3XX_INT0_UCHE_OOB_ACCESS)
>
> +
> +static bool hang_debug = false;
> +MODULE_PARM_DESC(hang_debug, "Dump registers when hang is detected (can be slow!)");
> +module_param_named(hang_debug, hang_debug, bool, 0600);
> +static void a3xx_dump(struct msm_gpu *gpu);
> +
>   static struct platform_device *a3xx_pdev;
>
>   static void a3xx_me_init(struct msm_gpu *gpu)
> @@ -291,6 +297,9 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
>
>   static void a3xx_recover(struct msm_gpu *gpu)
>   {
> +	/* dump registers before resetting gpu, if enabled: */
> +	if (hang_debug)
> +		a3xx_dump(gpu);

You will need to be careful here - not all the registers are suitable for reading back.
On some flavors of A330 in particular some HLSQ registers reads will bring the system
down. You can see the gory details in the KGSL driver.  If this works for you for now
great but fair warning that some cores might have different experiences.

>   	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 1);
>   	gpu_read(gpu, REG_A3XX_RBBM_SW_RESET_CMD);
>   	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 0);
> @@ -352,7 +361,6 @@ static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
>   	return IRQ_HANDLED;
>   }
>
> -#ifdef CONFIG_DEBUG_FS
>   static const unsigned int a3xx_registers[] = {
>   	0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
>   	0x0029, 0x002b, 0x002e, 0x0033, 0x0040, 0x0042, 0x0050, 0x005c,
> @@ -392,6 +400,7 @@ static const unsigned int a3xx_registers[] = {
>   	0x303c, 0x303c, 0x305e, 0x305f,
>   };
>
> +#ifdef CONFIG_DEBUG_FS
>   static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
>   {
>   	int i;
> @@ -415,6 +424,29 @@ static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
>   }
>   #endif
>
> +/* would be nice to not have to duplicate the _show() stuff with printk(): */
> +static void a3xx_dump(struct msm_gpu *gpu)
> +{
> +	int i;
> +
> +	adreno_dump(gpu);
> +	printk("status:   %08x\n",
> +			gpu_read(gpu, REG_A3XX_RBBM_STATUS));
> +
> +	/* dump these out in a form that can be parsed by demsm: */
> +	printk("IO:region %s 00000000 00020000\n", gpu->name);
> +	for (i = 0; i < ARRAY_SIZE(a3xx_registers); i += 2) {
> +		uint32_t start = a3xx_registers[i];
> +		uint32_t end   = a3xx_registers[i+1];
> +		uint32_t addr;
> +
> +		for (addr = start; addr <= end; addr++) {
> +			uint32_t val = gpu_read(gpu, addr);
> +			printk("IO:R %08x %08x\n", addr<<2, val);
> +		}
> +	}
> +}
> +
>   static const struct adreno_gpu_funcs funcs = {
>   	.base = {
>   		.get_param = adreno_get_param,
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index d321099..cf6eb97 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -260,6 +260,24 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
>   }
>   #endif
>
> +/* would be nice to not have to duplicate the _show() stuff with printk(): */
> +void adreno_dump(struct msm_gpu *gpu)
> +{
> +	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> +
> +	printk("revision: %d (%d.%d.%d.%d)\n",
> +			adreno_gpu->info->revn, adreno_gpu->rev.core,
> +			adreno_gpu->rev.major, adreno_gpu->rev.minor,
> +			adreno_gpu->rev.patchid);
> +
> +	printk("fence:    %d/%d\n", adreno_gpu->memptrs->fence,
> +			gpu->submitted_fence);
> +	printk("rptr:     %d\n", adreno_gpu->memptrs->rptr);
> +	printk("wptr:     %d\n", adreno_gpu->memptrs->wptr);
> +	printk("rb wptr:  %d\n", get_wptr(gpu->rb));
> +
> +}
> +

We should introduce you to the gooey wonderfulness of the snapshot :)  All in
due time.

Jordan
Rob Clark March 10, 2014, 8:27 p.m. UTC | #2
On Mon, Mar 10, 2014 at 4:17 PM, Jordan Crouse <jcrouse@codeaurora.org> wrote:
> On 03/10/2014 10:47 AM, Rob Clark wrote:
>>
>> msm.hang_debug=y will dump out current register values if the gpu locks
>> up, for easier debugging.
>>
>> Signed-off-by: Rob Clark <robdclark@gmail.com>
>> ---
>>   drivers/gpu/drm/msm/adreno/a3xx_gpu.c   | 34
>> ++++++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/msm/adreno/adreno_gpu.c | 18 +++++++++++++++++
>>   drivers/gpu/drm/msm/adreno/adreno_gpu.h |  1 +
>>   3 files changed, 52 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
>> b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
>> index 461df93..8b6fb84 100644
>> --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
>> @@ -35,6 +35,12 @@
>>          A3XX_INT0_CP_AHB_ERROR_HALT |     \
>>          A3XX_INT0_UCHE_OOB_ACCESS)
>>
>> +
>> +static bool hang_debug = false;
>> +MODULE_PARM_DESC(hang_debug, "Dump registers when hang is detected (can
>> be slow!)");
>> +module_param_named(hang_debug, hang_debug, bool, 0600);
>> +static void a3xx_dump(struct msm_gpu *gpu);
>> +
>>   static struct platform_device *a3xx_pdev;
>>
>>   static void a3xx_me_init(struct msm_gpu *gpu)
>> @@ -291,6 +297,9 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
>>
>>   static void a3xx_recover(struct msm_gpu *gpu)
>>   {
>> +       /* dump registers before resetting gpu, if enabled: */
>> +       if (hang_debug)
>> +               a3xx_dump(gpu);
>
>
> You will need to be careful here - not all the registers are suitable for
> reading back.
> On some flavors of A330 in particular some HLSQ registers reads will bring
> the system
> down. You can see the gory details in the KGSL driver.  If this works for
> you for now
> great but fair warning that some cores might have different experiences.

From what I've seen, certain types of GPU lockup (for example shader
const overrun), even on a320, leave things such that reading HLSQ
registers does bad things, which is why I don't enable the bootarg by
default.

If there is a way to *detect* when HLSQ is in a bad state and skip
those registers, that would be awesome.  I guess I should have another
look at latest kgsl to see if there is some trick I missed.

Or otherwise I should probably make the bootarg a bitmask so you can
choose to enable/disable certain groups of registers.

>
>>         gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 1);
>>         gpu_read(gpu, REG_A3XX_RBBM_SW_RESET_CMD);
>>         gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 0);
>> @@ -352,7 +361,6 @@ static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
>>         return IRQ_HANDLED;
>>   }
>>
>> -#ifdef CONFIG_DEBUG_FS
>>   static const unsigned int a3xx_registers[] = {
>>         0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
>>         0x0029, 0x002b, 0x002e, 0x0033, 0x0040, 0x0042, 0x0050, 0x005c,
>> @@ -392,6 +400,7 @@ static const unsigned int a3xx_registers[] = {
>>         0x303c, 0x303c, 0x305e, 0x305f,
>>   };
>>
>> +#ifdef CONFIG_DEBUG_FS
>>   static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
>>   {
>>         int i;
>> @@ -415,6 +424,29 @@ static void a3xx_show(struct msm_gpu *gpu, struct
>> seq_file *m)
>>   }
>>   #endif
>>
>> +/* would be nice to not have to duplicate the _show() stuff with
>> printk(): */
>> +static void a3xx_dump(struct msm_gpu *gpu)
>> +{
>> +       int i;
>> +
>> +       adreno_dump(gpu);
>> +       printk("status:   %08x\n",
>> +                       gpu_read(gpu, REG_A3XX_RBBM_STATUS));
>> +
>> +       /* dump these out in a form that can be parsed by demsm: */
>> +       printk("IO:region %s 00000000 00020000\n", gpu->name);
>> +       for (i = 0; i < ARRAY_SIZE(a3xx_registers); i += 2) {
>> +               uint32_t start = a3xx_registers[i];
>> +               uint32_t end   = a3xx_registers[i+1];
>> +               uint32_t addr;
>> +
>> +               for (addr = start; addr <= end; addr++) {
>> +                       uint32_t val = gpu_read(gpu, addr);
>> +                       printk("IO:R %08x %08x\n", addr<<2, val);
>> +               }
>> +       }
>> +}
>> +
>>   static const struct adreno_gpu_funcs funcs = {
>>         .base = {
>>                 .get_param = adreno_get_param,
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> index d321099..cf6eb97 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> @@ -260,6 +260,24 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file
>> *m)
>>   }
>>   #endif
>>
>> +/* would be nice to not have to duplicate the _show() stuff with
>> printk(): */
>> +void adreno_dump(struct msm_gpu *gpu)
>> +{
>> +       struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> +
>> +       printk("revision: %d (%d.%d.%d.%d)\n",
>> +                       adreno_gpu->info->revn, adreno_gpu->rev.core,
>> +                       adreno_gpu->rev.major, adreno_gpu->rev.minor,
>> +                       adreno_gpu->rev.patchid);
>> +
>> +       printk("fence:    %d/%d\n", adreno_gpu->memptrs->fence,
>> +                       gpu->submitted_fence);
>> +       printk("rptr:     %d\n", adreno_gpu->memptrs->rptr);
>> +       printk("wptr:     %d\n", adreno_gpu->memptrs->wptr);
>> +       printk("rb wptr:  %d\n", get_wptr(gpu->rb));
>> +
>> +}
>> +
>
>
> We should introduce you to the gooey wonderfulness of the snapshot :)  All
> in
> due time.

if I get the rd dumping upstream (for which I first need to fix some
chicken/egg problem in drm debugfs init), that plus scratch register
writes (for tile and draw # so I can "triangulate" the position), it's
actually not a bad way to debug.

(although I suppose there are some juicy status bits in the register
dump which would be pretty helpful too if I knew what they meant :-P)

BR,
-R


> Jordan
> --
> The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
diff mbox

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 461df93..8b6fb84 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -35,6 +35,12 @@ 
 	 A3XX_INT0_CP_AHB_ERROR_HALT |     \
 	 A3XX_INT0_UCHE_OOB_ACCESS)
 
+
+static bool hang_debug = false;
+MODULE_PARM_DESC(hang_debug, "Dump registers when hang is detected (can be slow!)");
+module_param_named(hang_debug, hang_debug, bool, 0600);
+static void a3xx_dump(struct msm_gpu *gpu);
+
 static struct platform_device *a3xx_pdev;
 
 static void a3xx_me_init(struct msm_gpu *gpu)
@@ -291,6 +297,9 @@  static int a3xx_hw_init(struct msm_gpu *gpu)
 
 static void a3xx_recover(struct msm_gpu *gpu)
 {
+	/* dump registers before resetting gpu, if enabled: */
+	if (hang_debug)
+		a3xx_dump(gpu);
 	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 1);
 	gpu_read(gpu, REG_A3XX_RBBM_SW_RESET_CMD);
 	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 0);
@@ -352,7 +361,6 @@  static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_DEBUG_FS
 static const unsigned int a3xx_registers[] = {
 	0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
 	0x0029, 0x002b, 0x002e, 0x0033, 0x0040, 0x0042, 0x0050, 0x005c,
@@ -392,6 +400,7 @@  static const unsigned int a3xx_registers[] = {
 	0x303c, 0x303c, 0x305e, 0x305f,
 };
 
+#ifdef CONFIG_DEBUG_FS
 static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
 {
 	int i;
@@ -415,6 +424,29 @@  static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
 }
 #endif
 
+/* would be nice to not have to duplicate the _show() stuff with printk(): */
+static void a3xx_dump(struct msm_gpu *gpu)
+{
+	int i;
+
+	adreno_dump(gpu);
+	printk("status:   %08x\n",
+			gpu_read(gpu, REG_A3XX_RBBM_STATUS));
+
+	/* dump these out in a form that can be parsed by demsm: */
+	printk("IO:region %s 00000000 00020000\n", gpu->name);
+	for (i = 0; i < ARRAY_SIZE(a3xx_registers); i += 2) {
+		uint32_t start = a3xx_registers[i];
+		uint32_t end   = a3xx_registers[i+1];
+		uint32_t addr;
+
+		for (addr = start; addr <= end; addr++) {
+			uint32_t val = gpu_read(gpu, addr);
+			printk("IO:R %08x %08x\n", addr<<2, val);
+		}
+	}
+}
+
 static const struct adreno_gpu_funcs funcs = {
 	.base = {
 		.get_param = adreno_get_param,
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index d321099..cf6eb97 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -260,6 +260,24 @@  void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
 }
 #endif
 
+/* would be nice to not have to duplicate the _show() stuff with printk(): */
+void adreno_dump(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+
+	printk("revision: %d (%d.%d.%d.%d)\n",
+			adreno_gpu->info->revn, adreno_gpu->rev.core,
+			adreno_gpu->rev.major, adreno_gpu->rev.minor,
+			adreno_gpu->rev.patchid);
+
+	printk("fence:    %d/%d\n", adreno_gpu->memptrs->fence,
+			gpu->submitted_fence);
+	printk("rptr:     %d\n", adreno_gpu->memptrs->rptr);
+	printk("wptr:     %d\n", adreno_gpu->memptrs->wptr);
+	printk("rb wptr:  %d\n", get_wptr(gpu->rb));
+
+}
+
 void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index ca11ea4..e16200d 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -114,6 +114,7 @@  void adreno_idle(struct msm_gpu *gpu);
 #ifdef CONFIG_DEBUG_FS
 void adreno_show(struct msm_gpu *gpu, struct seq_file *m);
 #endif
+void adreno_dump(struct msm_gpu *gpu);
 void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords);
 
 int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,