diff mbox

[2/2] iommu/msm: wire up fault handling

Message ID 1471015747-569-2-git-send-email-robdclark@gmail.com (mailing list archive)
State Not Applicable, archived
Delegated to: Andy Gross
Headers show

Commit Message

Rob Clark Aug. 12, 2016, 3:29 p.m. UTC
When things go wrong on the gpu, we can get *thousands* of faults.  With
so many pr_err() prints, which were slowing down resuming the iommu,
drm/msm would think the GPU had actually hung and reset it.

Wire up the fault reporting, so that we instead get a small ratelimited
print of the fault address from drm/msm's fault handler.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/iommu/msm_iommu.c | 16 +++++++++++-----
 drivers/iommu/msm_iommu.h |  3 +++
 2 files changed, 14 insertions(+), 5 deletions(-)

Comments

Sricharan Ramabadhran Aug. 12, 2016, 4:17 p.m. UTC | #1
Hi,

>When things go wrong on the gpu, we can get *thousands* of faults.  With
>so many pr_err() prints, which were slowing down resuming the iommu,
>drm/msm would think the GPU had actually hung and reset it.
>
>Wire up the fault reporting, so instead we get a small ratelimited print
>of the fault address from drm/msm's fault handler instead.
>
>Signed-off-by: Rob Clark <robdclark@gmail.com>
>---
> drivers/iommu/msm_iommu.c | 16 +++++++++++-----
> drivers/iommu/msm_iommu.h |  3 +++
> 2 files changed, 14 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
>index f6f596f..1110b72 100644
>--- a/drivers/iommu/msm_iommu.c
>+++ b/drivers/iommu/msm_iommu.c
>@@ -411,6 +411,7 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
> 			}
> 			__disable_clocks(iommu);
> 			list_add(&iommu->dom_node, &priv->list_attached);
>+			iommu->domain = domain;
> 		}
> 	}
>
>@@ -614,8 +615,8 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
> 		goto fail;
> 	}
>
>-	pr_err("Unexpected IOMMU page fault!\n");
>-	pr_err("base = %08x\n", (unsigned int)iommu->base);
>+	pr_debug("Unexpected IOMMU page fault!\n");

              So I was just wondering whether it would be better to keep only this one as a
              ratelimited print, for global faults? Otherwise,
                Reviewed-by: sricharan@codeaurora.org
 
Regards,
 Sricharan
  


--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rob Clark Aug. 12, 2016, 4:39 p.m. UTC | #2
On Fri, Aug 12, 2016 at 12:17 PM, Sricharan <sricharan@codeaurora.org> wrote:
> Hi,
>
>>When things go wrong on the gpu, we can get *thousands* of faults.  With
>>so many pr_err() prints, which were slowing down resuming the iommu,
>>drm/msm would think the GPU had actually hung and reset it.
>>
>>Wire up the fault reporting, so instead we get a small ratelimited print
>>of the fault address from drm/msm's fault handler instead.
>>
>>Signed-off-by: Rob Clark <robdclark@gmail.com>
>>---
>> drivers/iommu/msm_iommu.c | 16 +++++++++++-----
>> drivers/iommu/msm_iommu.h |  3 +++
>> 2 files changed, 14 insertions(+), 5 deletions(-)
>>
>>diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
>>index f6f596f..1110b72 100644
>>--- a/drivers/iommu/msm_iommu.c
>>+++ b/drivers/iommu/msm_iommu.c
>>@@ -411,6 +411,7 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
>>                       }
>>                       __disable_clocks(iommu);
>>                       list_add(&iommu->dom_node, &priv->list_attached);
>>+                      iommu->domain = domain;
>>               }
>>       }
>>
>>@@ -614,8 +615,8 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
>>               goto fail;
>>       }
>>
>>-      pr_err("Unexpected IOMMU page fault!\n");
>>-      pr_err("base = %08x\n", (unsigned int)iommu->base);
>>+      pr_debug("Unexpected IOMMU page fault!\n");
>
>               So was just thinking if its better to have only this as a ratelimited print,
>               for global faults ?, otherwise

It is possibly a good idea to ratelimit the pr_err prints that are
emitted when no fault handler is installed, although in the case where
there is a handler, I don't think we should print anything.  (At
least not unless DEBUG is defined.)

If we can actually resume the faulting memory transaction, then we
could use this to implement virtual memory for the GPU, like the HMM
stuff.. in order to use malloc'd memory with the gpu without having to
pin..

(I know we can resume future memory transactions, but not sure if we
can update iommu page tables and resume the transaction that triggered
the fault..)

BR,
-R

>                 Reviewed-by: sricharan@codeaurora.org
>
> Regards,
>  Sricharan
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index f6f596f..1110b72 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -411,6 +411,7 @@  static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 			}
 			__disable_clocks(iommu);
 			list_add(&iommu->dom_node, &priv->list_attached);
+			iommu->domain = domain;
 		}
 	}
 
@@ -614,8 +615,8 @@  irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
 		goto fail;
 	}
 
-	pr_err("Unexpected IOMMU page fault!\n");
-	pr_err("base = %08x\n", (unsigned int)iommu->base);
+	pr_debug("Unexpected IOMMU page fault!\n");
+	pr_debug("base = %08x\n", (unsigned int)iommu->base);
 
 	ret = __enable_clocks(iommu);
 	if (ret)
@@ -624,9 +625,14 @@  irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
 	for (i = 0; i < iommu->ncb; i++) {
 		fsr = GET_FSR(iommu->base, i);
 		if (fsr) {
-			pr_err("Fault occurred in context %d.\n", i);
-			pr_err("Interesting registers:\n");
-			print_ctx_regs(iommu->base, i);
+			int ret = report_iommu_fault(iommu->domain,
+					to_msm_priv(iommu->domain)->dev,
+					GET_FAR(iommu->base, i), 0);
+			if (ret == -ENOSYS) {
+				pr_err("Fault occurred in context %d.\n", i);
+				pr_err("Interesting registers:\n");
+				print_ctx_regs(iommu->base, i);
+			}
 			SET_FSR(iommu->base, i, 0x4000000F);
 			SET_RESUME(iommu->base, i, 1);
 		}
diff --git a/drivers/iommu/msm_iommu.h b/drivers/iommu/msm_iommu.h
index 4ca25d5..c53016c 100644
--- a/drivers/iommu/msm_iommu.h
+++ b/drivers/iommu/msm_iommu.h
@@ -56,6 +56,8 @@ 
  * dom_node:	list head for domain
  * ctx_list:	list of 'struct msm_iommu_ctx_dev'
  * context_map: Bitmap to track allocated context banks
+ * domain:	iommu domain that this iommu dev is a member of,
+ * 		ie. whose msm_priv::list_attached are we on?
  */
 struct msm_iommu_dev {
 	void __iomem *base;
@@ -68,6 +70,7 @@  struct msm_iommu_dev {
 	struct list_head dom_node;
 	struct list_head ctx_list;
 	DECLARE_BITMAP(context_map, IOMMU_MAX_CBS);
+	struct iommu_domain *domain;
 };
 
 /**