diff mbox series

[v7,21/24] iommu/arm-smmu-v3: Add stall support for platform devices

Message ID 20200519175502.2504091-22-jean-philippe@linaro.org (mailing list archive)
State New, archived
Headers show
Series iommu: Shared Virtual Addressing for SMMUv3 | expand

Commit Message

Jean-Philippe Brucker May 19, 2020, 5:54 p.m. UTC
The SMMU provides a Stall model for handling page faults in platform
devices. It is similar to PCI PRI, but doesn't require devices to have
their own translation cache. Instead, faulting transactions are parked
and the OS is given a chance to fix the page tables and retry the
transaction.

Enable stall for devices that support it (opt-in by firmware). When an
event corresponds to a translation error, call the IOMMU fault handler.
If the fault is recoverable, it will call us back to terminate or
continue the stall.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 drivers/iommu/Kconfig       |   1 +
 include/linux/iommu.h       |   2 +
 drivers/iommu/arm-smmu-v3.c | 284 ++++++++++++++++++++++++++++++++++--
 drivers/iommu/of_iommu.c    |   5 +-
 4 files changed, 281 insertions(+), 11 deletions(-)

Comments

Shameerali Kolothum Thodi June 1, 2020, 12:42 p.m. UTC | #1
Hi Jean,

> -----Original Message-----
> From: iommu [mailto:iommu-bounces@lists.linux-foundation.org] On Behalf Of
> Jean-Philippe Brucker
> Sent: 19 May 2020 18:55
> To: iommu@lists.linux-foundation.org; devicetree@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org; linux-pci@vger.kernel.org;
> linux-mm@kvack.org
> Cc: fenghua.yu@intel.com; kevin.tian@intel.com; jgg@ziepe.ca;
> catalin.marinas@arm.com; robin.murphy@arm.com; hch@infradead.org;
> zhangfei.gao@linaro.org; Jean-Philippe Brucker <jean-philippe@linaro.org>;
> felix.kuehling@amd.com; will@kernel.org; christian.koenig@amd.com
> Subject: [PATCH v7 21/24] iommu/arm-smmu-v3: Add stall support for platform
> devices
> 
> The SMMU provides a Stall model for handling page faults in platform
> devices. It is similar to PCI PRI, but doesn't require devices to have
> their own translation cache. Instead, faulting transactions are parked
> and the OS is given a chance to fix the page tables and retry the
> transaction.
> 
> Enable stall for devices that support it (opt-in by firmware). When an
> event corresponds to a translation error, call the IOMMU fault handler.
> If the fault is recoverable, it will call us back to terminate or
> continue the stall.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  drivers/iommu/Kconfig       |   1 +
>  include/linux/iommu.h       |   2 +
>  drivers/iommu/arm-smmu-v3.c | 284
> ++++++++++++++++++++++++++++++++++--
>  drivers/iommu/of_iommu.c    |   5 +-
>  4 files changed, 281 insertions(+), 11 deletions(-)
> 

[...]
 
> +static int arm_smmu_page_response(struct device *dev,
> +				  struct iommu_fault_event *unused,
> +				  struct iommu_page_response *resp)
> +{
> +	struct arm_smmu_cmdq_ent cmd = {0};
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +	int sid = master->streams[0].id;
> +
> +	if (master->stall_enabled) {
> +		cmd.opcode		= CMDQ_OP_RESUME;
> +		cmd.resume.sid		= sid;
> +		cmd.resume.stag		= resp->grpid;
> +		switch (resp->code) {
> +		case IOMMU_PAGE_RESP_INVALID:
> +		case IOMMU_PAGE_RESP_FAILURE:
> +			cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT;
> +			break;
> +		case IOMMU_PAGE_RESP_SUCCESS:
> +			cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	} else {
> +		/* TODO: insert PRI response here */
> +		return -ENODEV;
> +	}
> +
> +	arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
> +	/*
> +	 * Don't send a SYNC, it doesn't do anything for RESUME or PRI_RESP.
> +	 * RESUME consumption guarantees that the stalled transaction will be
> +	 * terminated... at some point in the future. PRI_RESP is fire and
> +	 * forget.
> +	 */
> +
> +	return 0;
> +}
> +
>  /* Context descriptor manipulation functions */
>  static void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16
> asid)
>  {
> @@ -1762,8 +1846,7 @@ static int arm_smmu_write_ctx_desc(struct
> arm_smmu_domain *smmu_domain,
>  			FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
>  			CTXDESC_CD_0_V;
> 
> -		/* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
> -		if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
> +		if (smmu_domain->stall_enabled)
>  			val |= CTXDESC_CD_0_S;
>  	}
> 
> @@ -2171,7 +2254,7 @@ static void arm_smmu_write_strtab_ent(struct
> arm_smmu_master *master, u32 sid,
>  			 FIELD_PREP(STRTAB_STE_1_STRW, strw));
> 
>  		if (smmu->features & ARM_SMMU_FEAT_STALLS &&
> -		   !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
> +		    !master->stall_enabled)
>  			dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
> 
>  		val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK)
> |
> @@ -2248,7 +2331,6 @@ static int arm_smmu_init_l2_strtab(struct
> arm_smmu_device *smmu, u32 sid)
>  	return 0;
>  }
> 
> -__maybe_unused
>  static struct arm_smmu_master *
>  arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
>  {
> @@ -2275,23 +2357,123 @@ arm_smmu_find_master(struct
> arm_smmu_device *smmu, u32 sid)
>  }
> 
>  /* IRQ and event handlers */
> +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
> +{
> +	int ret;
> +	u32 perm = 0;
> +	struct arm_smmu_master *master;
> +	bool ssid_valid = evt[0] & EVTQ_0_SSV;
> +	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> +	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
> +	struct iommu_fault_event fault_evt = { };
> +	struct iommu_fault *flt = &fault_evt.fault;
> +
> +	/* Stage-2 is always pinned at the moment */
> +	if (evt[1] & EVTQ_1_S2)
> +		return -EFAULT;
> +
> +	master = arm_smmu_find_master(smmu, sid);
> +	if (!master)
> +		return -EINVAL;
> +
> +	if (evt[1] & EVTQ_1_READ)
> +		perm |= IOMMU_FAULT_PERM_READ;
> +	else
> +		perm |= IOMMU_FAULT_PERM_WRITE;
> +
> +	if (evt[1] & EVTQ_1_EXEC)
> +		perm |= IOMMU_FAULT_PERM_EXEC;
> +
> +	if (evt[1] & EVTQ_1_PRIV)
> +		perm |= IOMMU_FAULT_PERM_PRIV;
> +
> +	if (evt[1] & EVTQ_1_STALL) {
> +		flt->type = IOMMU_FAULT_PAGE_REQ;
> +		flt->prm = (struct iommu_fault_page_request) {
> +			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
> +			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
> +			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
> +			.perm = perm,
> +			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
> +		};
> +

> +		if (ssid_valid)
> +			flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;

Do we need to set this for STALL mode only support? I had an issue with this
being set on a vSVA POC based on our D06 zip device(which is a "fake " pci dev
that supports STALL mode but no PRI). The issue is, CMDQ_OP_RESUME doesn't
have any ssid or SSV params and works on sid and stag only. Hence, it is difficult for
Qemu SMMUv3 to populate this fields while preparing a page response. I can see
that this flag is being checked in iopf_handle_single() (patch 04/24) as well. For POC,
I used a temp fix[1] to work around this. Please let me know your thoughts.

Thanks,
Shameer

1. https://github.com/hisilicon/kernel-dev/commit/99ff96146e924055f38d97a5897e4becfa378d15
Jean-Philippe Brucker June 2, 2020, 9:38 a.m. UTC | #2
Hi Shameer,

On Mon, Jun 01, 2020 at 12:42:15PM +0000, Shameerali Kolothum Thodi wrote:
> >  /* IRQ and event handlers */
> > +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
> > +{
> > +	int ret;
> > +	u32 perm = 0;
> > +	struct arm_smmu_master *master;
> > +	bool ssid_valid = evt[0] & EVTQ_0_SSV;
> > +	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> > +	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
> > +	struct iommu_fault_event fault_evt = { };
> > +	struct iommu_fault *flt = &fault_evt.fault;
> > +
> > +	/* Stage-2 is always pinned at the moment */
> > +	if (evt[1] & EVTQ_1_S2)
> > +		return -EFAULT;
> > +
> > +	master = arm_smmu_find_master(smmu, sid);
> > +	if (!master)
> > +		return -EINVAL;
> > +
> > +	if (evt[1] & EVTQ_1_READ)
> > +		perm |= IOMMU_FAULT_PERM_READ;
> > +	else
> > +		perm |= IOMMU_FAULT_PERM_WRITE;
> > +
> > +	if (evt[1] & EVTQ_1_EXEC)
> > +		perm |= IOMMU_FAULT_PERM_EXEC;
> > +
> > +	if (evt[1] & EVTQ_1_PRIV)
> > +		perm |= IOMMU_FAULT_PERM_PRIV;
> > +
> > +	if (evt[1] & EVTQ_1_STALL) {
> > +		flt->type = IOMMU_FAULT_PAGE_REQ;
> > +		flt->prm = (struct iommu_fault_page_request) {
> > +			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
> > +			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
> > +			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
> > +			.perm = perm,
> > +			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
> > +		};
> > +
> 
> > +		if (ssid_valid)
> > +			flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> 
> Do we need to set this for STALL mode only support? I had an issue with this
> being set on a vSVA POC based on our D06 zip device(which is a "fake " pci dev
> that supports STALL mode but no PRI). The issue is, CMDQ_OP_RESUME doesn't
> have any ssid or SSV params and works on sid and stag only.

I don't understand the problem, arm_smmu_page_response() doesn't set SSID
or SSV when sending a CMDQ_OP_RESUME. Could you detail the flow of a stall
event and RESUME command in your prototype?  Are you getting issues with
the host driver or the guest driver?

We do need to forward the SSV flag all the way to the guest driver, so the
guest can find the faulting address space using the SSID. Once the guest
handled the fault, then we don't send the SSID back to the host as part of
the RESUME command.

Thanks,
Jean

> Hence, it is difficult for
> Qemu SMMUv3 to populate this fields while preparing a page response. I can see
> that this flag is being checked in iopf_handle_single() (patch 04/24) as well. For POC,
> I used a temp fix[1] to work around this. Please let me know your thoughts.
> 
> Thanks,
> Shameer
> 
> 1. https://github.com/hisilicon/kernel-dev/commit/99ff96146e924055f38d97a5897e4becfa378d15
>
Shameerali Kolothum Thodi June 2, 2020, 10:31 a.m. UTC | #3
Hi Jean,

> -----Original Message-----
> From: linux-arm-kernel [mailto:linux-arm-kernel-bounces@lists.infradead.org]
> On Behalf Of Jean-Philippe Brucker
> Sent: 02 June 2020 10:39
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: devicetree@vger.kernel.org; kevin.tian@intel.com; will@kernel.org;
> fenghua.yu@intel.com; jgg@ziepe.ca; linux-pci@vger.kernel.org;
> felix.kuehling@amd.com; hch@infradead.org; linux-mm@kvack.org;
> iommu@lists.linux-foundation.org; catalin.marinas@arm.com;
> zhangfei.gao@linaro.org; robin.murphy@arm.com;
> christian.koenig@amd.com; linux-arm-kernel@lists.infradead.org
> Subject: Re: [PATCH v7 21/24] iommu/arm-smmu-v3: Add stall support for
> platform devices
> 
> Hi Shameer,
> 
> On Mon, Jun 01, 2020 at 12:42:15PM +0000, Shameerali Kolothum Thodi
> wrote:
> > >  /* IRQ and event handlers */
> > > +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64
> > > +*evt) {
> > > +	int ret;
> > > +	u32 perm = 0;
> > > +	struct arm_smmu_master *master;
> > > +	bool ssid_valid = evt[0] & EVTQ_0_SSV;
> > > +	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> > > +	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
> > > +	struct iommu_fault_event fault_evt = { };
> > > +	struct iommu_fault *flt = &fault_evt.fault;
> > > +
> > > +	/* Stage-2 is always pinned at the moment */
> > > +	if (evt[1] & EVTQ_1_S2)
> > > +		return -EFAULT;
> > > +
> > > +	master = arm_smmu_find_master(smmu, sid);
> > > +	if (!master)
> > > +		return -EINVAL;
> > > +
> > > +	if (evt[1] & EVTQ_1_READ)
> > > +		perm |= IOMMU_FAULT_PERM_READ;
> > > +	else
> > > +		perm |= IOMMU_FAULT_PERM_WRITE;
> > > +
> > > +	if (evt[1] & EVTQ_1_EXEC)
> > > +		perm |= IOMMU_FAULT_PERM_EXEC;
> > > +
> > > +	if (evt[1] & EVTQ_1_PRIV)
> > > +		perm |= IOMMU_FAULT_PERM_PRIV;
> > > +
> > > +	if (evt[1] & EVTQ_1_STALL) {
> > > +		flt->type = IOMMU_FAULT_PAGE_REQ;
> > > +		flt->prm = (struct iommu_fault_page_request) {
> > > +			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
> > > +			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
> > > +			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
> > > +			.perm = perm,
> > > +			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
> > > +		};
> > > +
> >
> > > +		if (ssid_valid)
> > > +			flt->prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> >
> > Do we need to set this for STALL mode only support? I had an issue
> > with this being set on a vSVA POC based on our D06 zip device(which is
> > a "fake " pci dev that supports STALL mode but no PRI). The issue is,
> > CMDQ_OP_RESUME doesn't have any ssid or SSV params and works on sid
> and stag only.
> 
> I don't understand the problem, arm_smmu_page_response() doesn't set SSID
> or SSV when sending a CMDQ_OP_RESUME. Could you detail the flow of a stall
> event and RESUME command in your prototype?  Are you getting issues with
> the host driver or the guest driver?

The issue is on the host side iommu_page_response(). The flow is something like
below.

Stall: Host:-

arm_smmu_handle_evt()
  iommu_report_device_fault()
    vfio_pci_iommu_dev_fault_handler()
      
Stall: Qemu:-

vfio_dma_fault_notifier_handler()
  inject_faults()
    smmuv3_inject_faults()

Stall: Guest:-

arm_smmu_handle_evt()
  iommu_report_device_fault()
    iommu_queue_iopf
  ...
  iopf_handle_group()
    iopf_handle_single()
      handle_mm_fault()
        iopf_complete()
           iommu_page_response()
             arm_smmu_page_response()
               arm_smmu_cmdq_issue_cmd(CMDQ_OP_RESUME)

Resume: Qemu:-

smmuv3_cmdq_consume(SMMU_CMD_RESUME)
  smmuv3_notify_page_resp()
    vfio:ioctl(page_response)  --> struct iommu_page_response is filled
                             with only version, grpid and code.

Resume: Host:-
  ioctl(page_response)
    iommu_page_response()  --> fails as the pending req has PASID_VALID flag
                             set and it checks for a match.
      arm_smmu_page_response()

Hope the above is clear.

> We do need to forward the SSV flag all the way to the guest driver, so the guest
> can find the faulting address space using the SSID. Once the guest handled the
> fault, then we don't send the SSID back to the host as part of the RESUME
> command.

True, the guest requires SSV flag to handle the page fault. But, as shown in the
flow above, the issue is on the host side iommu_page_response() where it
searches for a matching pending req based on pasid. Not sure we can bypass
that and call arm_smmu_page_response() directly but then have to delete the
pending req from the list as well.

Please let me know if there is a better way to handle the host side page
response.

Thanks,
Shameer

> Thanks,
> Jean
> 
> > Hence, it is difficult for
> > Qemu SMMUv3 to populate this fields while preparing a page response. I
> > can see that this flag is being checked in iopf_handle_single() (patch
> > 04/24) as well. For POC, I used a temp fix[1] to work around this. Please let
> me know your thoughts.
> >
> > Thanks,
> > Shameer
> >
> > 1.
> > https://github.com/hisilicon/kernel-dev/commit/99ff96146e924055f38d97a
> > 5897e4becfa378d15
> >
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Jean-Philippe Brucker June 2, 2020, 11:46 a.m. UTC | #4
On Tue, Jun 02, 2020 at 10:31:29AM +0000, Shameerali Kolothum Thodi wrote:
> Hi Jean,
> 
> > -----Original Message-----
> > From: linux-arm-kernel [mailto:linux-arm-kernel-bounces@lists.infradead.org]
> > On Behalf Of Jean-Philippe Brucker
> > Sent: 02 June 2020 10:39
> > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> > Cc: devicetree@vger.kernel.org; kevin.tian@intel.com; will@kernel.org;
> > fenghua.yu@intel.com; jgg@ziepe.ca; linux-pci@vger.kernel.org;
> > felix.kuehling@amd.com; hch@infradead.org; linux-mm@kvack.org;
> > iommu@lists.linux-foundation.org; catalin.marinas@arm.com;
> > zhangfei.gao@linaro.org; robin.murphy@arm.com;
> > christian.koenig@amd.com; linux-arm-kernel@lists.infradead.org
> > Subject: Re: [PATCH v7 21/24] iommu/arm-smmu-v3: Add stall support for
> > platform devices
> > 
> > Hi Shameer,
> > 
> > On Mon, Jun 01, 2020 at 12:42:15PM +0000, Shameerali Kolothum Thodi
> > wrote:
> > > >  /* IRQ and event handlers */
> > > > +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64
> > > > +*evt) {
> > > > +	int ret;
> > > > +	u32 perm = 0;
> > > > +	struct arm_smmu_master *master;
> > > > +	bool ssid_valid = evt[0] & EVTQ_0_SSV;
> > > > +	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> > > > +	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
> > > > +	struct iommu_fault_event fault_evt = { };
> > > > +	struct iommu_fault *flt = &fault_evt.fault;
> > > > +
> > > > +	/* Stage-2 is always pinned at the moment */
> > > > +	if (evt[1] & EVTQ_1_S2)
> > > > +		return -EFAULT;
> > > > +
> > > > +	master = arm_smmu_find_master(smmu, sid);
> > > > +	if (!master)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (evt[1] & EVTQ_1_READ)
> > > > +		perm |= IOMMU_FAULT_PERM_READ;
> > > > +	else
> > > > +		perm |= IOMMU_FAULT_PERM_WRITE;
> > > > +
> > > > +	if (evt[1] & EVTQ_1_EXEC)
> > > > +		perm |= IOMMU_FAULT_PERM_EXEC;
> > > > +
> > > > +	if (evt[1] & EVTQ_1_PRIV)
> > > > +		perm |= IOMMU_FAULT_PERM_PRIV;
> > > > +
> > > > +	if (evt[1] & EVTQ_1_STALL) {
> > > > +		flt->type = IOMMU_FAULT_PAGE_REQ;
> > > > +		flt->prm = (struct iommu_fault_page_request) {
> > > > +			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
> > > > +			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
> > > > +			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
> > > > +			.perm = perm,
> > > > +			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
> > > > +		};
> > > > +
> > >
> > > > +		if (ssid_valid)
> > > > +			flt->prm.flags |=
> > IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> > >
> > > Do we need to set this for STALL mode only support? I had an issue
> > > with this being set on a vSVA POC based on our D06 zip device(which is
> > > a "fake " pci dev that supports STALL mode but no PRI). The issue is,
> > > CMDQ_OP_RESUME doesn't have any ssid or SSV params and works on sid
> > and stag only.
> > 
> > I don't understand the problem, arm_smmu_page_response() doesn't set SSID
> > or SSV when sending a CMDQ_OP_RESUME. Could you detail the flow of a stall
> > event and RESUME command in your prototype?  Are you getting issues with
> > the host driver or the guest driver?
> 
> The issue is on the host side iommu_page_response(). The flow is something like
> below.
> 
> Stall: Host:-
> 
> arm_smmu_handle_evt()
>   iommu_report_device_fault()
>     vfio_pci_iommu_dev_fault_handler()
>       
> Stall: Qemu:-
> 
> vfio_dma_fault_notifier_handler()
>   inject_faults()
>     smmuv3_inject_faults()
> 
> Stall: Guest:-
> 
> arm_smmu_handle_evt()
>   iommu_report_device_fault()
>     iommu_queue_iopf
>   ...
>   iopf_handle_group()
>     iopf_handle_single()
>       handle_mm_fault()
>         iopf_complete()
>            iommu_page_response()
>              arm_smmu_page_response()
>                arm_smmu_cmdq_issue_cmd(CMDQ_OP_RESUME)
> 
> Resume: Qemu:-
> 
> smmuv3_cmdq_consume(SMMU_CMD_RESUME)
>   smmuv3_notify_page_resp()
>     vfio:ioctl(page_response)  --> struct iommu_page_response is filled
>                              with only version, grpid and code.
> 
> Resume: Host:-
>   ioctl(page_response)
>     iommu_page_response()  --> fails as the pending req has PASID_VALID flag
>                              set and it checks for a match.

I believe the fix needs to be here. It's also wrong for PRI since not all
PCIe endpoint require a PASID in the page response. Could you try the
attached patch?

Thanks,
Jean

>       arm_smmu_page_response()
> 
> Hope the above is clear.
> 
> > We do need to forward the SSV flag all the way to the guest driver, so the guest
> > can find the faulting address space using the SSID. Once the guest handled the
> > fault, then we don't send the SSID back to the host as part of the RESUME
> > command.
> 
> True, the guest requires SSV flag to handle the page fault. But, as shown in the
> flow above, the issue is on the host side iommu_page_response() where it
> searches for a matching pending req based on pasid. Not sure we can bypass
> that and call arm_smmu_page_response() directly but then have to delete the
> pending req from the list as well.
> 
> Please let me know if there is a better way to handle the host side page
> response.
> 
> Thanks,
> Shameer
> 
> > Thanks,
> > Jean
> > 
> > > Hence, it is difficult for
> > > Qemu SMMUv3 to populate this fields while preparing a page response. I
> > > can see that this flag is being checked in iopf_handle_single() (patch
> > > 04/24) as well. For POC, I used a temp fix[1] to work around this. Please let
> > me know your thoughts.
> > >
> > > Thanks,
> > > Shameer
> > >
> > > 1.
> > > https://github.com/hisilicon/kernel-dev/commit/99ff96146e924055f38d97a
> > > 5897e4becfa378d15
> > >
> > 
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
From 9baf5b9894d4e4be05e665d80fd0ebac8b621aa4 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Tue, 2 Jun 2020 13:13:27 +0200
Subject: [PATCH] iommu: Allow page responses without PASID

Some PCIe devices do not expect a PASID value in PRI Page Responses. If
the "PRG Response PASID Required" bit in the PRI capability is zero,
then the OS should not set the PASID field. Similarly on Arm SMMU,
responses to stall events do not have a PASID.

Currently iommu_page_response() checks that the PASID in the page
response corresponds to the one in the page request without first
checking the "PASID valid" bit. A page response coming from a guest OS
does not necessarily have a PASID, if the passed-through device does not
require one.

Allow page responses without PASID. The page request corresponding to a
page response is identified by device and by Page Response Group Index
(or stall tag).

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 drivers/iommu/iommu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index e61a9fc65b7e4..e481fdfafb77c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1296,7 +1296,8 @@ int iommu_page_response(struct device *dev,
 	 */
 	list_for_each_entry(evt, &param->fault_param->faults, list) {
 		prm = &evt->fault.prm;
-		pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+		pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID
+			   && msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
 
 		if ((pasid_valid && prm->pasid != msg->pasid) ||
 		    prm->grpid != msg->grpid)
Shameerali Kolothum Thodi June 2, 2020, 12:12 p.m. UTC | #5
> -----Original Message-----
> From: linux-arm-kernel [mailto:linux-arm-kernel-bounces@lists.infradead.org]
> On Behalf Of Jean-Philippe Brucker
> Sent: 02 June 2020 12:46
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: devicetree@vger.kernel.org; kevin.tian@intel.com; fenghua.yu@intel.com;
> linux-pci@vger.kernel.org; felix.kuehling@amd.com; robin.murphy@arm.com;
> christian.koenig@amd.com; hch@infradead.org; jgg@ziepe.ca;
> iommu@lists.linux-foundation.org; catalin.marinas@arm.com;
> zhangfei.gao@linaro.org; will@kernel.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org
> Subject: Re: [PATCH v7 21/24] iommu/arm-smmu-v3: Add stall support for
> platform devices
> 
> On Tue, Jun 02, 2020 at 10:31:29AM +0000, Shameerali Kolothum Thodi wrote:
> > Hi Jean,
> >
> > > -----Original Message-----
> > > From: linux-arm-kernel
> > > [mailto:linux-arm-kernel-bounces@lists.infradead.org]
> > > On Behalf Of Jean-Philippe Brucker
> > > Sent: 02 June 2020 10:39
> > > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> > > Cc: devicetree@vger.kernel.org; kevin.tian@intel.com;
> > > will@kernel.org; fenghua.yu@intel.com; jgg@ziepe.ca;
> > > linux-pci@vger.kernel.org; felix.kuehling@amd.com;
> > > hch@infradead.org; linux-mm@kvack.org;
> > > iommu@lists.linux-foundation.org; catalin.marinas@arm.com;
> > > zhangfei.gao@linaro.org; robin.murphy@arm.com;
> > > christian.koenig@amd.com; linux-arm-kernel@lists.infradead.org
> > > Subject: Re: [PATCH v7 21/24] iommu/arm-smmu-v3: Add stall support
> > > for platform devices
> > >
> > > Hi Shameer,
> > >
> > > On Mon, Jun 01, 2020 at 12:42:15PM +0000, Shameerali Kolothum Thodi
> > > wrote:
> > > > >  /* IRQ and event handlers */
> > > > > +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu,
> > > > > +u64
> > > > > +*evt) {
> > > > > +	int ret;
> > > > > +	u32 perm = 0;
> > > > > +	struct arm_smmu_master *master;
> > > > > +	bool ssid_valid = evt[0] & EVTQ_0_SSV;
> > > > > +	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> > > > > +	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
> > > > > +	struct iommu_fault_event fault_evt = { };
> > > > > +	struct iommu_fault *flt = &fault_evt.fault;
> > > > > +
> > > > > +	/* Stage-2 is always pinned at the moment */
> > > > > +	if (evt[1] & EVTQ_1_S2)
> > > > > +		return -EFAULT;
> > > > > +
> > > > > +	master = arm_smmu_find_master(smmu, sid);
> > > > > +	if (!master)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (evt[1] & EVTQ_1_READ)
> > > > > +		perm |= IOMMU_FAULT_PERM_READ;
> > > > > +	else
> > > > > +		perm |= IOMMU_FAULT_PERM_WRITE;
> > > > > +
> > > > > +	if (evt[1] & EVTQ_1_EXEC)
> > > > > +		perm |= IOMMU_FAULT_PERM_EXEC;
> > > > > +
> > > > > +	if (evt[1] & EVTQ_1_PRIV)
> > > > > +		perm |= IOMMU_FAULT_PERM_PRIV;
> > > > > +
> > > > > +	if (evt[1] & EVTQ_1_STALL) {
> > > > > +		flt->type = IOMMU_FAULT_PAGE_REQ;
> > > > > +		flt->prm = (struct iommu_fault_page_request) {
> > > > > +			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
> > > > > +			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
> > > > > +			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
> > > > > +			.perm = perm,
> > > > > +			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
> > > > > +		};
> > > > > +
> > > >
> > > > > +		if (ssid_valid)
> > > > > +			flt->prm.flags |=
> > > IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> > > >
> > > > Do we need to set this for STALL mode only support? I had an issue
> > > > with this being set on a vSVA POC based on our D06 zip
> > > > device(which is a "fake " pci dev that supports STALL mode but no
> > > > PRI). The issue is, CMDQ_OP_RESUME doesn't have any ssid or SSV
> > > > params and works on sid
> > > and stag only.
> > >
> > > I don't understand the problem, arm_smmu_page_response() doesn't set
> > > SSID or SSV when sending a CMDQ_OP_RESUME. Could you detail the flow
> > > of a stall event and RESUME command in your prototype?  Are you
> > > getting issues with the host driver or the guest driver?
> >
> > The issue is on the host side iommu_page_response(). The flow is
> > something like below.
> >
> > Stall: Host:-
> >
> > arm_smmu_handle_evt()
> >   iommu_report_device_fault()
> >     vfio_pci_iommu_dev_fault_handler()
> >
> > Stall: Qemu:-
> >
> > vfio_dma_fault_notifier_handler()
> >   inject_faults()
> >     smmuv3_inject_faults()
> >
> > Stall: Guest:-
> >
> > arm_smmu_handle_evt()
> >   iommu_report_device_fault()
> >     iommu_queue_iopf
> >   ...
> >   iopf_handle_group()
> >     iopf_handle_single()
> >       handle_mm_fault()
> >         iopf_complete()
> >            iommu_page_response()
> >              arm_smmu_page_response()
> >                arm_smmu_cmdq_issue_cmd(CMDQ_OP_RESUME)
> >
> > Resume: Qemu:-
> >
> > smmuv3_cmdq_consume(SMMU_CMD_RESUME)
> >   smmuv3_notify_page_resp()
> >     vfio:ioctl(page_response)  --> struct iommu_page_response is filled
> >                              with only version, grpid and code.
> >
> > Resume: Host:-
> >   ioctl(page_response)
> >     iommu_page_response()  --> fails as the pending req has PASID_VALID
> flag
> >                              set and it checks for a match.
> 
> I believe the fix needs to be here. It's also wrong for PRI since not all PCIe
> endpoint require a PASID in the page response. Could you try the attached
> patch?

Going through the patch, yes, that will definitely fix the issue. But isn't it better if
the request itself indicate whether it expects a response msg with a valid pasid or
not? The response msg can come from userspace as well(vSVA) and if for some reason
doesn't set it for a req that expects pasid then it should be an error, right? In the temp
fix I had, I introduced another flag to indicate the endpoint has PRI support or not and
used that to verify the pasid requirement. But for the PRI case you mentioned 
above, not sure it is easy to get that information or not. May be I am complicating things
here :)

Thanks,
Shameer

> Thanks,
> Jean
> 
> >       arm_smmu_page_response()
> >
> > Hope the above is clear.
> >
> > > We do need to forward the SSV flag all the way to the guest driver,
> > > so the guest can find the faulting address space using the SSID.
> > > Once the guest handled the fault, then we don't send the SSID back
> > > to the host as part of the RESUME command.
> >
> > True, the guest requires SSV flag to handle the page fault. But, as
> > shown in the flow above, the issue is on the host side
> > iommu_page_response() where it searches for a matching pending req
> > based on pasid. Not sure we can bypass that and call
> > arm_smmu_page_response() directly but then have to delete the pending req
> from the list as well.
> >
> > Please let me know if there is a better way to handle the host side
> > page response.
> >
> > Thanks,
> > Shameer
> >
> > > Thanks,
> > > Jean
> > >
> > > > Hence, it is difficult for
> > > > Qemu SMMUv3 to populate this fields while preparing a page
> > > > response. I can see that this flag is being checked in
> > > > iopf_handle_single() (patch
> > > > 04/24) as well. For POC, I used a temp fix[1] to work around this.
> > > > Please let
> > > me know your thoughts.
> > > >
> > > > Thanks,
> > > > Shameer
> > > >
> > > > 1.
> > > > https://github.com/hisilicon/kernel-dev/commit/99ff96146e924055f38
> > > > d97a
> > > > 5897e4becfa378d15
> > > >
> > >
> > > _______________________________________________
> > > linux-arm-kernel mailing list
> > > linux-arm-kernel@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Jean-Philippe Brucker June 3, 2020, 7:38 a.m. UTC | #6
On Tue, Jun 02, 2020 at 12:12:30PM +0000, Shameerali Kolothum Thodi wrote:
> > > > > > +		if (ssid_valid)
> > > > > > +			flt->prm.flags |=
> > > > IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> > > > >
> > > > > Do we need to set this for STALL mode only support? I had an issue
> > > > > with this being set on a vSVA POC based on our D06 zip
> > > > > device(which is a "fake " pci dev that supports STALL mode but no
> > > > > PRI). The issue is, CMDQ_OP_RESUME doesn't have any ssid or SSV
> > > > > params and works on sid
> > > > and stag only.
> > > >
> > > > I don't understand the problem, arm_smmu_page_response() doesn't set
> > > > SSID or SSV when sending a CMDQ_OP_RESUME. Could you detail the flow
> > > > of a stall event and RESUME command in your prototype?  Are you
> > > > getting issues with the host driver or the guest driver?
> > >
> > > The issue is on the host side iommu_page_response(). The flow is
> > > something like below.
> > >
> > > Stall: Host:-
> > >
> > > arm_smmu_handle_evt()
> > >   iommu_report_device_fault()
> > >     vfio_pci_iommu_dev_fault_handler()
> > >
> > > Stall: Qemu:-
> > >
> > > vfio_dma_fault_notifier_handler()
> > >   inject_faults()
> > >     smmuv3_inject_faults()
> > >
> > > Stall: Guest:-
> > >
> > > arm_smmu_handle_evt()
> > >   iommu_report_device_fault()
> > >     iommu_queue_iopf
> > >   ...
> > >   iopf_handle_group()
> > >     iopf_handle_single()
> > >       handle_mm_fault()
> > >         iopf_complete()
> > >            iommu_page_response()
> > >              arm_smmu_page_response()
> > >                arm_smmu_cmdq_issue_cmd(CMDQ_OP_RESUME)
> > >
> > > Resume: Qemu:-
> > >
> > > smmuv3_cmdq_consume(SMMU_CMD_RESUME)
> > >   smmuv3_notify_page_resp()
> > >     vfio:ioctl(page_response)  --> struct iommu_page_response is filled
> > >                              with only version, grpid and code.
> > >
> > > Resume: Host:-
> > >   ioctl(page_response)
> > >     iommu_page_response()  --> fails as the pending req has PASID_VALID
> > flag
> > >                              set and it checks for a match.
> > 
> > I believe the fix needs to be here. It's also wrong for PRI since not all PCIe
> > endpoint require a PASID in the page response. Could you try the attached
> > patch?
> 
> Going through the patch, yes, that will definitely fix the issue. But isn't it better if
> the request itself indicate whether it expects a response msg with a valid pasid or
> not? The response msg can come from userspace as well(vSVA) and if for some reason
> doesn't set it for a req that expects pasid then it should be an error, right? In the temp
> fix I had, I introduced another flag to indicate the endpoint has PRI support or not and
> used that to verify the pasid requirement. But for the PRI case you mentioned 
> above, not sure it is easy to get that information or not. May be I am complicating things
> here :)

No you're right, we shouldn't send back malformed responses to the SMMU. I
suppose we can store a flag "PASID required" in the fault and check that
against the response. If we have to discard the guest's response, then we
can either fake a response (abort the stall) right away, or wait for the
response timeout to kick, which will do the same.

Thanks,
Jean
diff mbox series

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 00b517f449ab..16fb38d5dcc7 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -433,6 +433,7 @@  config ARM_SMMU_V3
 	depends on ARM64
 	select IOMMU_API
 	select IOMMU_SVA
+	select IOMMU_PAGE_FAULT
 	select IOMMU_IO_PGTABLE_LPAE
 	select GENERIC_MSI_IRQ_DOMAIN
 	select MMU_NOTIFIER
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a462157c855b..2768f9927237 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -567,6 +567,7 @@  struct iommu_group *fsl_mc_device_group(struct device *dev);
  * @iommu_fwnode: firmware handle for this device's IOMMU
  * @iommu_priv: IOMMU driver private data for this device
  * @num_pasid_bits: number of PASID bits supported by this device
+ * @can_stall: the device is allowed to stall
  * @num_ids: number of associated device IDs
  * @ids: IDs which this device may present to the IOMMU
  */
@@ -574,6 +575,7 @@  struct iommu_fwspec {
 	const struct iommu_ops	*ops;
 	struct fwnode_handle	*iommu_fwnode;
 	u32			num_pasid_bits;
+	bool			can_stall;
 	unsigned int		num_ids;
 	u32			ids[];
 };
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 70dfbd2817aa..9ec2f362802b 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -385,6 +385,13 @@ 
 #define CMDQ_PRI_1_GRPID		GENMASK_ULL(8, 0)
 #define CMDQ_PRI_1_RESP			GENMASK_ULL(13, 12)
 
+#define CMDQ_RESUME_0_SID		GENMASK_ULL(63, 32)
+#define CMDQ_RESUME_0_RESP_TERM		0UL
+#define CMDQ_RESUME_0_RESP_RETRY	1UL
+#define CMDQ_RESUME_0_RESP_ABORT	2UL
+#define CMDQ_RESUME_0_RESP		GENMASK_ULL(13, 12)
+#define CMDQ_RESUME_1_STAG		GENMASK_ULL(15, 0)
+
 #define CMDQ_SYNC_0_CS			GENMASK_ULL(13, 12)
 #define CMDQ_SYNC_0_CS_NONE		0
 #define CMDQ_SYNC_0_CS_IRQ		1
@@ -401,6 +408,25 @@ 
 
 #define EVTQ_0_ID			GENMASK_ULL(7, 0)
 
+#define EVT_ID_TRANSLATION_FAULT	0x10
+#define EVT_ID_ADDR_SIZE_FAULT		0x11
+#define EVT_ID_ACCESS_FAULT		0x12
+#define EVT_ID_PERMISSION_FAULT		0x13
+
+#define EVTQ_0_SSV			(1UL << 11)
+#define EVTQ_0_SSID			GENMASK_ULL(31, 12)
+#define EVTQ_0_SID			GENMASK_ULL(63, 32)
+#define EVTQ_1_STAG			GENMASK_ULL(15, 0)
+#define EVTQ_1_STALL			(1UL << 31)
+#define EVTQ_1_PRIV			(1UL << 33)
+#define EVTQ_1_EXEC			(1UL << 34)
+#define EVTQ_1_READ			(1UL << 35)
+#define EVTQ_1_S2			(1UL << 39)
+#define EVTQ_1_CLASS			GENMASK_ULL(41, 40)
+#define EVTQ_1_TT_READ			(1UL << 44)
+#define EVTQ_2_ADDR			GENMASK_ULL(63, 0)
+#define EVTQ_3_IPA			GENMASK_ULL(51, 12)
+
 /* PRI queue */
 #define PRIQ_ENT_SZ_SHIFT		4
 #define PRIQ_ENT_DWORDS			((1 << PRIQ_ENT_SZ_SHIFT) >> 3)
@@ -525,6 +551,13 @@  struct arm_smmu_cmdq_ent {
 			enum pri_resp		resp;
 		} pri;
 
+		#define CMDQ_OP_RESUME		0x44
+		struct {
+			u32			sid;
+			u16			stag;
+			u8			resp;
+		} resume;
+
 		#define CMDQ_OP_CMD_SYNC	0x46
 		struct {
 			u64			msiaddr;
@@ -560,6 +593,10 @@  struct arm_smmu_queue {
 
 	u32 __iomem			*prod_reg;
 	u32 __iomem			*cons_reg;
+
+	/* Event and PRI */
+	u64				batch;
+	wait_queue_head_t		wq;
 };
 
 struct arm_smmu_queue_poll {
@@ -583,6 +620,7 @@  struct arm_smmu_cmdq_batch {
 
 struct arm_smmu_evtq {
 	struct arm_smmu_queue		q;
+	struct iopf_queue		*iopf;
 	u32				max_stalls;
 };
 
@@ -721,6 +759,7 @@  struct arm_smmu_master {
 	struct arm_smmu_stream		*streams;
 	unsigned int			num_streams;
 	bool				ats_enabled;
+	bool				stall_enabled;
 	bool				sva_enabled;
 	struct list_head		bonds;
 	unsigned int			ssid_bits;
@@ -740,6 +779,7 @@  struct arm_smmu_domain {
 
 	struct io_pgtable_ops		*pgtbl_ops;
 	bool				non_strict;
+	bool				stall_enabled;
 	atomic_t			nr_ats_masters;
 
 	enum arm_smmu_domain_stage	stage;
@@ -1034,6 +1074,11 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		}
 		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
 		break;
+	case CMDQ_OP_RESUME:
+		cmd[0] |= FIELD_PREP(CMDQ_RESUME_0_SID, ent->resume.sid);
+		cmd[0] |= FIELD_PREP(CMDQ_RESUME_0_RESP, ent->resume.resp);
+		cmd[1] |= FIELD_PREP(CMDQ_RESUME_1_STAG, ent->resume.stag);
+		break;
 	case CMDQ_OP_CMD_SYNC:
 		if (ent->sync.msiaddr) {
 			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
@@ -1600,6 +1645,45 @@  static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
 }
 
+static int arm_smmu_page_response(struct device *dev,
+				  struct iommu_fault_event *unused,
+				  struct iommu_page_response *resp)
+{
+	struct arm_smmu_cmdq_ent cmd = {0};
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	int sid = master->streams[0].id;
+
+	if (master->stall_enabled) {
+		cmd.opcode		= CMDQ_OP_RESUME;
+		cmd.resume.sid		= sid;
+		cmd.resume.stag		= resp->grpid;
+		switch (resp->code) {
+		case IOMMU_PAGE_RESP_INVALID:
+		case IOMMU_PAGE_RESP_FAILURE:
+			cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT;
+			break;
+		case IOMMU_PAGE_RESP_SUCCESS:
+			cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		/* TODO: insert PRI response here */
+		return -ENODEV;
+	}
+
+	arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
+	/*
+	 * Don't send a SYNC, it doesn't do anything for RESUME or PRI_RESP.
+	 * RESUME consumption guarantees that the stalled transaction will be
+	 * terminated... at some point in the future. PRI_RESP is fire and
+	 * forget.
+	 */
+
+	return 0;
+}
+
 /* Context descriptor manipulation functions */
 static void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
 {
@@ -1762,8 +1846,7 @@  static int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
 			FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
 			CTXDESC_CD_0_V;
 
-		/* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
-		if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
+		if (smmu_domain->stall_enabled)
 			val |= CTXDESC_CD_0_S;
 	}
 
@@ -2171,7 +2254,7 @@  static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 			 FIELD_PREP(STRTAB_STE_1_STRW, strw));
 
 		if (smmu->features & ARM_SMMU_FEAT_STALLS &&
-		   !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
+		    !master->stall_enabled)
 			dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
 
 		val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
@@ -2248,7 +2331,6 @@  static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
 	return 0;
 }
 
-__maybe_unused
 static struct arm_smmu_master *
 arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
 {
@@ -2275,23 +2357,123 @@  arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
 }
 
 /* IRQ and event handlers */
+static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
+{
+	int ret;
+	u32 perm = 0;
+	struct arm_smmu_master *master;
+	bool ssid_valid = evt[0] & EVTQ_0_SSV;
+	u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
+	u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
+	struct iommu_fault_event fault_evt = { };
+	struct iommu_fault *flt = &fault_evt.fault;
+
+	/* Stage-2 is always pinned at the moment */
+	if (evt[1] & EVTQ_1_S2)
+		return -EFAULT;
+
+	master = arm_smmu_find_master(smmu, sid);
+	if (!master)
+		return -EINVAL;
+
+	if (evt[1] & EVTQ_1_READ)
+		perm |= IOMMU_FAULT_PERM_READ;
+	else
+		perm |= IOMMU_FAULT_PERM_WRITE;
+
+	if (evt[1] & EVTQ_1_EXEC)
+		perm |= IOMMU_FAULT_PERM_EXEC;
+
+	if (evt[1] & EVTQ_1_PRIV)
+		perm |= IOMMU_FAULT_PERM_PRIV;
+
+	if (evt[1] & EVTQ_1_STALL) {
+		flt->type = IOMMU_FAULT_PAGE_REQ;
+		flt->prm = (struct iommu_fault_page_request) {
+			.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
+			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
+			.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
+			.perm = perm,
+			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
+		};
+
+		if (ssid_valid)
+			flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+	} else {
+		flt->type = IOMMU_FAULT_DMA_UNRECOV;
+		flt->event = (struct iommu_fault_unrecoverable) {
+			.flags = IOMMU_FAULT_UNRECOV_ADDR_VALID |
+				 IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID,
+			.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]),
+			.perm = perm,
+			.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
+			.fetch_addr = FIELD_GET(EVTQ_3_IPA, evt[3]),
+		};
+
+		if (ssid_valid)
+			flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID;
+
+		switch (type) {
+		case EVT_ID_TRANSLATION_FAULT:
+		case EVT_ID_ADDR_SIZE_FAULT:
+		case EVT_ID_ACCESS_FAULT:
+			flt->event.reason = IOMMU_FAULT_REASON_PTE_FETCH;
+			break;
+		case EVT_ID_PERMISSION_FAULT:
+			flt->event.reason = IOMMU_FAULT_REASON_PERMISSION;
+			break;
+		default:
+			/* TODO: report other unrecoverable faults. */
+			return -EFAULT;
+		}
+	}
+
+	ret = iommu_report_device_fault(master->dev, &fault_evt);
+	if (ret && flt->type == IOMMU_FAULT_PAGE_REQ) {
+		/* Nobody cared, abort the access */
+		struct iommu_page_response resp = {
+			.pasid		= flt->prm.pasid,
+			.grpid		= flt->prm.grpid,
+			.code		= IOMMU_PAGE_RESP_FAILURE,
+		};
+		arm_smmu_page_response(master->dev, NULL, &resp);
+	}
+
+	return ret;
+}
+
 static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 {
-	int i;
+	int i, ret;
+	int num_handled = 0;
 	struct arm_smmu_device *smmu = dev;
 	struct arm_smmu_queue *q = &smmu->evtq.q;
 	struct arm_smmu_ll_queue *llq = &q->llq;
+	size_t queue_size = 1 << llq->max_n_shift;
 	u64 evt[EVTQ_ENT_DWORDS];
 
+	spin_lock(&q->wq.lock);
 	do {
 		while (!queue_remove_raw(q, evt)) {
 			u8 id = FIELD_GET(EVTQ_0_ID, evt[0]);
 
-			dev_info(smmu->dev, "event 0x%02x received:\n", id);
-			for (i = 0; i < ARRAY_SIZE(evt); ++i)
-				dev_info(smmu->dev, "\t0x%016llx\n",
-					 (unsigned long long)evt[i]);
+			spin_unlock(&q->wq.lock);
+			ret = arm_smmu_handle_evt(smmu, evt);
+			spin_lock(&q->wq.lock);
+
+			if (++num_handled == queue_size) {
+				q->batch++;
+				wake_up_all_locked(&q->wq);
+				num_handled = 0;
+			}
 
+			if (ret) {
+				dev_info(smmu->dev, "event 0x%02x received:\n",
+					 id);
+				for (i = 0; i < ARRAY_SIZE(evt); ++i)
+					dev_info(smmu->dev, "\t0x%016llx\n",
+						 (unsigned long long)evt[i]);
+			}
 		}
 
 		/*
@@ -2305,6 +2487,11 @@  static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 	/* Sync our overflow flag, as we believe we're up to speed */
 	llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
 		    Q_IDX(llq, llq->cons);
+	queue_sync_cons_out(q);
+
+	wake_up_all_locked(&q->wq);
+	spin_unlock(&q->wq.lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -2368,6 +2555,36 @@  static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
+/*
+ * arm_smmu_flush_evtq - wait until all events currently in the queue have been
+ *                       consumed.
+ *
+ * Wait until there are no more event for this @pasid in the queue. Either until
+ * the queue becomes empty or, if new events are continually added the queue,
+ * until the event queue thread has handled a full batch (where one batch
+ * corresponds to the queue size). For that we take the batch number when
+ * entering flush() and wait for the event queue thread to increment it twice.
+ * Note that we don't handle overflows on q->batch. If it occurs, just wait for
+ * the queue to become empty.
+ */
+static int arm_smmu_flush_evtq(struct arm_smmu_device *smmu)
+{
+	int ret;
+	u64 batch;
+	struct arm_smmu_queue *q = &smmu->evtq.q;
+
+	spin_lock(&q->wq.lock);
+	if (queue_sync_prod_in(q) == -EOVERFLOW)
+		dev_err(smmu->dev, "evtq overflow detected -- requests lost\n");
+
+	batch = q->batch;
+	ret = wait_event_interruptible_locked(q->wq, queue_empty(&q->llq) ||
+					      q->batch >= batch + 2);
+	spin_unlock(&q->wq.lock);
+
+	return ret;
+}
+
 static int arm_smmu_device_disable(struct arm_smmu_device *smmu);
 
 static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev)
@@ -2812,6 +3029,8 @@  static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
 
 	cfg->s1cdmax = master->ssid_bits;
 
+	smmu_domain->stall_enabled = master->stall_enabled;
+
 	ret = arm_smmu_alloc_cd_tables(smmu_domain);
 	if (ret)
 		goto out_free_asid;
@@ -3156,6 +3375,11 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 			smmu_domain->s1_cfg.s1cdmax, master->ssid_bits);
 		ret = -EINVAL;
 		goto out_unlock;
+	} else if (smmu_domain->stall_enabled != master->stall_enabled) {
+		dev_err(dev, "cannot attach to stall-%s domain\n",
+			smmu_domain->stall_enabled ? "enabled" : "disabled");
+		ret = -EINVAL;
+		goto out_unlock;
 	}
 
 	master->domain = smmu_domain;
@@ -3425,6 +3649,11 @@  arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
 static void arm_smmu_sva_unbind(struct iommu_sva *handle)
 {
 	struct arm_smmu_bond *bond = sva_to_bond(handle);
+	struct arm_smmu_master *master = dev_iommu_priv_get(handle->dev);
+
+	if (master->stall_enabled)
+		arm_smmu_flush_evtq(master->smmu);
+	iopf_queue_flush_dev(handle->dev);
 
 	mutex_lock(&sva_lock);
 	if (refcount_dec_and_test(&bond->refs)) {
@@ -3603,6 +3832,10 @@  static int arm_smmu_add_device(struct device *dev)
 		master->ssid_bits = min_t(u8, master->ssid_bits,
 					  CTXDESC_LINEAR_CDMAX);
 
+	if ((smmu->features & ARM_SMMU_FEAT_STALLS && fwspec->can_stall) ||
+	    smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
+		master->stall_enabled = true;
+
 	ret = iommu_device_link(&smmu->iommu, dev);
 	if (ret)
 		goto err_disable_pasid;
@@ -3638,6 +3871,7 @@  static void arm_smmu_remove_device(struct device *dev)
 
 	master = dev_iommu_priv_get(dev);
 	smmu = master->smmu;
+	iopf_queue_remove_device(smmu->evtq.iopf, dev);
 	WARN_ON(master->sva_enabled);
 	arm_smmu_detach_dev(master);
 	iommu_group_remove_device(dev);
@@ -3761,7 +3995,7 @@  static void arm_smmu_get_resv_regions(struct device *dev,
 
 static bool arm_smmu_iopf_supported(struct arm_smmu_master *master)
 {
-	return false;
+	return master->stall_enabled;
 }
 
 static bool arm_smmu_dev_has_feature(struct device *dev,
@@ -3806,13 +4040,28 @@  static bool arm_smmu_dev_feature_enabled(struct device *dev,
 
 static int arm_smmu_dev_enable_sva(struct device *dev)
 {
+	int ret;
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 
+	if (master->stall_enabled) {
+		ret = iopf_queue_add_device(master->smmu->evtq.iopf, dev);
+		if (ret)
+			return ret;
+	}
+
+	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+	if (ret)
+		goto err_disable_iopf;
+
 	mutex_lock(&sva_lock);
 	master->sva_enabled = true;
 	mutex_unlock(&sva_lock);
 
 	return 0;
+
+err_disable_iopf:
+	iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
+	return ret;
 }
 
 static int arm_smmu_dev_disable_sva(struct device *dev)
@@ -3828,6 +4077,9 @@  static int arm_smmu_dev_disable_sva(struct device *dev)
 	master->sva_enabled = false;
 	mutex_unlock(&sva_lock);
 
+	iommu_unregister_device_fault_handler(dev);
+	iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
+
 	return 0;
 }
 
@@ -3887,6 +4139,7 @@  static struct iommu_ops arm_smmu_ops = {
 	.sva_bind		= arm_smmu_sva_bind,
 	.sva_unbind		= arm_smmu_sva_unbind,
 	.sva_get_pasid		= arm_smmu_sva_get_pasid,
+	.page_response		= arm_smmu_page_response,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
 };
 
@@ -3930,6 +4183,10 @@  static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 	q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
 
 	q->llq.prod = q->llq.cons = 0;
+
+	init_waitqueue_head(&q->wq);
+	q->batch = 0;
+
 	return 0;
 }
 
@@ -3983,6 +4240,12 @@  static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
+	if (smmu->features & ARM_SMMU_FEAT_STALLS) {
+		smmu->evtq.iopf = iopf_queue_alloc(dev_name(smmu->dev));
+		if (!smmu->evtq.iopf)
+			return -ENOMEM;
+	}
+
 	/* priq */
 	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
 		return 0;
@@ -4988,6 +5251,7 @@  static int arm_smmu_device_remove(struct platform_device *pdev)
 	iommu_device_unregister(&smmu->iommu);
 	iommu_device_sysfs_remove(&smmu->iommu);
 	arm_smmu_device_disable(smmu);
+	iopf_queue_free(smmu->evtq.iopf);
 
 	return 0;
 }
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 20738aacac89..dd7017750954 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -205,9 +205,12 @@  const struct iommu_ops *of_iommu_configure(struct device *dev,
 		}
 
 		fwspec = dev_iommu_fwspec_get(dev);
-		if (!err && fwspec)
+		if (!err && fwspec) {
 			of_property_read_u32(master_np, "pasid-num-bits",
 					     &fwspec->num_pasid_bits);
+			fwspec->can_stall = of_property_read_bool(master_np,
+								  "dma-can-stall");
+		}
 	}
 
 	/*