diff mbox series

[06/11] RISC-V: drivers/iommu/riscv: Add command, fault, page-req queues

Message ID 1fd79e5c53d9d6ed2264f60dd4261f293cc00472.1689792825.git.tjeznach@rivosinc.com (mailing list archive)
State Awaiting Upstream, archived
Headers show
Series Linux RISC-V IOMMU Support | expand

Checks

Context Check Description
conchuod/cover_letter success Series has a cover letter
conchuod/tree_selection success Guessed tree name to be for-next at HEAD 471aba2e4760
conchuod/fixes_present success Fixes tag not required for -next series
conchuod/maintainers_pattern success MAINTAINERS pattern errors before the patch: 4 and now 4
conchuod/verify_signedoff success Signed-off-by tag matches author and committer
conchuod/kdoc success Errors and warnings before: 0 this patch: 0
conchuod/build_rv64_clang_allmodconfig fail Errors and warnings before: 10 this patch: 16
conchuod/module_param fail Was 0 now: 3
conchuod/build_rv64_gcc_allmodconfig fail Errors and warnings before: 12 this patch: 15
conchuod/build_rv32_defconfig fail Build failed
conchuod/dtb_warn_rv64 success Errors and warnings before: 3 this patch: 3
conchuod/header_inline success No static functions without inline keyword in header files
conchuod/checkpatch warning CHECK: Unbalanced braces around else statement CHECK: braces {} should be used on all arms of this statement WARNING: Consecutive strings are generally better as a single string WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants WARNING: Missing a blank line after declarations WARNING: Prefer 'unsigned int *' to bare use of 'unsigned *' WARNING: Prefer 'unsigned int' to bare use of 'unsigned' WARNING: line length of 104 exceeds 100 columns WARNING: memory barrier without comment
conchuod/build_rv64_nommu_k210_defconfig success Build OK
conchuod/verify_fixes success No Fixes tag
conchuod/build_rv64_nommu_virt_defconfig success Build OK

Commit Message

Tomasz Jeznach July 19, 2023, 7:33 p.m. UTC
Enables message or wire signal interrupts for PCIe and platform devices.

Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/iommu-pci.c      |  72 ++++
 drivers/iommu/riscv/iommu-platform.c |  66 +++
 drivers/iommu/riscv/iommu.c          | 604 ++++++++++++++++++++++++++-
 drivers/iommu/riscv/iommu.h          |  28 ++
 4 files changed, 769 insertions(+), 1 deletion(-)

Comments

Nick Kossifidis July 20, 2023, 3:11 a.m. UTC | #1
Hello Tomasz,

On 7/19/23 22:33, Tomasz Jeznach wrote:
> Enables message or wire signal interrupts for PCIe and platforms devices.
> 

The description doesn't match the subject or the patch content (we 
don't just enable interrupts, we also init the queues).

> +	/* Parse Queue lengts */
> +	ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> +	if (!ret)
> +		dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> +
> +	ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> +	if (!ret)
> +		dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> +
> +	ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> +	if (!ret)
> +		dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> +
>   	dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
>   

We need to add those to the device tree binding doc (or throw them away; 
I thought it would be better to have them as part of the device 
description than as a module parameter).


> +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> +

> +	case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> +		q = &iommu->priq;
> +		q->len = sizeof(struct riscv_iommu_pq_record);
> +		count = iommu->priq_len;
> +		irq = iommu->irq_priq;
> +		irq_check = riscv_iommu_priq_irq_check;
> +		irq_process = riscv_iommu_priq_process;
> +		q->qbr = RISCV_IOMMU_REG_PQB;
> +		q->qcr = RISCV_IOMMU_REG_PQCSR;
> +		name = "priq";
> +		break;


It makes more sense to add the code for the page request queue in the 
patch that adds ATS/PRI support IMHO. This comment also applies to its 
interrupt handlers below.


> +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> +						  u64 addr)
> +{
> +	cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> +	cmd->dword1 = addr;
> +}
> +

This needs to be (addr >> 2) to match the spec, same as in the iofence 
command.
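
For reference, a minimal sketch of the helper with that fix applied (this is
the reviewer's suggestion, not the code as posted), mirroring the (addr >> 2)
packing already used by riscv_iommu_cmd_iofence_set_av() in this patch:

	static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
							  u64 addr)
	{
		cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
		/* ADDR[63:12] is packed at dword1[61:10], i.e. the 4k-aligned
		 * address shifted right by 2, same as the iofence command. */
		cmd->dword1 = (addr >> 2);
	}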

Regards,
Nick
Baolu Lu July 20, 2023, 1:08 p.m. UTC | #2
On 2023/7/20 3:33, Tomasz Jeznach wrote:
> Enables message or wire signal interrupts for PCIe and platforms devices.

If this patch could be divided into multiple small patches, each
logically doing one specific thing, it would help people better review
the code.

Best regards,
baolu
Tomasz Jeznach July 20, 2023, 5:49 p.m. UTC | #3
On Thu, Jul 20, 2023 at 6:18 AM Baolu Lu <baolu.lu@linux.intel.com> wrote:
>
> On 2023/7/20 3:33, Tomasz Jeznach wrote:
> > Enables message or wire signal interrupts for PCIe and platforms devices.
>
> If this patch could be divided into multiple small patches, each
> logically doing one specific thing, it will help people better review
> the code.
>

ack. I've got a similar comment regarding this patch already.
I will split and add more notes to the commit message. Thanks.


> Best regards,
> baolu
>


regards,
- Tomasz
Tomasz Jeznach July 20, 2023, 6 p.m. UTC | #4
On Wed, Jul 19, 2023 at 8:12 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
>
> Hello Tomasz,
>
> On 7/19/23 22:33, Tomasz Jeznach wrote:
> > Enables message or wire signal interrupts for PCIe and platforms devices.
> >
>
> The description doesn't match the subject nor the patch content (we
> don't jus enable interrupts, we also init the queues).
>
> > +     /* Parse Queue lengts */
> > +     ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > +     if (!ret)
> > +             dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > +
> > +     ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > +     if (!ret)
> > +             dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > +
> > +     ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > +     if (!ret)
> > +             dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > +
> >       dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> >
>
> We need to add those to the device tree binding doc (or throw them away,
> I thought it would be better to have them as part of the device
> desciption than a module parameter).
>

We can add them as optional fields in the DT.
Alternatively, I've been looking into an option to auto-scale CQ/PQ
based on the number of attached devices, but this gets trickier for
hot-pluggable systems. I've added module parameters as a bare minimum,
but I'm still looking for better solutions.

>
> > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > +
>
> > +     case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > +             q = &iommu->priq;
> > +             q->len = sizeof(struct riscv_iommu_pq_record);
> > +             count = iommu->priq_len;
> > +             irq = iommu->irq_priq;
> > +             irq_check = riscv_iommu_priq_irq_check;
> > +             irq_process = riscv_iommu_priq_process;
> > +             q->qbr = RISCV_IOMMU_REG_PQB;
> > +             q->qcr = RISCV_IOMMU_REG_PQCSR;
> > +             name = "priq";
> > +             break;
>
>
> It makes more sense to add the code for the page request queue in the
> patch that adds ATS/PRI support IMHO. This comment also applies to its
> interrupt handlers below.
>

ack. will do.

>
> > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > +                                               u64 addr)
> > +{
> > +     cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > +     cmd->dword1 = addr;
> > +}
> > +
>
> This needs to be (addr >> 2) to match the spec, same as in the iofence
> command.
>

oops. Thanks!

> Regards,
> Nick
>

regards,
- Tomasz
Conor Dooley July 20, 2023, 6:43 p.m. UTC | #5
On Thu, Jul 20, 2023 at 11:00:10AM -0700, Tomasz Jeznach wrote:
> On Wed, Jul 19, 2023 at 8:12 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> > On 7/19/23 22:33, Tomasz Jeznach wrote:
> > The description doesn't match the subject nor the patch content (we
> > don't jus enable interrupts, we also init the queues).
> >
> > > +     /* Parse Queue lengts */
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > > +
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > > +
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > > +
> > >       dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> > >
> >
> > We need to add those to the device tree binding doc (or throw them away,
> > I thought it would be better to have them as part of the device
> > desciption than a module parameter).

Aye, I didn't notice these. Any DT properties /must/ be documented.
To avoid having to make these comments again on v2, properties should also
not contain underscores.

> We can add them as an optional fields to DT.
> Alternatively, I've been looking into an option to auto-scale CQ/PQ
> based on number of attached devices, but this gets trickier for
> hot-pluggable systems. I've added module parameters as a bare-minimum,
> but still looking for better solutions.

If they're properties of the hardware, they should come from DT/ACPI,
unless they're auto-detectable, in which case that is preferred.
To quote GregKH "please do not add new module parameters for drivers,
this is not the 1990s" :)
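
Purely as an illustration of that direction, probing documented, hyphen-named
DT properties (with built-in defaults) instead of module parameters could look
roughly like the sketch below; the property names are hypothetical and would
first have to be added to the binding document:

	/* Sketch only: optional queue length properties with defaults.
	 * "riscv,cmdq-len" etc. are illustrative names, not from a binding. */
	if (of_property_read_u32(pdev->dev.of_node, "riscv,cmdq-len", &iommu->cmdq_len))
		iommu->cmdq_len = 1024;
	if (of_property_read_u32(pdev->dev.of_node, "riscv,fltq-len", &iommu->fltq_len))
		iommu->fltq_len = 1024;
	if (of_property_read_u32(pdev->dev.of_node, "riscv,priq-len", &iommu->priq_len))
		iommu->priq_len = 1024;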
Zong Li July 24, 2023, 9:47 a.m. UTC | #6
On Fri, Jul 21, 2023 at 2:00 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> On Wed, Jul 19, 2023 at 8:12 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> >
> > Hello Tomasz,
> >
> > On 7/19/23 22:33, Tomasz Jeznach wrote:
> > > Enables message or wire signal interrupts for PCIe and platforms devices.
> > >
> >
> > The description doesn't match the subject nor the patch content (we
> > don't jus enable interrupts, we also init the queues).
> >
> > > +     /* Parse Queue lengts */
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > > +
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > > +
> > > +     ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > > +     if (!ret)
> > > +             dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > > +
> > >       dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> > >
> >
> > We need to add those to the device tree binding doc (or throw them away,
> > I thought it would be better to have them as part of the device
> > desciption than a module parameter).
> >
>
> We can add them as an optional fields to DT.
> Alternatively, I've been looking into an option to auto-scale CQ/PQ
> based on number of attached devices, but this gets trickier for
> hot-pluggable systems. I've added module parameters as a bare-minimum,
> but still looking for better solutions.
>
> >
> > > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > > +
> >
> > > +     case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > > +             q = &iommu->priq;
> > > +             q->len = sizeof(struct riscv_iommu_pq_record);
> > > +             count = iommu->priq_len;
> > > +             irq = iommu->irq_priq;
> > > +             irq_check = riscv_iommu_priq_irq_check;
> > > +             irq_process = riscv_iommu_priq_process;
> > > +             q->qbr = RISCV_IOMMU_REG_PQB;
> > > +             q->qcr = RISCV_IOMMU_REG_PQCSR;
> > > +             name = "priq";
> > > +             break;
> >
> >
> > It makes more sense to add the code for the page request queue in the
> > patch that adds ATS/PRI support IMHO. This comment also applies to its
> > interrupt handlers below.
> >
>
> ack. will do.
>
> >
> > > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > > +                                               u64 addr)
> > > +{
> > > +     cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > > +     cmd->dword1 = addr;
> > > +}
> > > +
> >
> > This needs to be (addr >> 2) to match the spec, same as in the iofence
> > command.
> >
>
> oops. Thanks!
>

I think it should be (addr >> 12) according to the spec.

> > Regards,
> > Nick
> >
>
> regards,
> - Tomasz
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Tomasz Jeznach July 28, 2023, 5:18 a.m. UTC | #7
On Mon, Jul 24, 2023 at 11:47 AM Zong Li <zong.li@sifive.com> wrote:
>
> On Fri, Jul 21, 2023 at 2:00 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >
> > On Wed, Jul 19, 2023 at 8:12 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> > >
> > > Hello Tomasz,
> > >
> > > On 7/19/23 22:33, Tomasz Jeznach wrote:
> > > > Enables message or wire signal interrupts for PCIe and platforms devices.
> > > >
> > >
> > > The description doesn't match the subject nor the patch content (we
> > > don't jus enable interrupts, we also init the queues).
> > >
> > > > +     /* Parse Queue lengts */
> > > > +     ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > > > +     if (!ret)
> > > > +             dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > > > +
> > > > +     ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > > > +     if (!ret)
> > > > +             dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > > > +
> > > > +     ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > > > +     if (!ret)
> > > > +             dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > > > +
> > > >       dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> > > >
> > >
> > > We need to add those to the device tree binding doc (or throw them away,
> > > I thought it would be better to have them as part of the device
> > > desciption than a module parameter).
> > >
> >
> > We can add them as an optional fields to DT.
> > Alternatively, I've been looking into an option to auto-scale CQ/PQ
> > based on number of attached devices, but this gets trickier for
> > hot-pluggable systems. I've added module parameters as a bare-minimum,
> > but still looking for better solutions.
> >
> > >
> > > > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > > > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > > > +
> > >
> > > > +     case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > > > +             q = &iommu->priq;
> > > > +             q->len = sizeof(struct riscv_iommu_pq_record);
> > > > +             count = iommu->priq_len;
> > > > +             irq = iommu->irq_priq;
> > > > +             irq_check = riscv_iommu_priq_irq_check;
> > > > +             irq_process = riscv_iommu_priq_process;
> > > > +             q->qbr = RISCV_IOMMU_REG_PQB;
> > > > +             q->qcr = RISCV_IOMMU_REG_PQCSR;
> > > > +             name = "priq";
> > > > +             break;
> > >
> > >
> > > It makes more sense to add the code for the page request queue in the
> > > patch that adds ATS/PRI support IMHO. This comment also applies to its
> > > interrupt handlers below.
> > >
> >
> > ack. will do.
> >
> > >
> > > > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > > > +                                               u64 addr)
> > > > +{
> > > > +     cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > > > +     cmd->dword1 = addr;
> > > > +}
> > > > +
> > >
> > > This needs to be (addr >> 2) to match the spec, same as in the iofence
> > > command.
> > >
> >
> > oops. Thanks!
> >
>
> I think it should be (addr >> 12) according to the spec.
>

My reading of the spec '3.1.1. IOMMU Page-Table cache invalidation commands'
is that it is a 4k page aligned address packed at dword1[61:10], so
effectively shifted by 2 bits.

regards,
- Tomasz

> > > Regards,
> > > Nick
> > >
> >
> > regards,
> > - Tomasz
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
Zong Li July 28, 2023, 8:48 a.m. UTC | #8
On Fri, Jul 28, 2023 at 1:19 PM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> On Mon, Jul 24, 2023 at 11:47 AM Zong Li <zong.li@sifive.com> wrote:
> >
> > On Fri, Jul 21, 2023 at 2:00 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> > >
> > > On Wed, Jul 19, 2023 at 8:12 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> > > >
> > > > Hello Tomasz,
> > > >
> > > > On 7/19/23 22:33, Tomasz Jeznach wrote:
> > > > > Enables message or wire signal interrupts for PCIe and platforms devices.
> > > > >
> > > >
> > > > The description doesn't match the subject nor the patch content (we
> > > > don't jus enable interrupts, we also init the queues).
> > > >
> > > > > +     /* Parse Queue lengts */
> > > > > +     ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > > > > +     if (!ret)
> > > > > +             dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > > > > +
> > > > > +     ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > > > > +     if (!ret)
> > > > > +             dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > > > > +
> > > > > +     ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > > > > +     if (!ret)
> > > > > +             dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > > > > +
> > > > >       dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> > > > >
> > > >
> > > > We need to add those to the device tree binding doc (or throw them away,
> > > > I thought it would be better to have them as part of the device
> > > > desciption than a module parameter).
> > > >
> > >
> > > We can add them as an optional fields to DT.
> > > Alternatively, I've been looking into an option to auto-scale CQ/PQ
> > > based on number of attached devices, but this gets trickier for
> > > hot-pluggable systems. I've added module parameters as a bare-minimum,
> > > but still looking for better solutions.
> > >
> > > >
> > > > > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > > > > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > > > > +
> > > >
> > > > > +     case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > > > > +             q = &iommu->priq;
> > > > > +             q->len = sizeof(struct riscv_iommu_pq_record);
> > > > > +             count = iommu->priq_len;
> > > > > +             irq = iommu->irq_priq;
> > > > > +             irq_check = riscv_iommu_priq_irq_check;
> > > > > +             irq_process = riscv_iommu_priq_process;
> > > > > +             q->qbr = RISCV_IOMMU_REG_PQB;
> > > > > +             q->qcr = RISCV_IOMMU_REG_PQCSR;
> > > > > +             name = "priq";
> > > > > +             break;
> > > >
> > > >
> > > > It makes more sense to add the code for the page request queue in the
> > > > patch that adds ATS/PRI support IMHO. This comment also applies to its
> > > > interrupt handlers below.
> > > >
> > >
> > > ack. will do.
> > >
> > > >
> > > > > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > > > > +                                               u64 addr)
> > > > > +{
> > > > > +     cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > > > > +     cmd->dword1 = addr;
> > > > > +}
> > > > > +
> > > >
> > > > This needs to be (addr >> 2) to match the spec, same as in the iofence
> > > > command.
> > > >
> > >
> > > oops. Thanks!
> > >
> >
> > I think it should be (addr >> 12) according to the spec.
> >
>
> My reading of the spec '3.1.1. IOMMU Page-Table cache invalidation commands'
> is that it is a 4k page aligned address packed at dword1[61:10], so
> effectively shifted by 2 bits.

Thanks for clarifying. Just an opinion: perhaps you could use
'FIELD_PREP()' here as well; it might be clearer.
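
A sketch of what that could look like, assuming a hypothetical mask name for
dword1[61:10] (arithmetically the same as the (addr >> 2) form, since
(addr >> 12) << 10 == addr >> 2):

	/* Hypothetical mask: dword1[61:10] holds ADDR[63:12]. */
	#define RISCV_IOMMU_CMD_IOTINVAL_ADDR	GENMASK_ULL(61, 10)

	static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
							  u64 addr)
	{
		cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
		cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, addr >> 12);
	}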

>
> regards,
> - Tomasz
>
> > > > Regards,
> > > > Nick
> > > >
> > >
> > > regards,
> > > - Tomasz
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > linux-riscv@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
Zong Li July 29, 2023, 12:58 p.m. UTC | #9
On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> Enables message or wire signal interrupts for PCIe and platforms devices.
>
> Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  drivers/iommu/riscv/iommu-pci.c      |  72 ++++
>  drivers/iommu/riscv/iommu-platform.c |  66 +++
>  drivers/iommu/riscv/iommu.c          | 604 ++++++++++++++++++++++++++-
>  drivers/iommu/riscv/iommu.h          |  28 ++
>  4 files changed, 769 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
> index c91f963d7a29..9ea0647f7b92 100644
> --- a/drivers/iommu/riscv/iommu-pci.c
> +++ b/drivers/iommu/riscv/iommu-pci.c
> @@ -34,6 +34,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>  {
>         struct device *dev = &pdev->dev;
>         struct riscv_iommu_device *iommu;
> +       u64 icvec;
>         int ret;
>
>         ret = pci_enable_device_mem(pdev);
> @@ -67,14 +68,84 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>         iommu->dev = dev;
>         dev_set_drvdata(dev, iommu);
>
> +       /* Check device reported capabilities. */
> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> +
> +       /* The PCI driver only uses MSIs, make sure the IOMMU supports this */
> +       switch (FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap)) {
> +       case RISCV_IOMMU_CAP_IGS_MSI:
> +       case RISCV_IOMMU_CAP_IGS_BOTH:
> +               break;
> +       default:
> +               dev_err(dev, "unable to use message-signaled interrupts\n");
> +               ret = -ENODEV;
> +               goto fail;
> +       }
> +
>         dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
>         pci_set_master(pdev);
>
> +       /* Allocate and assign IRQ vectors for the various events */
> +       ret = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, PCI_IRQ_MSIX);
> +       if (ret < 0) {
> +               dev_err(dev, "unable to allocate irq vectors\n");
> +               goto fail;
> +       }
> +
> +       ret = -ENODEV;
> +
> +       iommu->irq_cmdq = msi_get_virq(dev, RISCV_IOMMU_INTR_CQ);
> +       if (!iommu->irq_cmdq) {
> +               dev_warn(dev, "no MSI vector %d for the command queue\n",
> +                        RISCV_IOMMU_INTR_CQ);
> +               goto fail;
> +       }
> +
> +       iommu->irq_fltq = msi_get_virq(dev, RISCV_IOMMU_INTR_FQ);
> +       if (!iommu->irq_fltq) {
> +               dev_warn(dev, "no MSI vector %d for the fault/event queue\n",
> +                        RISCV_IOMMU_INTR_FQ);
> +               goto fail;
> +       }
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> +               iommu->irq_pm = msi_get_virq(dev, RISCV_IOMMU_INTR_PM);
> +               if (!iommu->irq_pm) {
> +                       dev_warn(dev,
> +                                "no MSI vector %d for performance monitoring\n",
> +                                RISCV_IOMMU_INTR_PM);
> +                       goto fail;
> +               }
> +       }
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> +               iommu->irq_priq = msi_get_virq(dev, RISCV_IOMMU_INTR_PQ);
> +               if (!iommu->irq_priq) {
> +                       dev_warn(dev,
> +                                "no MSI vector %d for page-request queue\n",
> +                                RISCV_IOMMU_INTR_PQ);
> +                       goto fail;
> +               }
> +       }
> +
> +       /* Set simple 1:1 mapping for MSI vectors */
> +       icvec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
> +           FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ);
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_HPM)
> +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM);
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_ATS)
> +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
> +
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, icvec);
> +
>         ret = riscv_iommu_init(iommu);
>         if (!ret)
>                 return ret;
>
>   fail:
> +       pci_free_irq_vectors(pdev);
>         pci_clear_master(pdev);
>         pci_release_regions(pdev);
>         pci_disable_device(pdev);
> @@ -85,6 +156,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>  static void riscv_iommu_pci_remove(struct pci_dev *pdev)
>  {
>         riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
> +       pci_free_irq_vectors(pdev);
>         pci_clear_master(pdev);
>         pci_release_regions(pdev);
>         pci_disable_device(pdev);
> diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
> index e4e8ca6711e7..35935d3c7ef4 100644
> --- a/drivers/iommu/riscv/iommu-platform.c
> +++ b/drivers/iommu/riscv/iommu-platform.c
> @@ -20,6 +20,8 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
>         struct device *dev = &pdev->dev;
>         struct riscv_iommu_device *iommu = NULL;
>         struct resource *res = NULL;
> +       u32 fctl = 0;
> +       int irq = 0;
>         int ret = 0;
>
>         iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
> @@ -53,6 +55,70 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
>                 goto fail;
>         }
>
> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> +
> +       /* For now we only support WSIs until we have AIA support */

I don't completely understand the AIA support here, because I see the PCI
case uses MSIs, and the kernel seems to have an AIA implementation.
Could you please elaborate?

> +       ret = FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap);
> +       if (ret == RISCV_IOMMU_CAP_IGS_MSI) {
> +               dev_err(dev, "IOMMU only supports MSIs\n");
> +               goto fail;
> +       }
> +
> +       /* Parse IRQ assignment */
> +       irq = platform_get_irq_byname_optional(pdev, "cmdq");
> +       if (irq > 0)
> +               iommu->irq_cmdq = irq;
> +       else {
> +               dev_err(dev, "no IRQ provided for the command queue\n");
> +               goto fail;
> +       }
> +
> +       irq = platform_get_irq_byname_optional(pdev, "fltq");
> +       if (irq > 0)
> +               iommu->irq_fltq = irq;
> +       else {
> +               dev_err(dev, "no IRQ provided for the fault/event queue\n");
> +               goto fail;
> +       }
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> +               irq = platform_get_irq_byname_optional(pdev, "pm");
> +               if (irq > 0)
> +                       iommu->irq_pm = irq;
> +               else {
> +                       dev_err(dev, "no IRQ provided for performance monitoring\n");
> +                       goto fail;
> +               }
> +       }
> +
> +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> +               irq = platform_get_irq_byname_optional(pdev, "priq");
> +               if (irq > 0)
> +                       iommu->irq_priq = irq;
> +               else {
> +                       dev_err(dev, "no IRQ provided for the page-request queue\n");
> +                       goto fail;
> +               }
> +       }

Should we define the "interrupt-names" in dt-bindings?

> +
> +       /* Make sure fctl.WSI is set */
> +       fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> +       fctl |= RISCV_IOMMU_FCTL_WSI;
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
> +
> +       /* Parse Queue lengts */
> +       ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> +       if (!ret)
> +               dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> +
> +       ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> +       if (!ret)
> +               dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> +
> +       ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> +       if (!ret)
> +               dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> +
>         dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
>
>         return riscv_iommu_init(iommu);
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 31dc3c458e13..5c4cf9875302 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -45,6 +45,18 @@ static int ddt_mode = RISCV_IOMMU_DDTP_MODE_BARE;
>  module_param(ddt_mode, int, 0644);
>  MODULE_PARM_DESC(ddt_mode, "Device Directory Table mode.");
>
> +static int cmdq_length = 1024;
> +module_param(cmdq_length, int, 0644);
> +MODULE_PARM_DESC(cmdq_length, "Command queue length.");
> +
> +static int fltq_length = 1024;
> +module_param(fltq_length, int, 0644);
> +MODULE_PARM_DESC(fltq_length, "Fault queue length.");
> +
> +static int priq_length = 1024;
> +module_param(priq_length, int, 0644);
> +MODULE_PARM_DESC(priq_length, "Page request interface queue length.");
> +
>  /* IOMMU PSCID allocation namespace. */
>  #define RISCV_IOMMU_MAX_PSCID  (1U << 20)
>  static DEFINE_IDA(riscv_iommu_pscids);
> @@ -65,6 +77,497 @@ static DEFINE_IDA(riscv_iommu_pscids);
>  static const struct iommu_domain_ops riscv_iommu_domain_ops;
>  static const struct iommu_ops riscv_iommu_ops;
>
> +/*
> + * Common queue management routines
> + */
> +
> +/* Note: offsets are the same for all queues */
> +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> +
> +static unsigned riscv_iommu_queue_consume(struct riscv_iommu_device *iommu,
> +                                         struct riscv_iommu_queue *q, unsigned *ready)
> +{
> +       u32 tail = riscv_iommu_readl(iommu, Q_TAIL(q));
> +       *ready = q->lui;
> +
> +       BUG_ON(q->cnt <= tail);
> +       if (q->lui <= tail)
> +               return tail - q->lui;
> +       return q->cnt - q->lui;
> +}
> +
> +static void riscv_iommu_queue_release(struct riscv_iommu_device *iommu,
> +                                     struct riscv_iommu_queue *q, unsigned count)
> +{
> +       q->lui = (q->lui + count) & (q->cnt - 1);
> +       riscv_iommu_writel(iommu, Q_HEAD(q), q->lui);
> +}
> +
> +static u32 riscv_iommu_queue_ctrl(struct riscv_iommu_device *iommu,
> +                                 struct riscv_iommu_queue *q, u32 val)
> +{
> +       cycles_t end_cycles = RISCV_IOMMU_TIMEOUT + get_cycles();
> +
> +       riscv_iommu_writel(iommu, q->qcr, val);
> +       do {
> +               val = riscv_iommu_readl(iommu, q->qcr);
> +               if (!(val & RISCV_IOMMU_QUEUE_BUSY))
> +                       break;
> +               cpu_relax();
> +       } while (get_cycles() < end_cycles);
> +
> +       return val;
> +}
> +
> +static void riscv_iommu_queue_free(struct riscv_iommu_device *iommu,
> +                                  struct riscv_iommu_queue *q)
> +{
> +       size_t size = q->len * q->cnt;
> +
> +       riscv_iommu_queue_ctrl(iommu, q, 0);
> +
> +       if (q->base) {
> +               if (q->in_iomem)
> +                       iounmap(q->base);
> +               else
> +                       dmam_free_coherent(iommu->dev, size, q->base, q->base_dma);
> +       }
> +       if (q->irq)
> +               free_irq(q->irq, q);
> +}
> +
> +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data);
> +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data);
> +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> +
> +static int riscv_iommu_queue_init(struct riscv_iommu_device *iommu, int queue_id)
> +{
> +       struct device *dev = iommu->dev;
> +       struct riscv_iommu_queue *q = NULL;
> +       size_t queue_size = 0;
> +       irq_handler_t irq_check;
> +       irq_handler_t irq_process;
> +       const char *name;
> +       int count = 0;
> +       int irq = 0;
> +       unsigned order = 0;
> +       u64 qbr_val = 0;
> +       u64 qbr_readback = 0;
> +       u64 qbr_paddr = 0;
> +       int ret = 0;
> +
> +       switch (queue_id) {
> +       case RISCV_IOMMU_COMMAND_QUEUE:
> +               q = &iommu->cmdq;
> +               q->len = sizeof(struct riscv_iommu_command);
> +               count = iommu->cmdq_len;
> +               irq = iommu->irq_cmdq;
> +               irq_check = riscv_iommu_cmdq_irq_check;
> +               irq_process = riscv_iommu_cmdq_process;
> +               q->qbr = RISCV_IOMMU_REG_CQB;
> +               q->qcr = RISCV_IOMMU_REG_CQCSR;
> +               name = "cmdq";
> +               break;
> +       case RISCV_IOMMU_FAULT_QUEUE:
> +               q = &iommu->fltq;
> +               q->len = sizeof(struct riscv_iommu_fq_record);
> +               count = iommu->fltq_len;
> +               irq = iommu->irq_fltq;
> +               irq_check = riscv_iommu_fltq_irq_check;
> +               irq_process = riscv_iommu_fltq_process;
> +               q->qbr = RISCV_IOMMU_REG_FQB;
> +               q->qcr = RISCV_IOMMU_REG_FQCSR;
> +               name = "fltq";
> +               break;
> +       case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> +               q = &iommu->priq;
> +               q->len = sizeof(struct riscv_iommu_pq_record);
> +               count = iommu->priq_len;
> +               irq = iommu->irq_priq;
> +               irq_check = riscv_iommu_priq_irq_check;
> +               irq_process = riscv_iommu_priq_process;
> +               q->qbr = RISCV_IOMMU_REG_PQB;
> +               q->qcr = RISCV_IOMMU_REG_PQCSR;
> +               name = "priq";
> +               break;
> +       default:
> +               dev_err(dev, "invalid queue interrupt index in queue_init!\n");
> +               return -EINVAL;
> +       }
> +
> +       /* Polling not implemented */
> +       if (!irq)
> +               return -ENODEV;
> +
> +       /* Allocate queue in memory and set the base register */
> +       order = ilog2(count);
> +       do {
> +               queue_size = q->len * (1ULL << order);
> +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> +               if (q->base || queue_size < PAGE_SIZE)
> +                       break;
> +
> +               order--;
> +       } while (1);
> +
> +       if (!q->base) {
> +               dev_err(dev, "failed to allocate %s queue (cnt: %u)\n", name, count);
> +               return -ENOMEM;
> +       }
> +
> +       q->cnt = 1ULL << order;
> +
> +       qbr_val = phys_to_ppn(q->base_dma) |
> +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> +
> +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> +
> +       /*
> +        * Queue base registers are WARL, so it's possible that whatever we wrote
> +        * there was illegal/not supported by the hw in which case we need to make
> +        * sure we set a supported PPN and/or queue size.
> +        */
> +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> +       if (qbr_readback == qbr_val)
> +               goto irq;
> +
> +       dmam_free_coherent(dev, queue_size, q->base, q->base_dma);
> +
> +       /* Get supported queue size */
> +       order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qbr_readback) + 1;
> +       q->cnt = 1ULL << order;
> +       queue_size = q->len * q->cnt;
> +
> +       /*
> +        * In case we also failed to set PPN, it means the field is hardcoded and the
> +        * queue resides in I/O memory instead, so get its physical address and
> +        * ioremap it.
> +        */
> +       qbr_paddr = ppn_to_phys(qbr_readback);
> +       if (qbr_paddr != q->base_dma) {
> +               dev_info(dev,
> +                        "hardcoded ppn in %s base register, using io memory for the queue\n",
> +                        name);
> +               dev_info(dev, "queue length for %s set to %i\n", name, q->cnt);
> +               q->in_iomem = true;
> +               q->base = ioremap(qbr_paddr, queue_size);
> +               if (!q->base) {
> +                       dev_err(dev, "failed to map %s queue (cnt: %u)\n", name, q->cnt);
> +                       return -ENOMEM;
> +               }
> +               q->base_dma = qbr_paddr;
> +       } else {
> +               /*
> +                * We only failed to set the queue size, re-try to allocate memory with
> +                * the queue size supported by the hw.
> +                */
> +               dev_info(dev, "hardcoded queue size in %s base register\n", name);
> +               dev_info(dev, "retrying with queue length: %i\n", q->cnt);
> +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> +               if (!q->base) {
> +                       dev_err(dev, "failed to allocate %s queue (cnt: %u)\n",
> +                               name, q->cnt);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       qbr_val = phys_to_ppn(q->base_dma) |
> +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> +
> +       /* Final check to make sure hw accepted our write */
> +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> +       if (qbr_readback != qbr_val) {
> +               dev_err(dev, "failed to set base register for %s\n", name);
> +               goto fail;
> +       }
> +
> + irq:
> +       if (request_threaded_irq(irq, irq_check, irq_process, IRQF_ONESHOT | IRQF_SHARED,
> +                                dev_name(dev), q)) {
> +               dev_err(dev, "fail to request irq %d for %s\n", irq, name);
> +               goto fail;
> +       }
> +
> +       q->irq = irq;
> +
> +       /* Note: All RIO_xQ_EN/IE fields are in the same offsets */
> +       ret =
> +           riscv_iommu_queue_ctrl(iommu, q,
> +                                  RISCV_IOMMU_QUEUE_ENABLE |
> +                                  RISCV_IOMMU_QUEUE_INTR_ENABLE);
> +       if (ret & RISCV_IOMMU_QUEUE_BUSY) {
> +               dev_err(dev, "%s init timeout\n", name);
> +               ret = -EBUSY;
> +               goto fail;
> +       }
> +
> +       return 0;
> +
> + fail:
> +       riscv_iommu_queue_free(iommu, q);
> +       return 0;
> +}
> +
> +/*
> + * I/O MMU Command queue chapter 3.1
> + */
> +
> +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 =
> +           FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
> +                      RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> +                                                                    RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> +                                                 u64 addr)
> +{
> +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> +       cmd->dword1 = addr;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> +                                                  unsigned pscid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> +           RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> +                                                  unsigned gscid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> +           RISCV_IOMMU_CMD_IOTINVAL_GV;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> +                                                 u64 addr, u32 data)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> +           FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | RISCV_IOMMU_CMD_IOFENCE_AV;
> +       cmd->dword1 = (addr >> 2);
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> +                                                unsigned devid)
> +{
> +       cmd->dword0 |=
> +           FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | RISCV_IOMMU_CMD_IODIR_DV;
> +}
> +
> +/* TODO: Convert into lock-less MPSC implementation. */
> +static bool riscv_iommu_post_sync(struct riscv_iommu_device *iommu,
> +                                 struct riscv_iommu_command *cmd, bool sync)
> +{
> +       u32 head, tail, next, last;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&iommu->cq_lock, flags);
> +       head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);
> +       tail = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> +       last = iommu->cmdq.lui;
> +       if (tail != last) {
> +               spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +               /*
> +                * FIXME: This is a workaround for dropped MMIO writes/reads on QEMU platform.
> +                *        While debugging of the problem is still ongoing, this provides
> +                *        a simple impolementation of try-again policy.
> +                *        Will be changed to lock-less algorithm in the feature.
> +                */
> +               dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (1st)\n", last, tail);
> +               spin_lock_irqsave(&iommu->cq_lock, flags);
> +               tail =
> +                   riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> +               last = iommu->cmdq.lui;
> +               if (tail != last) {
> +                       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +                       dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (2nd)\n", last, tail);
> +                       spin_lock_irqsave(&iommu->cq_lock, flags);
> +               }
> +       }
> +
> +       next = (last + 1) & (iommu->cmdq.cnt - 1);
> +       if (next != head) {
> +               struct riscv_iommu_command *ptr = iommu->cmdq.base;
> +               ptr[last] = *cmd;
> +               wmb();
> +               riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, next);
> +               iommu->cmdq.lui = next;
> +       }
> +
> +       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +
> +       if (sync && head != next) {
> +               cycles_t start_time = get_cycles();
> +               while (1) {
> +                       last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
> +                           (iommu->cmdq.cnt - 1);
> +                       if (head < next && last >= next)
> +                               break;
> +                       if (head > next && last < head && last >= next)
> +                               break;
> +                       if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {

This condition will be imprecise, because we are not in an irq-disabled
context here, so this thread can be scheduled out or preempted. When we
come back, more than 1 second may have passed even though the IOFENCE
has actually completed.
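
One way to make this robust against preemption, sketched here only as an
illustration, is to re-read CQH one last time after the deadline expires and
only then declare a timeout:

	if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {
		/* Re-check: the fence may have completed while this
		 * thread was scheduled out or preempted. */
		last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
		    (iommu->cmdq.cnt - 1);
		if ((head < next && last >= next) ||
		    (head > next && last < head && last >= next))
			break;
		dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
		return false;
	}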

> +                               dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
> +                               return false;
> +                       }
> +                       cpu_relax();
> +               }
> +       }
> +
> +       return next != head;
> +}
> +
> +static bool riscv_iommu_post(struct riscv_iommu_device *iommu,
> +                            struct riscv_iommu_command *cmd)
> +{
> +       return riscv_iommu_post_sync(iommu, cmd, false);
> +}
> +
> +static bool riscv_iommu_iofence_sync(struct riscv_iommu_device *iommu)
> +{
> +       struct riscv_iommu_command cmd;
> +       riscv_iommu_cmd_iofence(&cmd);
> +       return riscv_iommu_post_sync(iommu, &cmd, true);
> +}
> +
> +/* Command queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu =
> +           container_of(q, struct riscv_iommu_device, cmdq);
> +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +       if (ipsr & RISCV_IOMMU_IPSR_CIP)
> +               return IRQ_WAKE_THREAD;
> +       return IRQ_NONE;
> +}
> +
> +/* Command queue interrupt hanlder thread function */
> +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu;
> +       unsigned ctrl;
> +
> +       iommu = container_of(q, struct riscv_iommu_device, cmdq);
> +
> +       /* Error reporting, clear error reports if any. */
> +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQCSR);
> +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF |
> +                   RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
> +               riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
> +               dev_warn_ratelimited(iommu->dev,
> +                                    "Command queue error: fault: %d tout: %d err: %d\n",
> +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL));

We need to handle the error by either adjusting the tail to remove the
failed command or fixing the failed command itself. Otherwise, the
failed command will stay in the queue and the IOMMU will keep trying to
execute it. I guess the first option might be easier to implement.
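
A rough sketch of the first option (illustrative only; the exact recovery
sequence should be double-checked against the spec): rewind the software
producer index to the head reported by the hardware so the faulting command
is no longer pending, then clear the error bits as the code already does:

	if (ctrl & (RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
		u32 head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
		    (iommu->cmdq.cnt - 1);

		/* Drop the faulting command (and anything queued after it)
		 * by moving the tail back to the reported head. */
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, head);
		iommu->cmdq.lui = head;
	}
	riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);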

> +       }
> +
> +       /* Clear fault interrupt pending. */
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_CIP);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +/*
> + * Fault/event queue, chapter 3.2
> + */
> +
> +static void riscv_iommu_fault_report(struct riscv_iommu_device *iommu,
> +                                    struct riscv_iommu_fq_record *event)
> +{
> +       unsigned err, devid;
> +
> +       err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> +       devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> +
> +       dev_warn_ratelimited(iommu->dev,
> +                            "Fault %d devid: %d" " iotval: %llx iotval2: %llx\n", err,
> +                            devid, event->iotval, event->iotval2);
> +}
> +
> +/* Fault/event queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu =
> +           container_of(q, struct riscv_iommu_device, fltq);
> +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +       if (ipsr & RISCV_IOMMU_IPSR_FIP)
> +               return IRQ_WAKE_THREAD;
> +       return IRQ_NONE;
> +}
> +
> +/* Fault queue interrupt hanlder thread function */
> +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu;
> +       struct riscv_iommu_fq_record *events;
> +       unsigned cnt, len, idx, ctrl;
> +
> +       iommu = container_of(q, struct riscv_iommu_device, fltq);
> +       events = (struct riscv_iommu_fq_record *)q->base;
> +
> +       /* Error reporting, clear error reports if any. */
> +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FQCSR);
> +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> +               riscv_iommu_queue_ctrl(iommu, &iommu->fltq, ctrl);
> +               dev_warn_ratelimited(iommu->dev,
> +                                    "Fault queue error: fault: %d full: %d\n",
> +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> +       }
> +
> +       /* Clear fault interrupt pending. */
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_FIP);
> +
> +       /* Report fault events. */
> +       do {
> +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> +               if (!cnt)
> +                       break;
> +               for (len = 0; len < cnt; idx++, len++)
> +                       riscv_iommu_fault_report(iommu, &events[idx]);
> +               riscv_iommu_queue_release(iommu, q, cnt);
> +       } while (1);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +/*
> + * Page request queue, chapter 3.3
> + */
> +
>  /*
>   * Register device for IOMMU tracking.
>   */
> @@ -97,6 +600,54 @@ static void riscv_iommu_add_device(struct riscv_iommu_device *iommu, struct devi
>         mutex_unlock(&iommu->eps_mutex);
>  }
>
> +/* Page request interface queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu =
> +           container_of(q, struct riscv_iommu_device, priq);
> +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +       if (ipsr & RISCV_IOMMU_IPSR_PIP)
> +               return IRQ_WAKE_THREAD;
> +       return IRQ_NONE;
> +}
> +
> +/* Page request interface queue interrupt hanlder thread function */
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu;
> +       struct riscv_iommu_pq_record *requests;
> +       unsigned cnt, idx, ctrl;
> +
> +       iommu = container_of(q, struct riscv_iommu_device, priq);
> +       requests = (struct riscv_iommu_pq_record *)q->base;
> +
> +       /* Error reporting, clear error reports if any. */
> +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_PQCSR);
> +       if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
> +               riscv_iommu_queue_ctrl(iommu, &iommu->priq, ctrl);
> +               dev_warn_ratelimited(iommu->dev,
> +                                    "Page request queue error: fault: %d full: %d\n",
> +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
> +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
> +       }
> +
> +       /* Clear page request interrupt pending. */
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PIP);
> +
> +       /* Process page requests. */
> +       do {
> +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> +               if (!cnt)
> +                       break;
> +               dev_warn(iommu->dev, "unexpected %u page requests\n", cnt);
> +               riscv_iommu_queue_release(iommu, q, cnt);
> +       } while (1);
> +
> +       return IRQ_HANDLED;
> +}
> +
>  /*
>   * Endpoint management
>   */
> @@ -350,7 +901,29 @@ static void riscv_iommu_flush_iotlb_range(struct iommu_domain *iommu_domain,
>                                           unsigned long *start, unsigned long *end,
>                                           size_t *pgsize)
>  {
> -       /* Command interface not implemented */
> +       struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> +       struct riscv_iommu_command cmd;
> +       unsigned long iova;
> +
> +       if (domain->mode == RISCV_IOMMU_DC_FSC_MODE_BARE)
> +               return;
> +
> +       /* Domain not attached to an IOMMU! */
> +       BUG_ON(!domain->iommu);
> +
> +       riscv_iommu_cmd_inval_vma(&cmd);
> +       riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> +
> +       if (start && end && pgsize) {
> +               /* Cover only the range that is needed */
> +               for (iova = *start; iova <= *end; iova += *pgsize) {
> +                       riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> +                       riscv_iommu_post(domain->iommu, &cmd);
> +               }
> +       } else {
> +               riscv_iommu_post(domain->iommu, &cmd);
> +       }
> +       riscv_iommu_iofence_sync(domain->iommu);
>  }
>
>  static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
> @@ -610,6 +1183,9 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>         iommu_device_unregister(&iommu->iommu);
>         iommu_device_sysfs_remove(&iommu->iommu);
>         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
> +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> +       riscv_iommu_queue_free(iommu, &iommu->priq);
>  }
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -632,6 +1208,16 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>         }
>  #endif
>
> +       /*
> +        * Assign queue lengths from module parameters if not already
> +        * set on the device tree.
> +        */
> +       if (!iommu->cmdq_len)
> +               iommu->cmdq_len = cmdq_length;
> +       if (!iommu->fltq_len)
> +               iommu->fltq_len = fltq_length;
> +       if (!iommu->priq_len)
> +               iommu->priq_len = priq_length;
>         /* Clear any pending interrupt flag. */
>         riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR,
>                            RISCV_IOMMU_IPSR_CIP |
> @@ -639,7 +1225,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>                            RISCV_IOMMU_IPSR_PMIP | RISCV_IOMMU_IPSR_PIP);
>         spin_lock_init(&iommu->cq_lock);
>         mutex_init(&iommu->eps_mutex);
> +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_COMMAND_QUEUE);
> +       if (ret)
> +               goto fail;
> +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_FAULT_QUEUE);
> +       if (ret)
> +               goto fail;
> +       if (!(iommu->cap & RISCV_IOMMU_CAP_ATS))
> +               goto no_ats;
> +
> +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_PAGE_REQUEST_QUEUE);
> +       if (ret)
> +               goto fail;
>
> + no_ats:
>         ret = riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_BARE);
>
>         if (ret) {
> @@ -663,5 +1262,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>         return 0;
>   fail:
>         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +       riscv_iommu_queue_free(iommu, &iommu->priq);
> +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
>         return ret;
>  }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 7dc9baa59a50..04148a2a8ffd 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -28,6 +28,24 @@
>  #define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
>  #define IOMMU_PAGE_SIZE_512G   BIT_ULL(39)
>
> +struct riscv_iommu_queue {
> +       dma_addr_t base_dma;    /* ring buffer bus address */
> +       void *base;             /* ring buffer pointer */
> +       size_t len;             /* single item length */
> +       u32 cnt;                /* items count */
> +       u32 lui;                /* last used index, consumer/producer share */
> +       unsigned qbr;           /* queue base register offset */
> +       unsigned qcr;           /* queue control and status register offset */
> +       int irq;                /* registered interrupt number */
> +       bool in_iomem;          /* indicates queue data are in I/O memory  */
> +};
> +
> +enum riscv_queue_ids {
> +       RISCV_IOMMU_COMMAND_QUEUE       = 0,
> +       RISCV_IOMMU_FAULT_QUEUE         = 1,
> +       RISCV_IOMMU_PAGE_REQUEST_QUEUE  = 2
> +};
> +
>  struct riscv_iommu_device {
>         struct iommu_device iommu;      /* iommu core interface */
>         struct device *dev;             /* iommu hardware */
> @@ -42,6 +60,11 @@ struct riscv_iommu_device {
>         int irq_pm;
>         int irq_priq;
>
> +       /* Queue lengths */
> +       int cmdq_len;
> +       int fltq_len;
> +       int priq_len;
> +
>         /* supported and enabled hardware capabilities */
>         u64 cap;
>
> @@ -53,6 +76,11 @@ struct riscv_iommu_device {
>         unsigned ddt_mode;
>         bool ddtp_in_iomem;
>
> +       /* hardware queues */
> +       struct riscv_iommu_queue cmdq;
> +       struct riscv_iommu_queue fltq;
> +       struct riscv_iommu_queue priq;
> +
>         /* Connected end-points */
>         struct rb_root eps;
>         struct mutex eps_mutex;
> --
> 2.34.1
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Nick Kossifidis July 31, 2023, 9:32 a.m. UTC | #10
On 7/29/23 15:58, Zong Li wrote:
> On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
>> +
>> +       /* For now we only support WSIs until we have AIA support */
> 
> I'm not completely understand AIA support here, because I saw the pci
> case uses the MSI, and kernel seems to have the AIA implementation.
> Could you please elaborate it?
> 

When I wrote this we didn't have AIA in the kernel, and without IMSIC we 
can't have MSIs in the hart (we can still have MSIs in the PCIe controller).

> 
> Should we define the "interrupt-names" in dt-bindings?
> 

Yes we should, along with queue lengths below.

>> +
>> +       /* Make sure fctl.WSI is set */
>> +       fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
>> +       fctl |= RISCV_IOMMU_FCTL_WSI;
>> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
>> +
>> +       /* Parse Queue lengths */
>> +       ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
>> +       if (!ret)
>> +               dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
>> +
>> +       ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
>> +       if (!ret)
>> +               dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
>> +
>> +       ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
>> +       if (!ret)
>> +               dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
>> +
>>          dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
>>
Zong Li July 31, 2023, 1:15 p.m. UTC | #11
On Mon, Jul 31, 2023 at 5:32 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
>
> On 7/29/23 15:58, Zong Li wrote:
> > On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> >> +
> >> +       /* For now we only support WSIs until we have AIA support */
> >
> > I'm not completely understand AIA support here, because I saw the pci
> > case uses the MSI, and kernel seems to have the AIA implementation.
> > Could you please elaborate it?
> >
>
> When I wrote this we didn't have AIA in the kernel, and without IMSIC we
> can't have MSIs in the hart (we can still have MSIs in the PCIe controller).

Thanks for your clarification. Do we support MSI in the next version?

>
> >
> > Should we define the "interrupt-names" in dt-bindings?
> >
>
> Yes we should, along with queue lengths below.
>
> >> +
> >> +       /* Make sure fctl.WSI is set */
> >> +       fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> >> +       fctl |= RISCV_IOMMU_FCTL_WSI;
> >> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
> >> +
> >> +       /* Parse Queue lengths */
> >> +       ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> >> +       if (!ret)
> >> +               dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> >> +
> >> +       ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> >> +       if (!ret)
> >> +               dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> >> +
> >> +       ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> >> +       if (!ret)
> >> +               dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> >> +
> >>          dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> >>
Nick Kossifidis July 31, 2023, 11:35 p.m. UTC | #12
On 7/31/23 16:15, Zong Li wrote:
> On Mon, Jul 31, 2023 at 5:32 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
>>
>> On 7/29/23 15:58, Zong Li wrote:
>>> On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>>>> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
>>>> +
>>>> +       /* For now we only support WSIs until we have AIA support */
>>>
>>> I'm not completely understand AIA support here, because I saw the pci
>>> case uses the MSI, and kernel seems to have the AIA implementation.
>>> Could you please elaborate it?
>>>
>>
>> When I wrote this we didn't have AIA in the kernel, and without IMSIC we
>> can't have MSIs in the hart (we can still have MSIs in the PCIe controller).
> 
> Thanks for your clarification, do we support the MSI in next version?
> 

I don't think there is an IOMMU implementation out there (emulated or in 
hw) that can do MSIs and is not a pcie device (the QEMU implementation 
is a pcie device). If we have something to test this against, and we 
also have an IMSIC etc, we can work on that.
Zong Li Aug. 1, 2023, 12:37 a.m. UTC | #13
On Tue, Aug 1, 2023 at 7:35 AM Nick Kossifidis <mick@ics.forth.gr> wrote:
>
> On 7/31/23 16:15, Zong Li wrote:
> > On Mon, Jul 31, 2023 at 5:32 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> >>
> >> On 7/29/23 15:58, Zong Li wrote:
> >>> On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >>>> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> >>>> +
> >>>> +       /* For now we only support WSIs until we have AIA support */
> >>>
> >>> I'm not completely understand AIA support here, because I saw the pci
> >>> case uses the MSI, and kernel seems to have the AIA implementation.
> >>> Could you please elaborate it?
> >>>
> >>
> >> When I wrote this we didn't have AIA in the kernel, and without IMSIC we
> >> can't have MSIs in the hart (we can still have MSIs in the PCIe controller).
> >
> > Thanks for your clarification, do we support the MSI in next version?
> >
>
> I don't think there is an IOMMU implementation out there (emulated or in
> hw) that can do MSIs and is not a pcie device (the QEMU implementation
> is a pcie device). If we have something to test this against, and we
> also have an IMSIC etc, we can work on that.

I guess I can assist with that. We have IOMMU hardware (a non-PCIe
device) that has already implemented the MSI functionality, and I have
tested it. Perhaps I can add the related implementation here after this
series is merged.
Tomasz Jeznach Aug. 2, 2023, 8:28 p.m. UTC | #14
On Mon, Jul 31, 2023 at 5:38 PM Zong Li <zong.li@sifive.com> wrote:
>
> On Tue, Aug 1, 2023 at 7:35 AM Nick Kossifidis <mick@ics.forth.gr> wrote:
> >
> > On 7/31/23 16:15, Zong Li wrote:
> > > On Mon, Jul 31, 2023 at 5:32 PM Nick Kossifidis <mick@ics.forth.gr> wrote:
> > >>
> > >> On 7/29/23 15:58, Zong Li wrote:
> > >>> On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> > >>>> +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> > >>>> +
> > >>>> +       /* For now we only support WSIs until we have AIA support */
> > >>>
> > >>> I'm not completely understand AIA support here, because I saw the pci
> > >>> case uses the MSI, and kernel seems to have the AIA implementation.
> > >>> Could you please elaborate it?
> > >>>
> > >>
> > >> When I wrote this we didn't have AIA in the kernel, and without IMSIC we
> > >> can't have MSIs in the hart (we can still have MSIs in the PCIe controller).
> > >
> > > Thanks for your clarification, do we support the MSI in next version?
> > >
> >
> > I don't think there is an IOMMU implementation out there (emulated or in
> > hw) that can do MSIs and is not a pcie device (the QEMU implementation
> > is a pcie device). If we have something to test this against, and we
> > also have an IMSIC etc, we can work on that.
>
> I guess I can assist with that. We have an IOMMU hardware (non-pcie
> device) that has already implemented the MSI functionality, and I have
> conducted testing on it. Perhaps let me add the related implementation
> here after this series is merged.

Thanks, getting MSI support for non-PCIe IOMMU hardware would be great!
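
For what it's worth, a rough sketch of one possible direction for the
platform MSI path. This is illustrative only: the
RISCV_IOMMU_REG_MSI_ADDR/_DATA/_VEC_CTL macros are assumed names for the
MSI configuration table entries and are not part of this series, while
platform_msi_domain_alloc_irqs() and msi_get_virq() are existing kernel
APIs.

static void riscv_iommu_write_msi_msg(struct msi_desc *desc,
                                      struct msi_msg *msg)
{
        struct riscv_iommu_device *iommu =
                dev_get_drvdata(msi_desc_to_dev(desc));
        u16 idx = desc->msi_index;

        /* Program one MSI configuration table entry (assumed macros). */
        riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_MSI_ADDR(idx),
                           ((u64)msg->address_hi << 32) | msg->address_lo);
        riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_DATA(idx), msg->data);
        riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_VEC_CTL(idx), 0);
}

static int riscv_iommu_platform_msi_init(struct riscv_iommu_device *iommu)
{
        u32 fctl;
        int ret;

        ret = platform_msi_domain_alloc_irqs(iommu->dev, RISCV_IOMMU_INTR_COUNT,
                                             riscv_iommu_write_msi_msg);
        if (ret)
                return ret;

        iommu->irq_cmdq = msi_get_virq(iommu->dev, RISCV_IOMMU_INTR_CQ);
        iommu->irq_fltq = msi_get_virq(iommu->dev, RISCV_IOMMU_INTR_FQ);
        /* pm/priq vectors would be looked up the same way when present. */

        /* Clear fctl.WSI so events are signaled via MSI instead of wires. */
        fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
        riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
                           fctl & ~RISCV_IOMMU_FCTL_WSI);

        return 0;
}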

best,
- Tomasz
Tomasz Jeznach Aug. 2, 2023, 8:50 p.m. UTC | #15
On Sat, Jul 29, 2023 at 5:58 AM Zong Li <zong.li@sifive.com> wrote:
>
> On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >
> > Enables message or wire signal interrupts for PCIe and platforms devices.
> >
> > Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> > Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > ---
> >  drivers/iommu/riscv/iommu-pci.c      |  72 ++++
> >  drivers/iommu/riscv/iommu-platform.c |  66 +++
> >  drivers/iommu/riscv/iommu.c          | 604 ++++++++++++++++++++++++++-
> >  drivers/iommu/riscv/iommu.h          |  28 ++
> >  4 files changed, 769 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
> > index c91f963d7a29..9ea0647f7b92 100644
> > --- a/drivers/iommu/riscv/iommu-pci.c
> > +++ b/drivers/iommu/riscv/iommu-pci.c
> > @@ -34,6 +34,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> >  {
> >         struct device *dev = &pdev->dev;
> >         struct riscv_iommu_device *iommu;
> > +       u64 icvec;
> >         int ret;
> >
> >         ret = pci_enable_device_mem(pdev);
> > @@ -67,14 +68,84 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> >         iommu->dev = dev;
> >         dev_set_drvdata(dev, iommu);
> >
> > +       /* Check device reported capabilities. */
> > +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> > +
> > +       /* The PCI driver only uses MSIs, make sure the IOMMU supports this */
> > +       switch (FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap)) {
> > +       case RISCV_IOMMU_CAP_IGS_MSI:
> > +       case RISCV_IOMMU_CAP_IGS_BOTH:
> > +               break;
> > +       default:
> > +               dev_err(dev, "unable to use message-signaled interrupts\n");
> > +               ret = -ENODEV;
> > +               goto fail;
> > +       }
> > +
> >         dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> >         pci_set_master(pdev);
> >
> > +       /* Allocate and assign IRQ vectors for the various events */
> > +       ret = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, PCI_IRQ_MSIX);
> > +       if (ret < 0) {
> > +               dev_err(dev, "unable to allocate irq vectors\n");
> > +               goto fail;
> > +       }
> > +
> > +       ret = -ENODEV;
> > +
> > +       iommu->irq_cmdq = msi_get_virq(dev, RISCV_IOMMU_INTR_CQ);
> > +       if (!iommu->irq_cmdq) {
> > +               dev_warn(dev, "no MSI vector %d for the command queue\n",
> > +                        RISCV_IOMMU_INTR_CQ);
> > +               goto fail;
> > +       }
> > +
> > +       iommu->irq_fltq = msi_get_virq(dev, RISCV_IOMMU_INTR_FQ);
> > +       if (!iommu->irq_fltq) {
> > +               dev_warn(dev, "no MSI vector %d for the fault/event queue\n",
> > +                        RISCV_IOMMU_INTR_FQ);
> > +               goto fail;
> > +       }
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> > +               iommu->irq_pm = msi_get_virq(dev, RISCV_IOMMU_INTR_PM);
> > +               if (!iommu->irq_pm) {
> > +                       dev_warn(dev,
> > +                                "no MSI vector %d for performance monitoring\n",
> > +                                RISCV_IOMMU_INTR_PM);
> > +                       goto fail;
> > +               }
> > +       }
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> > +               iommu->irq_priq = msi_get_virq(dev, RISCV_IOMMU_INTR_PQ);
> > +               if (!iommu->irq_priq) {
> > +                       dev_warn(dev,
> > +                                "no MSI vector %d for page-request queue\n",
> > +                                RISCV_IOMMU_INTR_PQ);
> > +                       goto fail;
> > +               }
> > +       }
> > +
> > +       /* Set simple 1:1 mapping for MSI vectors */
> > +       icvec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
> > +           FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ);
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM)
> > +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM);
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS)
> > +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
> > +
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, icvec);
> > +
> >         ret = riscv_iommu_init(iommu);
> >         if (!ret)
> >                 return ret;
> >
> >   fail:
> > +       pci_free_irq_vectors(pdev);
> >         pci_clear_master(pdev);
> >         pci_release_regions(pdev);
> >         pci_disable_device(pdev);
> > @@ -85,6 +156,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> >  static void riscv_iommu_pci_remove(struct pci_dev *pdev)
> >  {
> >         riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
> > +       pci_free_irq_vectors(pdev);
> >         pci_clear_master(pdev);
> >         pci_release_regions(pdev);
> >         pci_disable_device(pdev);
> > diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
> > index e4e8ca6711e7..35935d3c7ef4 100644
> > --- a/drivers/iommu/riscv/iommu-platform.c
> > +++ b/drivers/iommu/riscv/iommu-platform.c
> > @@ -20,6 +20,8 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
> >         struct device *dev = &pdev->dev;
> >         struct riscv_iommu_device *iommu = NULL;
> >         struct resource *res = NULL;
> > +       u32 fctl = 0;
> > +       int irq = 0;
> >         int ret = 0;
> >
> >         iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
> > @@ -53,6 +55,70 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
> >                 goto fail;
> >         }
> >
> > +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> > +
> > +       /* For now we only support WSIs until we have AIA support */
>
> I'm not completely understand AIA support here, because I saw the pci
> case uses the MSI, and kernel seems to have the AIA implementation.
> Could you please elaborate it?
>
> > +       ret = FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap);
> > +       if (ret == RISCV_IOMMU_CAP_IGS_MSI) {
> > +               dev_err(dev, "IOMMU only supports MSIs\n");
> > +               goto fail;
> > +       }
> > +
> > +       /* Parse IRQ assignment */
> > +       irq = platform_get_irq_byname_optional(pdev, "cmdq");
> > +       if (irq > 0)
> > +               iommu->irq_cmdq = irq;
> > +       else {
> > +               dev_err(dev, "no IRQ provided for the command queue\n");
> > +               goto fail;
> > +       }
> > +
> > +       irq = platform_get_irq_byname_optional(pdev, "fltq");
> > +       if (irq > 0)
> > +               iommu->irq_fltq = irq;
> > +       else {
> > +               dev_err(dev, "no IRQ provided for the fault/event queue\n");
> > +               goto fail;
> > +       }
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> > +               irq = platform_get_irq_byname_optional(pdev, "pm");
> > +               if (irq > 0)
> > +                       iommu->irq_pm = irq;
> > +               else {
> > +                       dev_err(dev, "no IRQ provided for performance monitoring\n");
> > +                       goto fail;
> > +               }
> > +       }
> > +
> > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> > +               irq = platform_get_irq_byname_optional(pdev, "priq");
> > +               if (irq > 0)
> > +                       iommu->irq_priq = irq;
> > +               else {
> > +                       dev_err(dev, "no IRQ provided for the page-request queue\n");
> > +                       goto fail;
> > +               }
> > +       }
>
> Should we define the "interrupt-names" in dt-bindings?
>

Yes, this was brought up earlier wrt dt-bindings.

I'm considering removing the interrupt names from the DT (and the
get-byname lookup), since the IOMMU's cause-to-vector remapping register
`icvec` should be used to map each interrupt source to an actual
interrupt vector. If possible, the device driver should map causes to
interrupts itself (based on the number of vectors available) or rely on
the WARL properties of ICVEC to discover a fixed cause-to-vector mapping
in the hardware.

Please let me know if this is a reasonable change.
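
For concreteness, the probe side could look roughly like the sketch
below. riscv_iommu_vec_to_irq() is a made-up helper standing in for
"return the Linux IRQ backing vector n"; everything else uses the
register and field names already used in this patch.

        u64 ivec;

        /* Request a simple 1:1 cause-to-vector mapping... */
        ivec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
               FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ) |
               FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM) |
               FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
        riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, ivec);

        /* ...then honor whatever the WARL register actually kept. */
        ivec = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IVEC);
        iommu->irq_cmdq = riscv_iommu_vec_to_irq(iommu,
                                FIELD_GET(RISCV_IOMMU_IVEC_CIV, ivec));
        iommu->irq_fltq = riscv_iommu_vec_to_irq(iommu,
                                FIELD_GET(RISCV_IOMMU_IVEC_FIV, ivec));
        iommu->irq_pm = riscv_iommu_vec_to_irq(iommu,
                                FIELD_GET(RISCV_IOMMU_IVEC_PMIV, ivec));
        iommu->irq_priq = riscv_iommu_vec_to_irq(iommu,
                                FIELD_GET(RISCV_IOMMU_IVEC_PIV, ivec));

That way the DT only has to describe the vectors themselves and the
driver never needs per-cause interrupt names.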

> > +
> > +       /* Make sure fctl.WSI is set */
> > +       fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> > +       fctl |= RISCV_IOMMU_FCTL_WSI;
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
> > +
> > +       /* Parse Queue lengts */
> > +       ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > +       if (!ret)
> > +               dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > +
> > +       ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > +       if (!ret)
> > +               dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > +
> > +       ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > +       if (!ret)
> > +               dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > +
> >         dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> >
> >         return riscv_iommu_init(iommu);
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index 31dc3c458e13..5c4cf9875302 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -45,6 +45,18 @@ static int ddt_mode = RISCV_IOMMU_DDTP_MODE_BARE;
> >  module_param(ddt_mode, int, 0644);
> >  MODULE_PARM_DESC(ddt_mode, "Device Directory Table mode.");
> >
> > +static int cmdq_length = 1024;
> > +module_param(cmdq_length, int, 0644);
> > +MODULE_PARM_DESC(cmdq_length, "Command queue length.");
> > +
> > +static int fltq_length = 1024;
> > +module_param(fltq_length, int, 0644);
> > +MODULE_PARM_DESC(fltq_length, "Fault queue length.");
> > +
> > +static int priq_length = 1024;
> > +module_param(priq_length, int, 0644);
> > +MODULE_PARM_DESC(priq_length, "Page request interface queue length.");
> > +
> >  /* IOMMU PSCID allocation namespace. */
> >  #define RISCV_IOMMU_MAX_PSCID  (1U << 20)
> >  static DEFINE_IDA(riscv_iommu_pscids);
> > @@ -65,6 +77,497 @@ static DEFINE_IDA(riscv_iommu_pscids);
> >  static const struct iommu_domain_ops riscv_iommu_domain_ops;
> >  static const struct iommu_ops riscv_iommu_ops;
> >
> > +/*
> > + * Common queue management routines
> > + */
> > +
> > +/* Note: offsets are the same for all queues */
> > +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> > +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> > +
> > +static unsigned riscv_iommu_queue_consume(struct riscv_iommu_device *iommu,
> > +                                         struct riscv_iommu_queue *q, unsigned *ready)
> > +{
> > +       u32 tail = riscv_iommu_readl(iommu, Q_TAIL(q));
> > +       *ready = q->lui;
> > +
> > +       BUG_ON(q->cnt <= tail);
> > +       if (q->lui <= tail)
> > +               return tail - q->lui;
> > +       return q->cnt - q->lui;
> > +}
> > +
> > +static void riscv_iommu_queue_release(struct riscv_iommu_device *iommu,
> > +                                     struct riscv_iommu_queue *q, unsigned count)
> > +{
> > +       q->lui = (q->lui + count) & (q->cnt - 1);
> > +       riscv_iommu_writel(iommu, Q_HEAD(q), q->lui);
> > +}
> > +
> > +static u32 riscv_iommu_queue_ctrl(struct riscv_iommu_device *iommu,
> > +                                 struct riscv_iommu_queue *q, u32 val)
> > +{
> > +       cycles_t end_cycles = RISCV_IOMMU_TIMEOUT + get_cycles();
> > +
> > +       riscv_iommu_writel(iommu, q->qcr, val);
> > +       do {
> > +               val = riscv_iommu_readl(iommu, q->qcr);
> > +               if (!(val & RISCV_IOMMU_QUEUE_BUSY))
> > +                       break;
> > +               cpu_relax();
> > +       } while (get_cycles() < end_cycles);
> > +
> > +       return val;
> > +}
> > +
> > +static void riscv_iommu_queue_free(struct riscv_iommu_device *iommu,
> > +                                  struct riscv_iommu_queue *q)
> > +{
> > +       size_t size = q->len * q->cnt;
> > +
> > +       riscv_iommu_queue_ctrl(iommu, q, 0);
> > +
> > +       if (q->base) {
> > +               if (q->in_iomem)
> > +                       iounmap(q->base);
> > +               else
> > +                       dmam_free_coherent(iommu->dev, size, q->base, q->base_dma);
> > +       }
> > +       if (q->irq)
> > +               free_irq(q->irq, q);
> > +}
> > +
> > +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data);
> > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data);
> > +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data);
> > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data);
> > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > +
> > +static int riscv_iommu_queue_init(struct riscv_iommu_device *iommu, int queue_id)
> > +{
> > +       struct device *dev = iommu->dev;
> > +       struct riscv_iommu_queue *q = NULL;
> > +       size_t queue_size = 0;
> > +       irq_handler_t irq_check;
> > +       irq_handler_t irq_process;
> > +       const char *name;
> > +       int count = 0;
> > +       int irq = 0;
> > +       unsigned order = 0;
> > +       u64 qbr_val = 0;
> > +       u64 qbr_readback = 0;
> > +       u64 qbr_paddr = 0;
> > +       int ret = 0;
> > +
> > +       switch (queue_id) {
> > +       case RISCV_IOMMU_COMMAND_QUEUE:
> > +               q = &iommu->cmdq;
> > +               q->len = sizeof(struct riscv_iommu_command);
> > +               count = iommu->cmdq_len;
> > +               irq = iommu->irq_cmdq;
> > +               irq_check = riscv_iommu_cmdq_irq_check;
> > +               irq_process = riscv_iommu_cmdq_process;
> > +               q->qbr = RISCV_IOMMU_REG_CQB;
> > +               q->qcr = RISCV_IOMMU_REG_CQCSR;
> > +               name = "cmdq";
> > +               break;
> > +       case RISCV_IOMMU_FAULT_QUEUE:
> > +               q = &iommu->fltq;
> > +               q->len = sizeof(struct riscv_iommu_fq_record);
> > +               count = iommu->fltq_len;
> > +               irq = iommu->irq_fltq;
> > +               irq_check = riscv_iommu_fltq_irq_check;
> > +               irq_process = riscv_iommu_fltq_process;
> > +               q->qbr = RISCV_IOMMU_REG_FQB;
> > +               q->qcr = RISCV_IOMMU_REG_FQCSR;
> > +               name = "fltq";
> > +               break;
> > +       case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > +               q = &iommu->priq;
> > +               q->len = sizeof(struct riscv_iommu_pq_record);
> > +               count = iommu->priq_len;
> > +               irq = iommu->irq_priq;
> > +               irq_check = riscv_iommu_priq_irq_check;
> > +               irq_process = riscv_iommu_priq_process;
> > +               q->qbr = RISCV_IOMMU_REG_PQB;
> > +               q->qcr = RISCV_IOMMU_REG_PQCSR;
> > +               name = "priq";
> > +               break;
> > +       default:
> > +               dev_err(dev, "invalid queue interrupt index in queue_init!\n");
> > +               return -EINVAL;
> > +       }
> > +
> > +       /* Polling not implemented */
> > +       if (!irq)
> > +               return -ENODEV;
> > +
> > +       /* Allocate queue in memory and set the base register */
> > +       order = ilog2(count);
> > +       do {
> > +               queue_size = q->len * (1ULL << order);
> > +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> > +               if (q->base || queue_size < PAGE_SIZE)
> > +                       break;
> > +
> > +               order--;
> > +       } while (1);
> > +
> > +       if (!q->base) {
> > +               dev_err(dev, "failed to allocate %s queue (cnt: %u)\n", name, count);
> > +               return -ENOMEM;
> > +       }
> > +
> > +       q->cnt = 1ULL << order;
> > +
> > +       qbr_val = phys_to_ppn(q->base_dma) |
> > +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> > +
> > +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> > +
> > +       /*
> > +        * Queue base registers are WARL, so it's possible that whatever we wrote
> > +        * there was illegal/not supported by the hw in which case we need to make
> > +        * sure we set a supported PPN and/or queue size.
> > +        */
> > +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> > +       if (qbr_readback == qbr_val)
> > +               goto irq;
> > +
> > +       dmam_free_coherent(dev, queue_size, q->base, q->base_dma);
> > +
> > +       /* Get supported queue size */
> > +       order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qbr_readback) + 1;
> > +       q->cnt = 1ULL << order;
> > +       queue_size = q->len * q->cnt;
> > +
> > +       /*
> > +        * In case we also failed to set PPN, it means the field is hardcoded and the
> > +        * queue resides in I/O memory instead, so get its physical address and
> > +        * ioremap it.
> > +        */
> > +       qbr_paddr = ppn_to_phys(qbr_readback);
> > +       if (qbr_paddr != q->base_dma) {
> > +               dev_info(dev,
> > +                        "hardcoded ppn in %s base register, using io memory for the queue\n",
> > +                        name);
> > +               dev_info(dev, "queue length for %s set to %i\n", name, q->cnt);
> > +               q->in_iomem = true;
> > +               q->base = ioremap(qbr_paddr, queue_size);
> > +               if (!q->base) {
> > +                       dev_err(dev, "failed to map %s queue (cnt: %u)\n", name, q->cnt);
> > +                       return -ENOMEM;
> > +               }
> > +               q->base_dma = qbr_paddr;
> > +       } else {
> > +               /*
> > +                * We only failed to set the queue size, re-try to allocate memory with
> > +                * the queue size supported by the hw.
> > +                */
> > +               dev_info(dev, "hardcoded queue size in %s base register\n", name);
> > +               dev_info(dev, "retrying with queue length: %i\n", q->cnt);
> > +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> > +               if (!q->base) {
> > +                       dev_err(dev, "failed to allocate %s queue (cnt: %u)\n",
> > +                               name, q->cnt);
> > +                       return -ENOMEM;
> > +               }
> > +       }
> > +
> > +       qbr_val = phys_to_ppn(q->base_dma) |
> > +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> > +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> > +
> > +       /* Final check to make sure hw accepted our write */
> > +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> > +       if (qbr_readback != qbr_val) {
> > +               dev_err(dev, "failed to set base register for %s\n", name);
> > +               goto fail;
> > +       }
> > +
> > + irq:
> > +       if (request_threaded_irq(irq, irq_check, irq_process, IRQF_ONESHOT | IRQF_SHARED,
> > +                                dev_name(dev), q)) {
> > +               dev_err(dev, "fail to request irq %d for %s\n", irq, name);
> > +               goto fail;
> > +       }
> > +
> > +       q->irq = irq;
> > +
> > +       /* Note: All RIO_xQ_EN/IE fields are in the same offsets */
> > +       ret =
> > +           riscv_iommu_queue_ctrl(iommu, q,
> > +                                  RISCV_IOMMU_QUEUE_ENABLE |
> > +                                  RISCV_IOMMU_QUEUE_INTR_ENABLE);
> > +       if (ret & RISCV_IOMMU_QUEUE_BUSY) {
> > +               dev_err(dev, "%s init timeout\n", name);
> > +               ret = -EBUSY;
> > +               goto fail;
> > +       }
> > +
> > +       return 0;
> > +
> > + fail:
> > +       riscv_iommu_queue_free(iommu, q);
> > +       return 0;
> > +}
> > +
> > +/*
> > + * I/O MMU Command queue chapter 3.1
> > + */
> > +
> > +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 =
> > +           FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
> > +                      RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> > +                                                                    RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > +                                                 u64 addr)
> > +{
> > +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > +       cmd->dword1 = addr;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> > +                                                  unsigned pscid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> > +           RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> > +                                                  unsigned gscid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> > +           RISCV_IOMMU_CMD_IOTINVAL_GV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> > +                                                 u64 addr, u32 data)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > +           FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | RISCV_IOMMU_CMD_IOFENCE_AV;
> > +       cmd->dword1 = (addr >> 2);
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> > +                                                unsigned devid)
> > +{
> > +       cmd->dword0 |=
> > +           FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | RISCV_IOMMU_CMD_IODIR_DV;
> > +}
> > +
> > +/* TODO: Convert into lock-less MPSC implementation. */
> > +static bool riscv_iommu_post_sync(struct riscv_iommu_device *iommu,
> > +                                 struct riscv_iommu_command *cmd, bool sync)
> > +{
> > +       u32 head, tail, next, last;
> > +       unsigned long flags;
> > +
> > +       spin_lock_irqsave(&iommu->cq_lock, flags);
> > +       head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);
> > +       tail = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> > +       last = iommu->cmdq.lui;
> > +       if (tail != last) {
> > +               spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > +               /*
> > +                * FIXME: This is a workaround for dropped MMIO writes/reads on QEMU platform.
> > +                *        While debugging of the problem is still ongoing, this provides
> > +                *        a simple implementation of try-again policy.
> > +                *        Will be changed to lock-less algorithm in the future.
> > +                */
> > +               dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (1st)\n", last, tail);
> > +               spin_lock_irqsave(&iommu->cq_lock, flags);
> > +               tail =
> > +                   riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> > +               last = iommu->cmdq.lui;
> > +               if (tail != last) {
> > +                       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > +                       dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (2nd)\n", last, tail);
> > +                       spin_lock_irqsave(&iommu->cq_lock, flags);
> > +               }
> > +       }
> > +
> > +       next = (last + 1) & (iommu->cmdq.cnt - 1);
> > +       if (next != head) {
> > +               struct riscv_iommu_command *ptr = iommu->cmdq.base;
> > +               ptr[last] = *cmd;
> > +               wmb();
> > +               riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, next);
> > +               iommu->cmdq.lui = next;
> > +       }
> > +
> > +       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > +
> > +       if (sync && head != next) {
> > +               cycles_t start_time = get_cycles();
> > +               while (1) {
> > +                       last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
> > +                           (iommu->cmdq.cnt - 1);
> > +                       if (head < next && last >= next)
> > +                               break;
> > +                       if (head > next && last < head && last >= next)
> > +                               break;
> > +                       if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {
>
> This condition will be imprecise, because we are not in an
> irq-disabled context here and can be scheduled out or preempted. When
> we come back, more than 1 second may have passed even though the
> IOFENCE has actually completed.
>

Good point. Thanks.
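
Something along these lines should make the wait robust against
preemption (a sketch only: RISCV_IOMMU_IOFENCE_TIMEOUT_MS is a made-up
constant and the completion checks simply mirror the ones in the loop
above):

static bool riscv_iommu_wait_cqh(struct riscv_iommu_device *iommu,
                                 u32 head, u32 next)
{
        unsigned long timeout = jiffies +
                msecs_to_jiffies(RISCV_IOMMU_IOFENCE_TIMEOUT_MS);
        bool expired;
        u32 last;

        do {
                /* Sample the deadline before reading CQH, so a fence that
                 * completed while we were preempted past the deadline is
                 * still reported as success. */
                expired = time_after(jiffies, timeout);
                last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
                       (iommu->cmdq.cnt - 1);
                if (head < next && last >= next)
                        return true;
                if (head > next && last < head && last >= next)
                        return true;
                if (expired)
                        return false;
                cpu_relax();
        } while (1);
}

riscv_iommu_post_sync() would then report the timeout only if this
helper returns false.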


> > +                               dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
> > +                               return false;
> > +                       }
> > +                       cpu_relax();
> > +               }
> > +       }
> > +
> > +       return next != head;
> > +}
> > +
> > +static bool riscv_iommu_post(struct riscv_iommu_device *iommu,
> > +                            struct riscv_iommu_command *cmd)
> > +{
> > +       return riscv_iommu_post_sync(iommu, cmd, false);
> > +}
> > +
> > +static bool riscv_iommu_iofence_sync(struct riscv_iommu_device *iommu)
> > +{
> > +       struct riscv_iommu_command cmd;
> > +       riscv_iommu_cmd_iofence(&cmd);
> > +       return riscv_iommu_post_sync(iommu, &cmd, true);
> > +}
> > +
> > +/* Command queue primary interrupt handler */
> > +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu =
> > +           container_of(q, struct riscv_iommu_device, cmdq);
> > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > +       if (ipsr & RISCV_IOMMU_IPSR_CIP)
> > +               return IRQ_WAKE_THREAD;
> > +       return IRQ_NONE;
> > +}
> > +
> > +/* Command queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu;
> > +       unsigned ctrl;
> > +
> > +       iommu = container_of(q, struct riscv_iommu_device, cmdq);
> > +
> > +       /* Error reporting, clear error reports if any. */
> > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQCSR);
> > +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF |
> > +                   RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
> > +               riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
> > +               dev_warn_ratelimited(iommu->dev,
> > +                                    "Command queue error: fault: %d tout: %d err: %d\n",
> > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL));
>
> We need to handle the error by either adjusting the tail to remove the
> failed command or fixing the failed command itself. Otherwise, the
> failed command will stay in the queue and the IOMMU will keep trying to
> execute it. I guess the first option might be easier to implement.
>

Correct. Thanks for pointing this out.
Error handling / recovery was not pushed in this series. There is a
work-in-progress series to handle various types of failures, including
command processing errors, DDT misconfiguration, queue overflows,
handling of device-reported faults, etc. I can bring some of the error
handling here, if needed; otherwise I'd prefer to keep it as a separate
series, sent out once this one is merged.
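
For reference, one possible shape for the illegal-command recovery, as a
sketch only: it assumes CQH still points at the offending entry while
the queue is stalled, and reuses the helpers from this patch.

static void riscv_iommu_cmdq_recover(struct riscv_iommu_device *iommu, u32 ctrl)
{
        struct riscv_iommu_command *ptr = iommu->cmdq.base;
        struct riscv_iommu_command nop;
        u32 head;

        if (ctrl & RISCV_IOMMU_CQCSR_CMD_ILL) {
                /* Overwrite the offending entry with a harmless IOFENCE.C
                 * so the queue can make progress once re-enabled. */
                head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
                       (iommu->cmdq.cnt - 1);
                riscv_iommu_cmd_iofence(&nop);
                ptr[head] = nop;
                /* Ensure the fixed entry is visible before restarting. */
                wmb();
        }

        /* W1C the error bits and keep the queue and interrupts enabled. */
        riscv_iommu_queue_ctrl(iommu, &iommu->cmdq,
                               ctrl | RISCV_IOMMU_QUEUE_ENABLE |
                               RISCV_IOMMU_QUEUE_INTR_ENABLE);
}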

> > +       }
> > +
> > +       /* Clear fault interrupt pending. */
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_CIP);
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> > +/*
> > + * Fault/event queue, chapter 3.2
> > + */
> > +
> > +static void riscv_iommu_fault_report(struct riscv_iommu_device *iommu,
> > +                                    struct riscv_iommu_fq_record *event)
> > +{
> > +       unsigned err, devid;
> > +
> > +       err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> > +       devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> > +
> > +       dev_warn_ratelimited(iommu->dev,
> > +                            "Fault %d devid: %d" " iotval: %llx iotval2: %llx\n", err,
> > +                            devid, event->iotval, event->iotval2);
> > +}
> > +
> > +/* Fault/event queue primary interrupt handler */
> > +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu =
> > +           container_of(q, struct riscv_iommu_device, fltq);
> > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > +       if (ipsr & RISCV_IOMMU_IPSR_FIP)
> > +               return IRQ_WAKE_THREAD;
> > +       return IRQ_NONE;
> > +}
> > +
> > +/* Fault queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu;
> > +       struct riscv_iommu_fq_record *events;
> > +       unsigned cnt, len, idx, ctrl;
> > +
> > +       iommu = container_of(q, struct riscv_iommu_device, fltq);
> > +       events = (struct riscv_iommu_fq_record *)q->base;
> > +
> > +       /* Error reporting, clear error reports if any. */
> > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FQCSR);
> > +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> > +               riscv_iommu_queue_ctrl(iommu, &iommu->fltq, ctrl);
> > +               dev_warn_ratelimited(iommu->dev,
> > +                                    "Fault queue error: fault: %d full: %d\n",
> > +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> > +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> > +       }
> > +
> > +       /* Clear fault interrupt pending. */
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_FIP);
> > +
> > +       /* Report fault events. */
> > +       do {
> > +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> > +               if (!cnt)
> > +                       break;
> > +               for (len = 0; len < cnt; idx++, len++)
> > +                       riscv_iommu_fault_report(iommu, &events[idx]);
> > +               riscv_iommu_queue_release(iommu, q, cnt);
> > +       } while (1);
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> > +/*
> > + * Page request queue, chapter 3.3
> > + */
> > +
> >  /*
> >   * Register device for IOMMU tracking.
> >   */
> > @@ -97,6 +600,54 @@ static void riscv_iommu_add_device(struct riscv_iommu_device *iommu, struct devi
> >         mutex_unlock(&iommu->eps_mutex);
> >  }
> >
> > +/* Page request interface queue primary interrupt handler */
> > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu =
> > +           container_of(q, struct riscv_iommu_device, priq);
> > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > +       if (ipsr & RISCV_IOMMU_IPSR_PIP)
> > +               return IRQ_WAKE_THREAD;
> > +       return IRQ_NONE;
> > +}
> > +
> > +/* Page request interface queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu;
> > +       struct riscv_iommu_pq_record *requests;
> > +       unsigned cnt, idx, ctrl;
> > +
> > +       iommu = container_of(q, struct riscv_iommu_device, priq);
> > +       requests = (struct riscv_iommu_pq_record *)q->base;
> > +
> > +       /* Error reporting, clear error reports if any. */
> > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_PQCSR);
> > +       if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
> > +               riscv_iommu_queue_ctrl(iommu, &iommu->priq, ctrl);
> > +               dev_warn_ratelimited(iommu->dev,
> > +                                    "Page request queue error: fault: %d full: %d\n",
> > +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
> > +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
> > +       }
> > +
> > +       /* Clear page request interrupt pending. */
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PIP);
> > +
> > +       /* Process page requests. */
> > +       do {
> > +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> > +               if (!cnt)
> > +                       break;
> > +               dev_warn(iommu->dev, "unexpected %u page requests\n", cnt);
> > +               riscv_iommu_queue_release(iommu, q, cnt);
> > +       } while (1);
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> >  /*
> >   * Endpoint management
> >   */
> > @@ -350,7 +901,29 @@ static void riscv_iommu_flush_iotlb_range(struct iommu_domain *iommu_domain,
> >                                           unsigned long *start, unsigned long *end,
> >                                           size_t *pgsize)
> >  {
> > -       /* Command interface not implemented */
> > +       struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> > +       struct riscv_iommu_command cmd;
> > +       unsigned long iova;
> > +
> > +       if (domain->mode == RISCV_IOMMU_DC_FSC_MODE_BARE)
> > +               return;
> > +
> > +       /* Domain not attached to an IOMMU! */
> > +       BUG_ON(!domain->iommu);
> > +
> > +       riscv_iommu_cmd_inval_vma(&cmd);
> > +       riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > +
> > +       if (start && end && pgsize) {
> > +               /* Cover only the range that is needed */
> > +               for (iova = *start; iova <= *end; iova += *pgsize) {
> > +                       riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > +                       riscv_iommu_post(domain->iommu, &cmd);
> > +               }
> > +       } else {
> > +               riscv_iommu_post(domain->iommu, &cmd);
> > +       }
> > +       riscv_iommu_iofence_sync(domain->iommu);
> >  }
> >
> >  static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
> > @@ -610,6 +1183,9 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> >         iommu_device_unregister(&iommu->iommu);
> >         iommu_device_sysfs_remove(&iommu->iommu);
> >         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
> > +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> > +       riscv_iommu_queue_free(iommu, &iommu->priq);
> >  }
> >
> >  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > @@ -632,6 +1208,16 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >         }
> >  #endif
> >
> > +       /*
> > +        * Assign queue lengths from module parameters if not already
> > +        * set on the device tree.
> > +        */
> > +       if (!iommu->cmdq_len)
> > +               iommu->cmdq_len = cmdq_length;
> > +       if (!iommu->fltq_len)
> > +               iommu->fltq_len = fltq_length;
> > +       if (!iommu->priq_len)
> > +               iommu->priq_len = priq_length;
> >         /* Clear any pending interrupt flag. */
> >         riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR,
> >                            RISCV_IOMMU_IPSR_CIP |
> > @@ -639,7 +1225,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >                            RISCV_IOMMU_IPSR_PMIP | RISCV_IOMMU_IPSR_PIP);
> >         spin_lock_init(&iommu->cq_lock);
> >         mutex_init(&iommu->eps_mutex);
> > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_COMMAND_QUEUE);
> > +       if (ret)
> > +               goto fail;
> > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_FAULT_QUEUE);
> > +       if (ret)
> > +               goto fail;
> > +       if (!(iommu->cap & RISCV_IOMMU_CAP_ATS))
> > +               goto no_ats;
> > +
> > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_PAGE_REQUEST_QUEUE);
> > +       if (ret)
> > +               goto fail;
> >
> > + no_ats:
> >         ret = riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_BARE);
> >
> >         if (ret) {
> > @@ -663,5 +1262,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >         return 0;
> >   fail:
> >         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +       riscv_iommu_queue_free(iommu, &iommu->priq);
> > +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> > +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
> >         return ret;
> >  }
> > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > index 7dc9baa59a50..04148a2a8ffd 100644
> > --- a/drivers/iommu/riscv/iommu.h
> > +++ b/drivers/iommu/riscv/iommu.h
> > @@ -28,6 +28,24 @@
> >  #define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
> >  #define IOMMU_PAGE_SIZE_512G   BIT_ULL(39)
> >
> > +struct riscv_iommu_queue {
> > +       dma_addr_t base_dma;    /* ring buffer bus address */
> > +       void *base;             /* ring buffer pointer */
> > +       size_t len;             /* single item length */
> > +       u32 cnt;                /* items count */
> > +       u32 lui;                /* last used index, consumer/producer share */
> > +       unsigned qbr;           /* queue base register offset */
> > +       unsigned qcr;           /* queue control and status register offset */
> > +       int irq;                /* registered interrupt number */
> > +       bool in_iomem;          /* indicates queue data are in I/O memory  */
> > +};
> > +
> > +enum riscv_queue_ids {
> > +       RISCV_IOMMU_COMMAND_QUEUE       = 0,
> > +       RISCV_IOMMU_FAULT_QUEUE         = 1,
> > +       RISCV_IOMMU_PAGE_REQUEST_QUEUE  = 2
> > +};
> > +
> >  struct riscv_iommu_device {
> >         struct iommu_device iommu;      /* iommu core interface */
> >         struct device *dev;             /* iommu hardware */
> > @@ -42,6 +60,11 @@ struct riscv_iommu_device {
> >         int irq_pm;
> >         int irq_priq;
> >
> > +       /* Queue lengths */
> > +       int cmdq_len;
> > +       int fltq_len;
> > +       int priq_len;
> > +
> >         /* supported and enabled hardware capabilities */
> >         u64 cap;
> >
> > @@ -53,6 +76,11 @@ struct riscv_iommu_device {
> >         unsigned ddt_mode;
> >         bool ddtp_in_iomem;
> >
> > +       /* hardware queues */
> > +       struct riscv_iommu_queue cmdq;
> > +       struct riscv_iommu_queue fltq;
> > +       struct riscv_iommu_queue priq;
> > +
> >         /* Connected end-points */
> >         struct rb_root eps;
> >         struct mutex eps_mutex;
> > --
> > 2.34.1
> >
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv

best,
- Tomasz
Zong Li Aug. 3, 2023, 8:24 a.m. UTC | #16
On Thu, Aug 3, 2023 at 4:50 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> On Sat, Jul 29, 2023 at 5:58 AM Zong Li <zong.li@sifive.com> wrote:
> >
> > On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> > >
> > > Enables message or wire signal interrupts for PCIe and platforms devices.
> > >
> > > Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> > > Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> > > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > > ---
> > >  drivers/iommu/riscv/iommu-pci.c      |  72 ++++
> > >  drivers/iommu/riscv/iommu-platform.c |  66 +++
> > >  drivers/iommu/riscv/iommu.c          | 604 ++++++++++++++++++++++++++-
> > >  drivers/iommu/riscv/iommu.h          |  28 ++
> > >  4 files changed, 769 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
> > > index c91f963d7a29..9ea0647f7b92 100644
> > > --- a/drivers/iommu/riscv/iommu-pci.c
> > > +++ b/drivers/iommu/riscv/iommu-pci.c
> > > @@ -34,6 +34,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> > >  {
> > >         struct device *dev = &pdev->dev;
> > >         struct riscv_iommu_device *iommu;
> > > +       u64 icvec;
> > >         int ret;
> > >
> > >         ret = pci_enable_device_mem(pdev);
> > > @@ -67,14 +68,84 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> > >         iommu->dev = dev;
> > >         dev_set_drvdata(dev, iommu);
> > >
> > > +       /* Check device reported capabilities. */
> > > +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> > > +
> > > +       /* The PCI driver only uses MSIs, make sure the IOMMU supports this */
> > > +       switch (FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap)) {
> > > +       case RISCV_IOMMU_CAP_IGS_MSI:
> > > +       case RISCV_IOMMU_CAP_IGS_BOTH:
> > > +               break;
> > > +       default:
> > > +               dev_err(dev, "unable to use message-signaled interrupts\n");
> > > +               ret = -ENODEV;
> > > +               goto fail;
> > > +       }
> > > +
> > >         dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> > >         pci_set_master(pdev);
> > >
> > > +       /* Allocate and assign IRQ vectors for the various events */
> > > +       ret = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, PCI_IRQ_MSIX);
> > > +       if (ret < 0) {
> > > +               dev_err(dev, "unable to allocate irq vectors\n");
> > > +               goto fail;
> > > +       }
> > > +
> > > +       ret = -ENODEV;
> > > +
> > > +       iommu->irq_cmdq = msi_get_virq(dev, RISCV_IOMMU_INTR_CQ);
> > > +       if (!iommu->irq_cmdq) {
> > > +               dev_warn(dev, "no MSI vector %d for the command queue\n",
> > > +                        RISCV_IOMMU_INTR_CQ);
> > > +               goto fail;
> > > +       }
> > > +
> > > +       iommu->irq_fltq = msi_get_virq(dev, RISCV_IOMMU_INTR_FQ);
> > > +       if (!iommu->irq_fltq) {
> > > +               dev_warn(dev, "no MSI vector %d for the fault/event queue\n",
> > > +                        RISCV_IOMMU_INTR_FQ);
> > > +               goto fail;
> > > +       }
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> > > +               iommu->irq_pm = msi_get_virq(dev, RISCV_IOMMU_INTR_PM);
> > > +               if (!iommu->irq_pm) {
> > > +                       dev_warn(dev,
> > > +                                "no MSI vector %d for performance monitoring\n",
> > > +                                RISCV_IOMMU_INTR_PM);
> > > +                       goto fail;
> > > +               }
> > > +       }
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> > > +               iommu->irq_priq = msi_get_virq(dev, RISCV_IOMMU_INTR_PQ);
> > > +               if (!iommu->irq_priq) {
> > > +                       dev_warn(dev,
> > > +                                "no MSI vector %d for page-request queue\n",
> > > +                                RISCV_IOMMU_INTR_PQ);
> > > +                       goto fail;
> > > +               }
> > > +       }
> > > +
> > > +       /* Set simple 1:1 mapping for MSI vectors */
> > > +       icvec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
> > > +           FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ);
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM)
> > > +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM);
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS)
> > > +               icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
> > > +
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, icvec);
> > > +
> > >         ret = riscv_iommu_init(iommu);
> > >         if (!ret)
> > >                 return ret;
> > >
> > >   fail:
> > > +       pci_free_irq_vectors(pdev);
> > >         pci_clear_master(pdev);
> > >         pci_release_regions(pdev);
> > >         pci_disable_device(pdev);
> > > @@ -85,6 +156,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
> > >  static void riscv_iommu_pci_remove(struct pci_dev *pdev)
> > >  {
> > >         riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
> > > +       pci_free_irq_vectors(pdev);
> > >         pci_clear_master(pdev);
> > >         pci_release_regions(pdev);
> > >         pci_disable_device(pdev);
> > > diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
> > > index e4e8ca6711e7..35935d3c7ef4 100644
> > > --- a/drivers/iommu/riscv/iommu-platform.c
> > > +++ b/drivers/iommu/riscv/iommu-platform.c
> > > @@ -20,6 +20,8 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
> > >         struct device *dev = &pdev->dev;
> > >         struct riscv_iommu_device *iommu = NULL;
> > >         struct resource *res = NULL;
> > > +       u32 fctl = 0;
> > > +       int irq = 0;
> > >         int ret = 0;
> > >
> > >         iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
> > > @@ -53,6 +55,70 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
> > >                 goto fail;
> > >         }
> > >
> > > +       iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> > > +
> > > +       /* For now we only support WSIs until we have AIA support */
> >
> > I don't completely understand the AIA support here, because I see the PCI
> > case uses MSIs, and the kernel seems to have an AIA implementation.
> > Could you please elaborate?
> >
> > > +       ret = FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap);
> > > +       if (ret == RISCV_IOMMU_CAP_IGS_MSI) {
> > > +               dev_err(dev, "IOMMU only supports MSIs\n");
> > > +               goto fail;
> > > +       }
> > > +
> > > +       /* Parse IRQ assignment */
> > > +       irq = platform_get_irq_byname_optional(pdev, "cmdq");
> > > +       if (irq > 0)
> > > +               iommu->irq_cmdq = irq;
> > > +       else {
> > > +               dev_err(dev, "no IRQ provided for the command queue\n");
> > > +               goto fail;
> > > +       }
> > > +
> > > +       irq = platform_get_irq_byname_optional(pdev, "fltq");
> > > +       if (irq > 0)
> > > +               iommu->irq_fltq = irq;
> > > +       else {
> > > +               dev_err(dev, "no IRQ provided for the fault/event queue\n");
> > > +               goto fail;
> > > +       }
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> > > +               irq = platform_get_irq_byname_optional(pdev, "pm");
> > > +               if (irq > 0)
> > > +                       iommu->irq_pm = irq;
> > > +               else {
> > > +                       dev_err(dev, "no IRQ provided for performance monitoring\n");
> > > +                       goto fail;
> > > +               }
> > > +       }
> > > +
> > > +       if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> > > +               irq = platform_get_irq_byname_optional(pdev, "priq");
> > > +               if (irq > 0)
> > > +                       iommu->irq_priq = irq;
> > > +               else {
> > > +                       dev_err(dev, "no IRQ provided for the page-request queue\n");
> > > +                       goto fail;
> > > +               }
> > > +       }
> >
> > Should we define the "interrupt-names" in dt-bindings?
> >
>
> Yes, this was brought up earlier wrt dt-bindings.
>
> I'm considering removing the interrupt names from DT (and the get-byname
> option), as the IOMMU hardware cause-to-vector remapping (`icvec`) should
> be used to map an interrupt source to the actual interrupt vector. If
> possible, the device driver should map causes to interrupts (based on the
> number of vectors available) or rely on the ICVEC WARL properties to
> discover a fixed cause-to-vector mapping in the hardware.
>

I'm not sure if I understand correctly, but one thing we might need to
consider is the case where there are fewer vectors than interrupt
sources. For example, if the IOMMU only supports a single vector (i.e.
a single interrupt wire), `request_threaded_irq` will be called three
times with the same irq number for the command, fault and page-request
queues, which would cause a problem and fail to request an IRQ for the
other two queues. It seems we still need to handle this situation
regardless of how we determine the IRQs for each interrupt source.
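
If the per-cause interrupt names do go away, one option for the
single-vector case might be to register the IRQ only once (with the iommu
as the cookie) and demultiplex on IPSR. A rough, completely untested
sketch - it reuses the per-queue *_process() handlers from this patch,
while the combined wrappers and their names are made up:

static irqreturn_t riscv_iommu_irq_check(int irq, void *data)
{
	struct riscv_iommu_device *iommu = data;
	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);

	return ipsr ? IRQ_WAKE_THREAD : IRQ_NONE;
}

static irqreturn_t riscv_iommu_irq_process(int irq, void *data)
{
	struct riscv_iommu_device *iommu = data;
	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
	irqreturn_t ret = IRQ_NONE;

	/* Dispatch each pending cause to the matching queue handler */
	if (ipsr & RISCV_IOMMU_IPSR_CIP)
		ret = riscv_iommu_cmdq_process(irq, &iommu->cmdq);
	if (ipsr & RISCV_IOMMU_IPSR_FIP)
		ret = riscv_iommu_fltq_process(irq, &iommu->fltq);
	if (ipsr & RISCV_IOMMU_IPSR_PIP)
		ret = riscv_iommu_priq_process(irq, &iommu->priq);

	return ret;
}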

> Please let me know if this is a reasonable change.
>
> > > +
> > > +       /* Make sure fctl.WSI is set */
> > > +       fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> > > +       fctl |= RISCV_IOMMU_FCTL_WSI;
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
> > > +
> > > +       /* Parse Queue lengts */
> > > +       ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> > > +       if (!ret)
> > > +               dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> > > +
> > > +       ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> > > +       if (!ret)
> > > +               dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> > > +
> > > +       ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> > > +       if (!ret)
> > > +               dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
> > > +
> > >         dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> > >
> > >         return riscv_iommu_init(iommu);
> > > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > > index 31dc3c458e13..5c4cf9875302 100644
> > > --- a/drivers/iommu/riscv/iommu.c
> > > +++ b/drivers/iommu/riscv/iommu.c
> > > @@ -45,6 +45,18 @@ static int ddt_mode = RISCV_IOMMU_DDTP_MODE_BARE;
> > >  module_param(ddt_mode, int, 0644);
> > >  MODULE_PARM_DESC(ddt_mode, "Device Directory Table mode.");
> > >
> > > +static int cmdq_length = 1024;
> > > +module_param(cmdq_length, int, 0644);
> > > +MODULE_PARM_DESC(cmdq_length, "Command queue length.");
> > > +
> > > +static int fltq_length = 1024;
> > > +module_param(fltq_length, int, 0644);
> > > +MODULE_PARM_DESC(fltq_length, "Fault queue length.");
> > > +
> > > +static int priq_length = 1024;
> > > +module_param(priq_length, int, 0644);
> > > +MODULE_PARM_DESC(priq_length, "Page request interface queue length.");
> > > +
> > >  /* IOMMU PSCID allocation namespace. */
> > >  #define RISCV_IOMMU_MAX_PSCID  (1U << 20)
> > >  static DEFINE_IDA(riscv_iommu_pscids);
> > > @@ -65,6 +77,497 @@ static DEFINE_IDA(riscv_iommu_pscids);
> > >  static const struct iommu_domain_ops riscv_iommu_domain_ops;
> > >  static const struct iommu_ops riscv_iommu_ops;
> > >
> > > +/*
> > > + * Common queue management routines
> > > + */
> > > +
> > > +/* Note: offsets are the same for all queues */
> > > +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> > > +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> > > +
> > > +static unsigned riscv_iommu_queue_consume(struct riscv_iommu_device *iommu,
> > > +                                         struct riscv_iommu_queue *q, unsigned *ready)
> > > +{
> > > +       u32 tail = riscv_iommu_readl(iommu, Q_TAIL(q));
> > > +       *ready = q->lui;
> > > +
> > > +       BUG_ON(q->cnt <= tail);
> > > +       if (q->lui <= tail)
> > > +               return tail - q->lui;
> > > +       return q->cnt - q->lui;
> > > +}
> > > +
> > > +static void riscv_iommu_queue_release(struct riscv_iommu_device *iommu,
> > > +                                     struct riscv_iommu_queue *q, unsigned count)
> > > +{
> > > +       q->lui = (q->lui + count) & (q->cnt - 1);
> > > +       riscv_iommu_writel(iommu, Q_HEAD(q), q->lui);
> > > +}
> > > +
> > > +static u32 riscv_iommu_queue_ctrl(struct riscv_iommu_device *iommu,
> > > +                                 struct riscv_iommu_queue *q, u32 val)
> > > +{
> > > +       cycles_t end_cycles = RISCV_IOMMU_TIMEOUT + get_cycles();
> > > +
> > > +       riscv_iommu_writel(iommu, q->qcr, val);
> > > +       do {
> > > +               val = riscv_iommu_readl(iommu, q->qcr);
> > > +               if (!(val & RISCV_IOMMU_QUEUE_BUSY))
> > > +                       break;
> > > +               cpu_relax();
> > > +       } while (get_cycles() < end_cycles);
> > > +
> > > +       return val;
> > > +}
> > > +
> > > +static void riscv_iommu_queue_free(struct riscv_iommu_device *iommu,
> > > +                                  struct riscv_iommu_queue *q)
> > > +{
> > > +       size_t size = q->len * q->cnt;
> > > +
> > > +       riscv_iommu_queue_ctrl(iommu, q, 0);
> > > +
> > > +       if (q->base) {
> > > +               if (q->in_iomem)
> > > +                       iounmap(q->base);
> > > +               else
> > > +                       dmam_free_coherent(iommu->dev, size, q->base, q->base_dma);
> > > +       }
> > > +       if (q->irq)
> > > +               free_irq(q->irq, q);
> > > +}
> > > +
> > > +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> > > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> > > +
> > > +static int riscv_iommu_queue_init(struct riscv_iommu_device *iommu, int queue_id)
> > > +{
> > > +       struct device *dev = iommu->dev;
> > > +       struct riscv_iommu_queue *q = NULL;
> > > +       size_t queue_size = 0;
> > > +       irq_handler_t irq_check;
> > > +       irq_handler_t irq_process;
> > > +       const char *name;
> > > +       int count = 0;
> > > +       int irq = 0;
> > > +       unsigned order = 0;
> > > +       u64 qbr_val = 0;
> > > +       u64 qbr_readback = 0;
> > > +       u64 qbr_paddr = 0;
> > > +       int ret = 0;
> > > +
> > > +       switch (queue_id) {
> > > +       case RISCV_IOMMU_COMMAND_QUEUE:
> > > +               q = &iommu->cmdq;
> > > +               q->len = sizeof(struct riscv_iommu_command);
> > > +               count = iommu->cmdq_len;
> > > +               irq = iommu->irq_cmdq;
> > > +               irq_check = riscv_iommu_cmdq_irq_check;
> > > +               irq_process = riscv_iommu_cmdq_process;
> > > +               q->qbr = RISCV_IOMMU_REG_CQB;
> > > +               q->qcr = RISCV_IOMMU_REG_CQCSR;
> > > +               name = "cmdq";
> > > +               break;
> > > +       case RISCV_IOMMU_FAULT_QUEUE:
> > > +               q = &iommu->fltq;
> > > +               q->len = sizeof(struct riscv_iommu_fq_record);
> > > +               count = iommu->fltq_len;
> > > +               irq = iommu->irq_fltq;
> > > +               irq_check = riscv_iommu_fltq_irq_check;
> > > +               irq_process = riscv_iommu_fltq_process;
> > > +               q->qbr = RISCV_IOMMU_REG_FQB;
> > > +               q->qcr = RISCV_IOMMU_REG_FQCSR;
> > > +               name = "fltq";
> > > +               break;
> > > +       case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> > > +               q = &iommu->priq;
> > > +               q->len = sizeof(struct riscv_iommu_pq_record);
> > > +               count = iommu->priq_len;
> > > +               irq = iommu->irq_priq;
> > > +               irq_check = riscv_iommu_priq_irq_check;
> > > +               irq_process = riscv_iommu_priq_process;
> > > +               q->qbr = RISCV_IOMMU_REG_PQB;
> > > +               q->qcr = RISCV_IOMMU_REG_PQCSR;
> > > +               name = "priq";
> > > +               break;
> > > +       default:
> > > +               dev_err(dev, "invalid queue interrupt index in queue_init!\n");
> > > +               return -EINVAL;
> > > +       }
> > > +
> > > +       /* Polling not implemented */
> > > +       if (!irq)
> > > +               return -ENODEV;
> > > +
> > > +       /* Allocate queue in memory and set the base register */
> > > +       order = ilog2(count);
> > > +       do {
> > > +               queue_size = q->len * (1ULL << order);
> > > +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> > > +               if (q->base || queue_size < PAGE_SIZE)
> > > +                       break;
> > > +
> > > +               order--;
> > > +       } while (1);
> > > +
> > > +       if (!q->base) {
> > > +               dev_err(dev, "failed to allocate %s queue (cnt: %u)\n", name, count);
> > > +               return -ENOMEM;
> > > +       }
> > > +
> > > +       q->cnt = 1ULL << order;
> > > +
> > > +       qbr_val = phys_to_ppn(q->base_dma) |
> > > +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> > > +
> > > +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> > > +
> > > +       /*
> > > +        * Queue base registers are WARL, so it's possible that whatever we wrote
> > > +        * there was illegal/not supported by the hw in which case we need to make
> > > +        * sure we set a supported PPN and/or queue size.
> > > +        */
> > > +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> > > +       if (qbr_readback == qbr_val)
> > > +               goto irq;
> > > +
> > > +       dmam_free_coherent(dev, queue_size, q->base, q->base_dma);
> > > +
> > > +       /* Get supported queue size */
> > > +       order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qbr_readback) + 1;
> > > +       q->cnt = 1ULL << order;
> > > +       queue_size = q->len * q->cnt;
> > > +
> > > +       /*
> > > +        * In case we also failed to set PPN, it means the field is hardcoded and the
> > > +        * queue resides in I/O memory instead, so get its physical address and
> > > +        * ioremap it.
> > > +        */
> > > +       qbr_paddr = ppn_to_phys(qbr_readback);
> > > +       if (qbr_paddr != q->base_dma) {
> > > +               dev_info(dev,
> > > +                        "hardcoded ppn in %s base register, using io memory for the queue\n",
> > > +                        name);
> > > +               dev_info(dev, "queue length for %s set to %i\n", name, q->cnt);
> > > +               q->in_iomem = true;
> > > +               q->base = ioremap(qbr_paddr, queue_size);
> > > +               if (!q->base) {
> > > +                       dev_err(dev, "failed to map %s queue (cnt: %u)\n", name, q->cnt);
> > > +                       return -ENOMEM;
> > > +               }
> > > +               q->base_dma = qbr_paddr;
> > > +       } else {
> > > +               /*
> > > +                * We only failed to set the queue size, re-try to allocate memory with
> > > +                * the queue size supported by the hw.
> > > +                */
> > > +               dev_info(dev, "hardcoded queue size in %s base register\n", name);
> > > +               dev_info(dev, "retrying with queue length: %i\n", q->cnt);
> > > +               q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> > > +               if (!q->base) {
> > > +                       dev_err(dev, "failed to allocate %s queue (cnt: %u)\n",
> > > +                               name, q->cnt);
> > > +                       return -ENOMEM;
> > > +               }
> > > +       }
> > > +
> > > +       qbr_val = phys_to_ppn(q->base_dma) |
> > > +           FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> > > +       riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> > > +
> > > +       /* Final check to make sure hw accepted our write */
> > > +       qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> > > +       if (qbr_readback != qbr_val) {
> > > +               dev_err(dev, "failed to set base register for %s\n", name);
> > > +               goto fail;
> > > +       }
> > > +
> > > + irq:
> > > +       if (request_threaded_irq(irq, irq_check, irq_process, IRQF_ONESHOT | IRQF_SHARED,
> > > +                                dev_name(dev), q)) {
> > > +               dev_err(dev, "fail to request irq %d for %s\n", irq, name);
> > > +               goto fail;
> > > +       }
> > > +
> > > +       q->irq = irq;
> > > +
> > > +       /* Note: All RIO_xQ_EN/IE fields are in the same offsets */
> > > +       ret =
> > > +           riscv_iommu_queue_ctrl(iommu, q,
> > > +                                  RISCV_IOMMU_QUEUE_ENABLE |
> > > +                                  RISCV_IOMMU_QUEUE_INTR_ENABLE);
> > > +       if (ret & RISCV_IOMMU_QUEUE_BUSY) {
> > > +               dev_err(dev, "%s init timeout\n", name);
> > > +               ret = -EBUSY;
> > > +               goto fail;
> > > +       }
> > > +
> > > +       return 0;
> > > +
> > > + fail:
> > > +       riscv_iommu_queue_free(iommu, q);
> > > +       return 0;
> > > +}
> > > +
> > > +/*
> > > + * I/O MMU Command queue chapter 3.1
> > > + */
> > > +
> > > +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 =
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
> > > +                      RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> > > +                                                                    RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > > +                                                 u64 addr)
> > > +{
> > > +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > > +       cmd->dword1 = addr;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> > > +                                                  unsigned pscid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> > > +           RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> > > +                                                  unsigned gscid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> > > +           RISCV_IOMMU_CMD_IOTINVAL_GV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> > > +                                                 u64 addr, u32 data)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | RISCV_IOMMU_CMD_IOFENCE_AV;
> > > +       cmd->dword1 = (addr >> 2);
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> > > +                                                unsigned devid)
> > > +{
> > > +       cmd->dword0 |=
> > > +           FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | RISCV_IOMMU_CMD_IODIR_DV;
> > > +}
> > > +
> > > +/* TODO: Convert into lock-less MPSC implementation. */
> > > +static bool riscv_iommu_post_sync(struct riscv_iommu_device *iommu,
> > > +                                 struct riscv_iommu_command *cmd, bool sync)
> > > +{
> > > +       u32 head, tail, next, last;
> > > +       unsigned long flags;
> > > +
> > > +       spin_lock_irqsave(&iommu->cq_lock, flags);
> > > +       head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);
> > > +       tail = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> > > +       last = iommu->cmdq.lui;
> > > +       if (tail != last) {
> > > +               spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > > +               /*
> > > +                * FIXME: This is a workaround for dropped MMIO writes/reads on QEMU platform.
> > > +                *        While debugging of the problem is still ongoing, this provides
> > > +                *        a simple impolementation of try-again policy.
> > > +                *        Will be changed to lock-less algorithm in the feature.
> > > +                */
> > > +               dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (1st)\n", last, tail);
> > > +               spin_lock_irqsave(&iommu->cq_lock, flags);
> > > +               tail =
> > > +                   riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> > > +               last = iommu->cmdq.lui;
> > > +               if (tail != last) {
> > > +                       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > > +                       dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (2nd)\n", last, tail);
> > > +                       spin_lock_irqsave(&iommu->cq_lock, flags);
> > > +               }
> > > +       }
> > > +
> > > +       next = (last + 1) & (iommu->cmdq.cnt - 1);
> > > +       if (next != head) {
> > > +               struct riscv_iommu_command *ptr = iommu->cmdq.base;
> > > +               ptr[last] = *cmd;
> > > +               wmb();
> > > +               riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, next);
> > > +               iommu->cmdq.lui = next;
> > > +       }
> > > +
> > > +       spin_unlock_irqrestore(&iommu->cq_lock, flags);
> > > +
> > > +       if (sync && head != next) {
> > > +               cycles_t start_time = get_cycles();
> > > +               while (1) {
> > > +                       last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
> > > +                           (iommu->cmdq.cnt - 1);
> > > +                       if (head < next && last >= next)
> > > +                               break;
> > > +                       if (head > next && last < head && last >= next)
> > > +                               break;
> > > +                       if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {
> >
> > This condition will be imprecise, because this is not an irq-disabled
> > context, so the thread can be scheduled out or preempted. When we come
> > back here, more than 1 second may have elapsed even though the IOFENCE
> > has actually completed.
> >
>
> Good point. Thanks.
>
>
> > > +                               dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
> > > +                               return false;
> > > +                       }
> > > +                       cpu_relax();
> > > +               }
> > > +       }
> > > +
> > > +       return next != head;
> > > +}
> > > +
> > > +static bool riscv_iommu_post(struct riscv_iommu_device *iommu,
> > > +                            struct riscv_iommu_command *cmd)
> > > +{
> > > +       return riscv_iommu_post_sync(iommu, cmd, false);
> > > +}
> > > +
> > > +static bool riscv_iommu_iofence_sync(struct riscv_iommu_device *iommu)
> > > +{
> > > +       struct riscv_iommu_command cmd;
> > > +       riscv_iommu_cmd_iofence(&cmd);
> > > +       return riscv_iommu_post_sync(iommu, &cmd, true);
> > > +}
> > > +
> > > +/* Command queue primary interrupt handler */
> > > +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu =
> > > +           container_of(q, struct riscv_iommu_device, cmdq);
> > > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > > +       if (ipsr & RISCV_IOMMU_IPSR_CIP)
> > > +               return IRQ_WAKE_THREAD;
> > > +       return IRQ_NONE;
> > > +}
> > > +
> > > +/* Command queue interrupt hanlder thread function */
> > > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu;
> > > +       unsigned ctrl;
> > > +
> > > +       iommu = container_of(q, struct riscv_iommu_device, cmdq);
> > > +
> > > +       /* Error reporting, clear error reports if any. */
> > > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQCSR);
> > > +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF |
> > > +                   RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
> > > +               riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
> > > +               dev_warn_ratelimited(iommu->dev,
> > > +                                    "Command queue error: fault: %d tout: %d err: %d\n",
> > > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> > > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> > > +                                    !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL));
> >
> > We need to handle the error by either adjusting the tail to remove the
> > failed command or fixing the failed command itself. Otherwise, the
> > failed command will stay in the queue and the IOMMU will keep trying to
> > execute it. I guess the first option might be easier to implement.
> >
>
> Correct. Thanks for pointing this out.
> Error handling / recovery was not pushed in this series. There is a
> work-in-progress series to handle various types of failures, including
> command processing errors, DDT misconfiguration, queue overflows,
> device-reported fault handling, etc. I can bring some of the error
> handling here if needed; otherwise I'd prefer to keep it as a separate
> series, sent out once this one is merged.

It sounds good to me, thanks.
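
Just for reference, the kind of adjustment I had in mind is roughly the
following (untested, and based on my reading of the spec that the faulting
command is the one CQH points at when cqcsr reports cmd_ill/cmd_to; the
helper name is made up): overwrite the bad entry with a harmless IOFENCE.C
and then clear the error bits so the queue can make progress again.

static void riscv_iommu_cmdq_recover(struct riscv_iommu_device *iommu, u32 ctrl)
{
	struct riscv_iommu_command *ptr = iommu->cmdq.base;
	struct riscv_iommu_command cmd;
	u32 head;

	head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);

	/* Replace the offending command with a no-op style IOFENCE.C */
	riscv_iommu_cmd_iofence(&cmd);
	ptr[head] = cmd;
	wmb();	/* make the rewritten command visible before restarting */

	/* Write cqcsr back to clear cqmf/cmd_to/cmd_ill and resume */
	riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
}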

>
> > > +       }
> > > +
> > > +       /* Clear fault interrupt pending. */
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_CIP);
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > > +/*
> > > + * Fault/event queue, chapter 3.2
> > > + */
> > > +
> > > +static void riscv_iommu_fault_report(struct riscv_iommu_device *iommu,
> > > +                                    struct riscv_iommu_fq_record *event)
> > > +{
> > > +       unsigned err, devid;
> > > +
> > > +       err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> > > +       devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> > > +
> > > +       dev_warn_ratelimited(iommu->dev,
> > > +                            "Fault %d devid: %d" " iotval: %llx iotval2: %llx\n", err,
> > > +                            devid, event->iotval, event->iotval2);
> > > +}
> > > +
> > > +/* Fault/event queue primary interrupt handler */
> > > +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu =
> > > +           container_of(q, struct riscv_iommu_device, fltq);
> > > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > > +       if (ipsr & RISCV_IOMMU_IPSR_FIP)
> > > +               return IRQ_WAKE_THREAD;
> > > +       return IRQ_NONE;
> > > +}
> > > +
> > > +/* Fault queue interrupt hanlder thread function */
> > > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu;
> > > +       struct riscv_iommu_fq_record *events;
> > > +       unsigned cnt, len, idx, ctrl;
> > > +
> > > +       iommu = container_of(q, struct riscv_iommu_device, fltq);
> > > +       events = (struct riscv_iommu_fq_record *)q->base;
> > > +
> > > +       /* Error reporting, clear error reports if any. */
> > > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FQCSR);
> > > +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> > > +               riscv_iommu_queue_ctrl(iommu, &iommu->fltq, ctrl);
> > > +               dev_warn_ratelimited(iommu->dev,
> > > +                                    "Fault queue error: fault: %d full: %d\n",
> > > +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> > > +                                    !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> > > +       }
> > > +
> > > +       /* Clear fault interrupt pending. */
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_FIP);
> > > +
> > > +       /* Report fault events. */
> > > +       do {
> > > +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> > > +               if (!cnt)
> > > +                       break;
> > > +               for (len = 0; len < cnt; idx++, len++)
> > > +                       riscv_iommu_fault_report(iommu, &events[idx]);
> > > +               riscv_iommu_queue_release(iommu, q, cnt);
> > > +       } while (1);
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > > +/*
> > > + * Page request queue, chapter 3.3
> > > + */
> > > +
> > >  /*
> > >   * Register device for IOMMU tracking.
> > >   */
> > > @@ -97,6 +600,54 @@ static void riscv_iommu_add_device(struct riscv_iommu_device *iommu, struct devi
> > >         mutex_unlock(&iommu->eps_mutex);
> > >  }
> > >
> > > +/* Page request interface queue primary interrupt handler */
> > > +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu =
> > > +           container_of(q, struct riscv_iommu_device, priq);
> > > +       u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> > > +       if (ipsr & RISCV_IOMMU_IPSR_PIP)
> > > +               return IRQ_WAKE_THREAD;
> > > +       return IRQ_NONE;
> > > +}
> > > +
> > > +/* Page request interface queue interrupt hanlder thread function */
> > > +static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu;
> > > +       struct riscv_iommu_pq_record *requests;
> > > +       unsigned cnt, idx, ctrl;
> > > +
> > > +       iommu = container_of(q, struct riscv_iommu_device, priq);
> > > +       requests = (struct riscv_iommu_pq_record *)q->base;
> > > +
> > > +       /* Error reporting, clear error reports if any. */
> > > +       ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_PQCSR);
> > > +       if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
> > > +               riscv_iommu_queue_ctrl(iommu, &iommu->priq, ctrl);
> > > +               dev_warn_ratelimited(iommu->dev,
> > > +                                    "Page request queue error: fault: %d full: %d\n",
> > > +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
> > > +                                    !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
> > > +       }
> > > +
> > > +       /* Clear page request interrupt pending. */
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PIP);
> > > +
> > > +       /* Process page requests. */
> > > +       do {
> > > +               cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> > > +               if (!cnt)
> > > +                       break;
> > > +               dev_warn(iommu->dev, "unexpected %u page requests\n", cnt);
> > > +               riscv_iommu_queue_release(iommu, q, cnt);
> > > +       } while (1);
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > >  /*
> > >   * Endpoint management
> > >   */
> > > @@ -350,7 +901,29 @@ static void riscv_iommu_flush_iotlb_range(struct iommu_domain *iommu_domain,
> > >                                           unsigned long *start, unsigned long *end,
> > >                                           size_t *pgsize)
> > >  {
> > > -       /* Command interface not implemented */
> > > +       struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> > > +       struct riscv_iommu_command cmd;
> > > +       unsigned long iova;
> > > +
> > > +       if (domain->mode == RISCV_IOMMU_DC_FSC_MODE_BARE)
> > > +               return;
> > > +
> > > +       /* Domain not attached to an IOMMU! */
> > > +       BUG_ON(!domain->iommu);
> > > +
> > > +       riscv_iommu_cmd_inval_vma(&cmd);
> > > +       riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > > +
> > > +       if (start && end && pgsize) {
> > > +               /* Cover only the range that is needed */
> > > +               for (iova = *start; iova <= *end; iova += *pgsize) {
> > > +                       riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > > +                       riscv_iommu_post(domain->iommu, &cmd);
> > > +               }
> > > +       } else {
> > > +               riscv_iommu_post(domain->iommu, &cmd);
> > > +       }
> > > +       riscv_iommu_iofence_sync(domain->iommu);
> > >  }
> > >
> > >  static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
> > > @@ -610,6 +1183,9 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> > >         iommu_device_unregister(&iommu->iommu);
> > >         iommu_device_sysfs_remove(&iommu->iommu);
> > >         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > > +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
> > > +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> > > +       riscv_iommu_queue_free(iommu, &iommu->priq);
> > >  }
> > >
> > >  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > > @@ -632,6 +1208,16 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >         }
> > >  #endif
> > >
> > > +       /*
> > > +        * Assign queue lengths from module parameters if not already
> > > +        * set on the device tree.
> > > +        */
> > > +       if (!iommu->cmdq_len)
> > > +               iommu->cmdq_len = cmdq_length;
> > > +       if (!iommu->fltq_len)
> > > +               iommu->fltq_len = fltq_length;
> > > +       if (!iommu->priq_len)
> > > +               iommu->priq_len = priq_length;
> > >         /* Clear any pending interrupt flag. */
> > >         riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR,
> > >                            RISCV_IOMMU_IPSR_CIP |
> > > @@ -639,7 +1225,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >                            RISCV_IOMMU_IPSR_PMIP | RISCV_IOMMU_IPSR_PIP);
> > >         spin_lock_init(&iommu->cq_lock);
> > >         mutex_init(&iommu->eps_mutex);
> > > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_COMMAND_QUEUE);
> > > +       if (ret)
> > > +               goto fail;
> > > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_FAULT_QUEUE);
> > > +       if (ret)
> > > +               goto fail;
> > > +       if (!(iommu->cap & RISCV_IOMMU_CAP_ATS))
> > > +               goto no_ats;
> > > +
> > > +       ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_PAGE_REQUEST_QUEUE);
> > > +       if (ret)
> > > +               goto fail;
> > >
> > > + no_ats:
> > >         ret = riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_BARE);
> > >
> > >         if (ret) {
> > > @@ -663,5 +1262,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >         return 0;
> > >   fail:
> > >         riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > > +       riscv_iommu_queue_free(iommu, &iommu->priq);
> > > +       riscv_iommu_queue_free(iommu, &iommu->fltq);
> > > +       riscv_iommu_queue_free(iommu, &iommu->cmdq);
> > >         return ret;
> > >  }
> > > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > > index 7dc9baa59a50..04148a2a8ffd 100644
> > > --- a/drivers/iommu/riscv/iommu.h
> > > +++ b/drivers/iommu/riscv/iommu.h
> > > @@ -28,6 +28,24 @@
> > >  #define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
> > >  #define IOMMU_PAGE_SIZE_512G   BIT_ULL(39)
> > >
> > > +struct riscv_iommu_queue {
> > > +       dma_addr_t base_dma;    /* ring buffer bus address */
> > > +       void *base;             /* ring buffer pointer */
> > > +       size_t len;             /* single item length */
> > > +       u32 cnt;                /* items count */
> > > +       u32 lui;                /* last used index, consumer/producer share */
> > > +       unsigned qbr;           /* queue base register offset */
> > > +       unsigned qcr;           /* queue control and status register offset */
> > > +       int irq;                /* registered interrupt number */
> > > +       bool in_iomem;          /* indicates queue data are in I/O memory  */
> > > +};
> > > +
> > > +enum riscv_queue_ids {
> > > +       RISCV_IOMMU_COMMAND_QUEUE       = 0,
> > > +       RISCV_IOMMU_FAULT_QUEUE         = 1,
> > > +       RISCV_IOMMU_PAGE_REQUEST_QUEUE  = 2
> > > +};
> > > +
> > >  struct riscv_iommu_device {
> > >         struct iommu_device iommu;      /* iommu core interface */
> > >         struct device *dev;             /* iommu hardware */
> > > @@ -42,6 +60,11 @@ struct riscv_iommu_device {
> > >         int irq_pm;
> > >         int irq_priq;
> > >
> > > +       /* Queue lengths */
> > > +       int cmdq_len;
> > > +       int fltq_len;
> > > +       int priq_len;
> > > +
> > >         /* supported and enabled hardware capabilities */
> > >         u64 cap;
> > >
> > > @@ -53,6 +76,11 @@ struct riscv_iommu_device {
> > >         unsigned ddt_mode;
> > >         bool ddtp_in_iomem;
> > >
> > > +       /* hardware queues */
> > > +       struct riscv_iommu_queue cmdq;
> > > +       struct riscv_iommu_queue fltq;
> > > +       struct riscv_iommu_queue priq;
> > > +
> > >         /* Connected end-points */
> > >         struct rb_root eps;
> > >         struct mutex eps_mutex;
> > > --
> > > 2.34.1
> > >
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > linux-riscv@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
> best,
> - Tomasz
Robin Murphy Aug. 16, 2023, 6:49 p.m. UTC | #17
On 2023-07-19 20:33, Tomasz Jeznach wrote:
> Enables message or wire signal interrupts for PCIe and platforms devices.
> 
> Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>   drivers/iommu/riscv/iommu-pci.c      |  72 ++++
>   drivers/iommu/riscv/iommu-platform.c |  66 +++
>   drivers/iommu/riscv/iommu.c          | 604 ++++++++++++++++++++++++++-
>   drivers/iommu/riscv/iommu.h          |  28 ++
>   4 files changed, 769 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
> index c91f963d7a29..9ea0647f7b92 100644
> --- a/drivers/iommu/riscv/iommu-pci.c
> +++ b/drivers/iommu/riscv/iommu-pci.c
> @@ -34,6 +34,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>   {
>   	struct device *dev = &pdev->dev;
>   	struct riscv_iommu_device *iommu;
> +	u64 icvec;
>   	int ret;
>   
>   	ret = pci_enable_device_mem(pdev);
> @@ -67,14 +68,84 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>   	iommu->dev = dev;
>   	dev_set_drvdata(dev, iommu);
>   
> +	/* Check device reported capabilities. */
> +	iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> +
> +	/* The PCI driver only uses MSIs, make sure the IOMMU supports this */
> +	switch (FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap)) {
> +	case RISCV_IOMMU_CAP_IGS_MSI:
> +	case RISCV_IOMMU_CAP_IGS_BOTH:
> +		break;
> +	default:
> +		dev_err(dev, "unable to use message-signaled interrupts\n");
> +		ret = -ENODEV;
> +		goto fail;
> +	}
> +
>   	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
>   	pci_set_master(pdev);
>   
> +	/* Allocate and assign IRQ vectors for the various events */
> +	ret = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, PCI_IRQ_MSIX);
> +	if (ret < 0) {
> +		dev_err(dev, "unable to allocate irq vectors\n");
> +		goto fail;
> +	}
> +
> +	ret = -ENODEV;
> +
> +	iommu->irq_cmdq = msi_get_virq(dev, RISCV_IOMMU_INTR_CQ);
> +	if (!iommu->irq_cmdq) {
> +		dev_warn(dev, "no MSI vector %d for the command queue\n",
> +			 RISCV_IOMMU_INTR_CQ);
> +		goto fail;
> +	}
> +
> +	iommu->irq_fltq = msi_get_virq(dev, RISCV_IOMMU_INTR_FQ);
> +	if (!iommu->irq_fltq) {
> +		dev_warn(dev, "no MSI vector %d for the fault/event queue\n",
> +			 RISCV_IOMMU_INTR_FQ);
> +		goto fail;
> +	}
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> +		iommu->irq_pm = msi_get_virq(dev, RISCV_IOMMU_INTR_PM);
> +		if (!iommu->irq_pm) {
> +			dev_warn(dev,
> +				 "no MSI vector %d for performance monitoring\n",
> +				 RISCV_IOMMU_INTR_PM);
> +			goto fail;
> +		}
> +	}
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> +		iommu->irq_priq = msi_get_virq(dev, RISCV_IOMMU_INTR_PQ);
> +		if (!iommu->irq_priq) {
> +			dev_warn(dev,
> +				 "no MSI vector %d for page-request queue\n",
> +				 RISCV_IOMMU_INTR_PQ);
> +			goto fail;
> +		}
> +	}
> +
> +	/* Set simple 1:1 mapping for MSI vectors */
> +	icvec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
> +	    FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ);
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_HPM)
> +		icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM);
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_ATS)
> +		icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
> +
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, icvec);
> +
>   	ret = riscv_iommu_init(iommu);
>   	if (!ret)
>   		return ret;
>   
>    fail:
> +	pci_free_irq_vectors(pdev);
>   	pci_clear_master(pdev);
>   	pci_release_regions(pdev);
>   	pci_disable_device(pdev);
> @@ -85,6 +156,7 @@ static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
>   static void riscv_iommu_pci_remove(struct pci_dev *pdev)
>   {
>   	riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
> +	pci_free_irq_vectors(pdev);
>   	pci_clear_master(pdev);
>   	pci_release_regions(pdev);
>   	pci_disable_device(pdev);
> diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
> index e4e8ca6711e7..35935d3c7ef4 100644
> --- a/drivers/iommu/riscv/iommu-platform.c
> +++ b/drivers/iommu/riscv/iommu-platform.c
> @@ -20,6 +20,8 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
>   	struct device *dev = &pdev->dev;
>   	struct riscv_iommu_device *iommu = NULL;
>   	struct resource *res = NULL;
> +	u32 fctl = 0;
> +	int irq = 0;
>   	int ret = 0;
>   
>   	iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
> @@ -53,6 +55,70 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev)
>   		goto fail;
>   	}
>   
> +	iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
> +
> +	/* For now we only support WSIs until we have AIA support */
> +	ret = FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap);
> +	if (ret == RISCV_IOMMU_CAP_IGS_MSI) {
> +		dev_err(dev, "IOMMU only supports MSIs\n");
> +		goto fail;
> +	}
> +
> +	/* Parse IRQ assignment */
> +	irq = platform_get_irq_byname_optional(pdev, "cmdq");
> +	if (irq > 0)
> +		iommu->irq_cmdq = irq;
> +	else {
> +		dev_err(dev, "no IRQ provided for the command queue\n");
> +		goto fail;
> +	}
> +
> +	irq = platform_get_irq_byname_optional(pdev, "fltq");
> +	if (irq > 0)
> +		iommu->irq_fltq = irq;
> +	else {
> +		dev_err(dev, "no IRQ provided for the fault/event queue\n");
> +		goto fail;
> +	}
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
> +		irq = platform_get_irq_byname_optional(pdev, "pm");
> +		if (irq > 0)
> +			iommu->irq_pm = irq;
> +		else {
> +			dev_err(dev, "no IRQ provided for performance monitoring\n");
> +			goto fail;
> +		}
> +	}
> +
> +	if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
> +		irq = platform_get_irq_byname_optional(pdev, "priq");
> +		if (irq > 0)
> +			iommu->irq_priq = irq;
> +		else {
> +			dev_err(dev, "no IRQ provided for the page-request queue\n");
> +			goto fail;
> +		}
> +	}
> +
> +	/* Make sure fctl.WSI is set */
> +	fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> +	fctl |= RISCV_IOMMU_FCTL_WSI;
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
> +
> +	/* Parse Queue lengts */
> +	ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
> +	if (!ret)
> +		dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
> +
> +	ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
> +	if (!ret)
> +		dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
> +
> +	ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
> +	if (!ret)
> +		dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);

These properties are not documented in the binding, but are clearly 
Linux-specific driver policy which does not belong in DT anyway.

> +
>   	dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
>   
>   	return riscv_iommu_init(iommu);
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 31dc3c458e13..5c4cf9875302 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -45,6 +45,18 @@ static int ddt_mode = RISCV_IOMMU_DDTP_MODE_BARE;
>   module_param(ddt_mode, int, 0644);
>   MODULE_PARM_DESC(ddt_mode, "Device Directory Table mode.");
>   
> +static int cmdq_length = 1024;
> +module_param(cmdq_length, int, 0644);
> +MODULE_PARM_DESC(cmdq_length, "Command queue length.");
> +
> +static int fltq_length = 1024;
> +module_param(fltq_length, int, 0644);
> +MODULE_PARM_DESC(fltq_length, "Fault queue length.");
> +
> +static int priq_length = 1024;
> +module_param(priq_length, int, 0644);
> +MODULE_PARM_DESC(priq_length, "Page request interface queue length.");
> +
>   /* IOMMU PSCID allocation namespace. */
>   #define RISCV_IOMMU_MAX_PSCID	(1U << 20)
>   static DEFINE_IDA(riscv_iommu_pscids);
> @@ -65,6 +77,497 @@ static DEFINE_IDA(riscv_iommu_pscids);
>   static const struct iommu_domain_ops riscv_iommu_domain_ops;
>   static const struct iommu_ops riscv_iommu_ops;
>   
> +/*
> + * Common queue management routines
> + */
> +
> +/* Note: offsets are the same for all queues */
> +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> +
> +static unsigned riscv_iommu_queue_consume(struct riscv_iommu_device *iommu,
> +					  struct riscv_iommu_queue *q, unsigned *ready)
> +{
> +	u32 tail = riscv_iommu_readl(iommu, Q_TAIL(q));
> +	*ready = q->lui;
> +
> +	BUG_ON(q->cnt <= tail);
> +	if (q->lui <= tail)
> +		return tail - q->lui;
> +	return q->cnt - q->lui;
> +}
> +
> +static void riscv_iommu_queue_release(struct riscv_iommu_device *iommu,
> +				      struct riscv_iommu_queue *q, unsigned count)
> +{
> +	q->lui = (q->lui + count) & (q->cnt - 1);
> +	riscv_iommu_writel(iommu, Q_HEAD(q), q->lui);
> +}
> +
> +static u32 riscv_iommu_queue_ctrl(struct riscv_iommu_device *iommu,
> +				  struct riscv_iommu_queue *q, u32 val)
> +{
> +	cycles_t end_cycles = RISCV_IOMMU_TIMEOUT + get_cycles();
> +
> +	riscv_iommu_writel(iommu, q->qcr, val);
> +	do {
> +		val = riscv_iommu_readl(iommu, q->qcr);
> +		if (!(val & RISCV_IOMMU_QUEUE_BUSY))
> +			break;
> +		cpu_relax();
> +	} while (get_cycles() < end_cycles);
> +
> +	return val;
> +}
> +
> +static void riscv_iommu_queue_free(struct riscv_iommu_device *iommu,
> +				   struct riscv_iommu_queue *q)
> +{
> +	size_t size = q->len * q->cnt;
> +
> +	riscv_iommu_queue_ctrl(iommu, q, 0);
> +
> +	if (q->base) {
> +		if (q->in_iomem)
> +			iounmap(q->base);
> +		else
> +			dmam_free_coherent(iommu->dev, size, q->base, q->base_dma);
> +	}
> +	if (q->irq)
> +		free_irq(q->irq, q);
> +}
> +
> +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data);
> +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data);
> +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
> +
> +static int riscv_iommu_queue_init(struct riscv_iommu_device *iommu, int queue_id)
> +{
> +	struct device *dev = iommu->dev;
> +	struct riscv_iommu_queue *q = NULL;
> +	size_t queue_size = 0;
> +	irq_handler_t irq_check;
> +	irq_handler_t irq_process;
> +	const char *name;
> +	int count = 0;
> +	int irq = 0;
> +	unsigned order = 0;
> +	u64 qbr_val = 0;
> +	u64 qbr_readback = 0;
> +	u64 qbr_paddr = 0;
> +	int ret = 0;
> +
> +	switch (queue_id) {
> +	case RISCV_IOMMU_COMMAND_QUEUE:
> +		q = &iommu->cmdq;
> +		q->len = sizeof(struct riscv_iommu_command);
> +		count = iommu->cmdq_len;
> +		irq = iommu->irq_cmdq;
> +		irq_check = riscv_iommu_cmdq_irq_check;
> +		irq_process = riscv_iommu_cmdq_process;
> +		q->qbr = RISCV_IOMMU_REG_CQB;
> +		q->qcr = RISCV_IOMMU_REG_CQCSR;
> +		name = "cmdq";
> +		break;
> +	case RISCV_IOMMU_FAULT_QUEUE:
> +		q = &iommu->fltq;
> +		q->len = sizeof(struct riscv_iommu_fq_record);
> +		count = iommu->fltq_len;
> +		irq = iommu->irq_fltq;
> +		irq_check = riscv_iommu_fltq_irq_check;
> +		irq_process = riscv_iommu_fltq_process;
> +		q->qbr = RISCV_IOMMU_REG_FQB;
> +		q->qcr = RISCV_IOMMU_REG_FQCSR;
> +		name = "fltq";
> +		break;
> +	case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
> +		q = &iommu->priq;
> +		q->len = sizeof(struct riscv_iommu_pq_record);
> +		count = iommu->priq_len;
> +		irq = iommu->irq_priq;
> +		irq_check = riscv_iommu_priq_irq_check;
> +		irq_process = riscv_iommu_priq_process;
> +		q->qbr = RISCV_IOMMU_REG_PQB;
> +		q->qcr = RISCV_IOMMU_REG_PQCSR;
> +		name = "priq";
> +		break;
> +	default:
> +		dev_err(dev, "invalid queue interrupt index in queue_init!\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Polling not implemented */
> +	if (!irq)
> +		return -ENODEV;
> +
> +	/* Allocate queue in memory and set the base register */
> +	order = ilog2(count);
> +	do {
> +		queue_size = q->len * (1ULL << order);
> +		q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
> +		if (q->base || queue_size < PAGE_SIZE)
> +			break;
> +
> +		order--;
> +	} while (1);
> +
> +	if (!q->base) {
> +		dev_err(dev, "failed to allocate %s queue (cnt: %u)\n", name, count);
> +		return -ENOMEM;
> +	}
> +
> +	q->cnt = 1ULL << order;
> +
> +	qbr_val = phys_to_ppn(q->base_dma) |
> +	    FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> +
> +	riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> +
> +	/*
> +	 * Queue base registers are WARL, so it's possible that whatever we wrote
> +	 * there was illegal/not supported by the hw in which case we need to make
> +	 * sure we set a supported PPN and/or queue size.
> +	 */
> +	qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> +	if (qbr_readback == qbr_val)
> +		goto irq;
> +
> +	dmam_free_coherent(dev, queue_size, q->base, q->base_dma);
> +
> +	/* Get supported queue size */
> +	order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qbr_readback) + 1;
> +	q->cnt = 1ULL << order;
> +	queue_size = q->len * q->cnt;

Um... What? We allocate an arbitrarily-sized queue, free it again, 
*then* check what the hardware actually supports, and maybe allocate 
another queue? I can't help thinking there's a much better way...
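
e.g. write the desired LOG2SZ up front, read back what the hardware
actually accepted, and only then allocate - an untested sketch using the
names from this patch, and ignoring the fixed-PPN/io-memory case for
brevity:

	/* Probe the WARL LOG2SZ field before allocating anything */
	riscv_iommu_writeq(iommu, q->qbr,
			   FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1));
	order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD,
			  riscv_iommu_readq(iommu, q->qbr)) + 1;
	q->cnt = 1U << order;

	queue_size = q->len * q->cnt;
	q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
	if (!q->base)
		return -ENOMEM;

	/* Now set the real base address with the size the hw agreed to */
	riscv_iommu_writeq(iommu, q->qbr, phys_to_ppn(q->base_dma) |
			   FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1));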

> +
> +	/*
> +	 * In case we also failed to set PPN, it means the field is hardcoded and the
> +	 * queue resides in I/O memory instead, so get its physical address and
> +	 * ioremap it.
> +	 */
> +	qbr_paddr = ppn_to_phys(qbr_readback);
> +	if (qbr_paddr != q->base_dma) {
> +		dev_info(dev,
> +			 "hardcoded ppn in %s base register, using io memory for the queue\n",
> +			 name);
> +		dev_info(dev, "queue length for %s set to %i\n", name, q->cnt);
> +		q->in_iomem = true;
> +		q->base = ioremap(qbr_paddr, queue_size);
> +		if (!q->base) {
> +			dev_err(dev, "failed to map %s queue (cnt: %u)\n", name, q->cnt);
> +			return -ENOMEM;
> +		}
> +		q->base_dma = qbr_paddr;
> +	} else {
> +		/*
> +		 * We only failed to set the queue size, re-try to allocate memory with
> +		 * the queue size supported by the hw.
> +		 */
> +		dev_info(dev, "hardcoded queue size in %s base register\n", name);
> +		dev_info(dev, "retrying with queue length: %i\n", q->cnt);
> +		q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);

Note that dma_alloc_coherent only guarantees natural alignment here, so
if you need a minimum alignment of 4KB as the spec claims, you should
really clamp your minimum allocation size to that.
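
i.e. something like:

	queue_size = max_t(size_t, q->len * q->cnt, PAGE_SIZE);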

> +		if (!q->base) {
> +			dev_err(dev, "failed to allocate %s queue (cnt: %u)\n",
> +				name, q->cnt);
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	qbr_val = phys_to_ppn(q->base_dma) |
> +	    FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
> +	riscv_iommu_writeq(iommu, q->qbr, qbr_val);
> +
> +	/* Final check to make sure hw accepted our write */
> +	qbr_readback = riscv_iommu_readq(iommu, q->qbr);
> +	if (qbr_readback != qbr_val) {
> +		dev_err(dev, "failed to set base register for %s\n", name);
> +		goto fail;
> +	}
> +
> + irq:
> +	if (request_threaded_irq(irq, irq_check, irq_process, IRQF_ONESHOT | IRQF_SHARED,
> +				 dev_name(dev), q)) {
> +		dev_err(dev, "fail to request irq %d for %s\n", irq, name);
> +		goto fail;
> +	}
> +
> +	q->irq = irq;
> +
> +	/* Note: All RIO_xQ_EN/IE fields are in the same offsets */
> +	ret =
> +	    riscv_iommu_queue_ctrl(iommu, q,
> +				   RISCV_IOMMU_QUEUE_ENABLE |
> +				   RISCV_IOMMU_QUEUE_INTR_ENABLE);
> +	if (ret & RISCV_IOMMU_QUEUE_BUSY) {
> +		dev_err(dev, "%s init timeout\n", name);
> +		ret = -EBUSY;
> +		goto fail;
> +	}
> +
> +	return 0;
> +
> + fail:
> +	riscv_iommu_queue_free(iommu, q);
> +	return 0;
> +}
> +
> +/*
> + * I/O MMU Command queue chapter 3.1
> + */
> +
> +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> +{
> +	cmd->dword0 =
> +	    FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
> +		       RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> +								     RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);

Interesting indentation... :/

> +	cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> +						  u64 addr)
> +{
> +	cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> +	cmd->dword1 = addr;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> +						   unsigned pscid)
> +{
> +	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> +	    RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> +						   unsigned gscid)
> +{
> +	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> +	    RISCV_IOMMU_CMD_IOTINVAL_GV;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> +{
> +	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
> +	cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> +						  u64 addr, u32 data)
> +{
> +	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> +	    FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | RISCV_IOMMU_CMD_IOFENCE_AV;
> +	cmd->dword1 = (addr >> 2);
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> +{
> +	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> +	cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> +{
> +	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> +	cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> +						 unsigned devid)
> +{
> +	cmd->dword0 |=
> +	    FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | RISCV_IOMMU_CMD_IODIR_DV;
> +}
> +
> +/* TODO: Convert into lock-less MPSC implementation. */
> +static bool riscv_iommu_post_sync(struct riscv_iommu_device *iommu,
> +				  struct riscv_iommu_command *cmd, bool sync)
> +{
> +	u32 head, tail, next, last;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iommu->cq_lock, flags);
> +	head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);
> +	tail = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> +	last = iommu->cmdq.lui;
> +	if (tail != last) {
> +		spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +		/*
> +		 * FIXME: This is a workaround for dropped MMIO writes/reads on QEMU platform.
> +		 *        While debugging of the problem is still ongoing, this provides
> +		 *        a simple implementation of a try-again policy.
> +		 *        Will be changed to a lock-less algorithm in the future.
> +		 */
> +		dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (1st)\n", last, tail);
> +		spin_lock_irqsave(&iommu->cq_lock, flags);
> +		tail =
> +		    riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
> +		last = iommu->cmdq.lui;
> +		if (tail != last) {
> +			spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +			dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (2nd)\n", last, tail);
> +			spin_lock_irqsave(&iommu->cq_lock, flags);
> +		}
> +	}
> +
> +	next = (last + 1) & (iommu->cmdq.cnt - 1);
> +	if (next != head) {
> +		struct riscv_iommu_command *ptr = iommu->cmdq.base;
> +		ptr[last] = *cmd;
> +		wmb();
> +		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, next);
> +		iommu->cmdq.lui = next;
> +	}
> +
> +	spin_unlock_irqrestore(&iommu->cq_lock, flags);
> +
> +	if (sync && head != next) {
> +		cycles_t start_time = get_cycles();
> +		while (1) {
> +			last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
> +			    (iommu->cmdq.cnt - 1);
> +			if (head < next && last >= next)
> +				break;
> +			if (head > next && last < head && last >= next)
> +				break;
> +			if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {
> +				dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
> +				return false;
> +			}
> +			cpu_relax();
> +		}
> +	}
> +
> +	return next != head;
> +}
> +
> +static bool riscv_iommu_post(struct riscv_iommu_device *iommu,
> +			     struct riscv_iommu_command *cmd)
> +{
> +	return riscv_iommu_post_sync(iommu, cmd, false);
> +}
> +
> +static bool riscv_iommu_iofence_sync(struct riscv_iommu_device *iommu)
> +{
> +	struct riscv_iommu_command cmd;
> +	riscv_iommu_cmd_iofence(&cmd);
> +	return riscv_iommu_post_sync(iommu, &cmd, true);
> +}
> +
> +/* Command queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu =
> +	    container_of(q, struct riscv_iommu_device, cmdq);
> +	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +	if (ipsr & RISCV_IOMMU_IPSR_CIP)
> +		return IRQ_WAKE_THREAD;
> +	return IRQ_NONE;
> +}
> +
> +/* Command queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu;
> +	unsigned ctrl;
> +
> +	iommu = container_of(q, struct riscv_iommu_device, cmdq);
> +
> +	/* Error reporting, clear error reports if any. */
> +	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQCSR);
> +	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF |
> +		    RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
> +		riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
> +		dev_warn_ratelimited(iommu->dev,
> +				     "Command queue error: fault: %d tout: %d err: %d\n",
> +				     !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> +				     !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> +				     !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL));
> +	}
> +
> +	/* Clear fault interrupt pending. */
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_CIP);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +/*
> + * Fault/event queue, chapter 3.2
> + */
> +
> +static void riscv_iommu_fault_report(struct riscv_iommu_device *iommu,
> +				     struct riscv_iommu_fq_record *event)
> +{
> +	unsigned err, devid;
> +
> +	err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> +	devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> +
> +	dev_warn_ratelimited(iommu->dev,
> +			     "Fault %d devid: %d iotval: %llx iotval2: %llx\n", err,
> +			     devid, event->iotval, event->iotval2);
> +}
> +
> +/* Fault/event queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu =
> +	    container_of(q, struct riscv_iommu_device, fltq);
> +	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +	if (ipsr & RISCV_IOMMU_IPSR_FIP)
> +		return IRQ_WAKE_THREAD;
> +	return IRQ_NONE;
> +}
> +
> +/* Fault queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu;
> +	struct riscv_iommu_fq_record *events;
> +	unsigned cnt, len, idx, ctrl;
> +
> +	iommu = container_of(q, struct riscv_iommu_device, fltq);
> +	events = (struct riscv_iommu_fq_record *)q->base;
> +
> +	/* Error reporting, clear error reports if any. */
> +	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FQCSR);
> +	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> +		riscv_iommu_queue_ctrl(iommu, &iommu->fltq, ctrl);
> +		dev_warn_ratelimited(iommu->dev,
> +				     "Fault queue error: fault: %d full: %d\n",
> +				     !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> +				     !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> +	}
> +
> +	/* Clear fault interrupt pending. */
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_FIP);
> +
> +	/* Report fault events. */
> +	do {
> +		cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> +		if (!cnt)
> +			break;
> +		for (len = 0; len < cnt; idx++, len++)
> +			riscv_iommu_fault_report(iommu, &events[idx]);
> +		riscv_iommu_queue_release(iommu, q, cnt);
> +	} while (1);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +/*
> + * Page request queue, chapter 3.3
> + */
> +
>   /*
>    * Register device for IOMMU tracking.
>    */
> @@ -97,6 +600,54 @@ static void riscv_iommu_add_device(struct riscv_iommu_device *iommu, struct devi
>   	mutex_unlock(&iommu->eps_mutex);
>   }
>   
> +/* Page request interface queue primary interrupt handler */
> +static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu =
> +	    container_of(q, struct riscv_iommu_device, priq);
> +	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
> +	if (ipsr & RISCV_IOMMU_IPSR_PIP)
> +		return IRQ_WAKE_THREAD;
> +	return IRQ_NONE;
> +}
> +
> +/* Page request interface queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu;
> +	struct riscv_iommu_pq_record *requests;
> +	unsigned cnt, idx, ctrl;
> +
> +	iommu = container_of(q, struct riscv_iommu_device, priq);
> +	requests = (struct riscv_iommu_pq_record *)q->base;
> +
> +	/* Error reporting, clear error reports if any. */
> +	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_PQCSR);
> +	if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
> +		riscv_iommu_queue_ctrl(iommu, &iommu->priq, ctrl);
> +		dev_warn_ratelimited(iommu->dev,
> +				     "Page request queue error: fault: %d full: %d\n",
> +				     !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
> +				     !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
> +	}
> +
> +	/* Clear page request interrupt pending. */
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PIP);
> +
> +	/* Process page requests. */
> +	do {
> +		cnt = riscv_iommu_queue_consume(iommu, q, &idx);
> +		if (!cnt)
> +			break;
> +		dev_warn(iommu->dev, "unexpected %u page requests\n", cnt);
> +		riscv_iommu_queue_release(iommu, q, cnt);
> +	} while (1);
> +
> +	return IRQ_HANDLED;
> +}
> +
>   /*
>    * Endpoint management
>    */
> @@ -350,7 +901,29 @@ static void riscv_iommu_flush_iotlb_range(struct iommu_domain *iommu_domain,
>   					  unsigned long *start, unsigned long *end,
>   					  size_t *pgsize)
>   {
> -	/* Command interface not implemented */
> +	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> +	struct riscv_iommu_command cmd;
> +	unsigned long iova;
> +
> +	if (domain->mode == RISCV_IOMMU_DC_FSC_MODE_BARE)

That should probably not happen - things shouldn't be calling TLB ops on 
identity domains (and ideally your identity domains wouldn't even *have* 
iotlb callbacks...)
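
One way to get there (a rough sketch only; riscv_iommu_identity_domain_ops and
riscv_iommu_attach_dev are illustrative names, and the member layout of struct
riscv_iommu_domain is assumed) is to give identity domains their own ops table
with no iotlb callbacks, so the core never reaches the flush path for them:

	static const struct iommu_domain_ops riscv_iommu_identity_domain_ops = {
		.attach_dev = riscv_iommu_attach_dev,
		/* no .flush_iotlb_all / .iotlb_sync: nothing to invalidate in BARE mode */
	};

	static struct iommu_domain *riscv_iommu_domain_alloc(unsigned int type)
	{
		struct riscv_iommu_domain *domain;

		domain = kzalloc(sizeof(*domain), GFP_KERNEL);
		if (!domain)
			return NULL;

		/* pick the ops table based on the requested domain type */
		domain->domain.ops = (type == IOMMU_DOMAIN_IDENTITY) ?
				     &riscv_iommu_identity_domain_ops :
				     &riscv_iommu_domain_ops;
		return &domain->domain;
	}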

> +		return;
> +
> +	/* Domain not attached to an IOMMU! */
> +	BUG_ON(!domain->iommu);

However I'm not sure how iommu_create_device_direct_mappings() isn't 
hitting that?
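
In any case a recoverable check would be preferable to BUG_ON() here, e.g.
(minimal sketch, assuming it is safe to simply skip the invalidation when no
IOMMU is attached yet):

	/* Domain not (yet) attached to an IOMMU - nothing to invalidate */
	if (WARN_ON_ONCE(!domain->iommu))
		return;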

Thanks,
Robin.

> +
> +	riscv_iommu_cmd_inval_vma(&cmd);
> +	riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> +
> +	if (start && end && pgsize) {
> +		/* Cover only the range that is needed */
> +		for (iova = *start; iova <= *end; iova += *pgsize) {
> +			riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> +			riscv_iommu_post(domain->iommu, &cmd);
> +		}
> +	} else {
> +		riscv_iommu_post(domain->iommu, &cmd);
> +	}
> +	riscv_iommu_iofence_sync(domain->iommu);
>   }
>   
>   static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
> @@ -610,6 +1183,9 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>   	iommu_device_unregister(&iommu->iommu);
>   	iommu_device_sysfs_remove(&iommu->iommu);
>   	riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +	riscv_iommu_queue_free(iommu, &iommu->cmdq);
> +	riscv_iommu_queue_free(iommu, &iommu->fltq);
> +	riscv_iommu_queue_free(iommu, &iommu->priq);
>   }
>   
>   int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -632,6 +1208,16 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   	}
>   #endif
>   
> +	/*
> +	 * Assign queue lengths from module parameters if not already
> +	 * set on the device tree.
> +	 */
> +	if (!iommu->cmdq_len)
> +		iommu->cmdq_len = cmdq_length;
> +	if (!iommu->fltq_len)
> +		iommu->fltq_len = fltq_length;
> +	if (!iommu->priq_len)
> +		iommu->priq_len = priq_length;
>   	/* Clear any pending interrupt flag. */
>   	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR,
>   			   RISCV_IOMMU_IPSR_CIP |
> @@ -639,7 +1225,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   			   RISCV_IOMMU_IPSR_PMIP | RISCV_IOMMU_IPSR_PIP);
>   	spin_lock_init(&iommu->cq_lock);
>   	mutex_init(&iommu->eps_mutex);
> +	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_COMMAND_QUEUE);
> +	if (ret)
> +		goto fail;
> +	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_FAULT_QUEUE);
> +	if (ret)
> +		goto fail;
> +	if (!(iommu->cap & RISCV_IOMMU_CAP_ATS))
> +		goto no_ats;
> +
> +	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_PAGE_REQUEST_QUEUE);
> +	if (ret)
> +		goto fail;
>   
> + no_ats:
>   	ret = riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_BARE);
>   
>   	if (ret) {
> @@ -663,5 +1262,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   	return 0;
>    fail:
>   	riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +	riscv_iommu_queue_free(iommu, &iommu->priq);
> +	riscv_iommu_queue_free(iommu, &iommu->fltq);
> +	riscv_iommu_queue_free(iommu, &iommu->cmdq);
>   	return ret;
>   }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 7dc9baa59a50..04148a2a8ffd 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -28,6 +28,24 @@
>   #define IOMMU_PAGE_SIZE_1G	BIT_ULL(30)
>   #define IOMMU_PAGE_SIZE_512G	BIT_ULL(39)
>   
> +struct riscv_iommu_queue {
> +	dma_addr_t base_dma;	/* ring buffer bus address */
> +	void *base;		/* ring buffer pointer */
> +	size_t len;		/* single item length */
> +	u32 cnt;		/* items count */
> +	u32 lui;		/* last used index, consumer/producer share */
> +	unsigned qbr;		/* queue base register offset */
> +	unsigned qcr;		/* queue control and status register offset */
> +	int irq;		/* registered interrupt number */
> +	bool in_iomem;		/* indicates queue data are in I/O memory  */
> +};
> +
> +enum riscv_queue_ids {
> +	RISCV_IOMMU_COMMAND_QUEUE	= 0,
> +	RISCV_IOMMU_FAULT_QUEUE		= 1,
> +	RISCV_IOMMU_PAGE_REQUEST_QUEUE	= 2
> +};
> +
>   struct riscv_iommu_device {
>   	struct iommu_device iommu;	/* iommu core interface */
>   	struct device *dev;		/* iommu hardware */
> @@ -42,6 +60,11 @@ struct riscv_iommu_device {
>   	int irq_pm;
>   	int irq_priq;
>   
> +	/* Queue lengths */
> +	int cmdq_len;
> +	int fltq_len;
> +	int priq_len;
> +
>   	/* supported and enabled hardware capabilities */
>   	u64 cap;
>   
> @@ -53,6 +76,11 @@ struct riscv_iommu_device {
>   	unsigned ddt_mode;
>   	bool ddtp_in_iomem;
>   
> +	/* hardware queues */
> +	struct riscv_iommu_queue cmdq;
> +	struct riscv_iommu_queue fltq;
> +	struct riscv_iommu_queue priq;
> +
>   	/* Connected end-points */
>   	struct rb_root eps;
>   	struct mutex eps_mutex;
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
index c91f963d7a29..9ea0647f7b92 100644
--- a/drivers/iommu/riscv/iommu-pci.c
+++ b/drivers/iommu/riscv/iommu-pci.c
@@ -34,6 +34,7 @@  static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
 {
 	struct device *dev = &pdev->dev;
 	struct riscv_iommu_device *iommu;
+	u64 icvec;
 	int ret;
 
 	ret = pci_enable_device_mem(pdev);
@@ -67,14 +68,84 @@  static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
 	iommu->dev = dev;
 	dev_set_drvdata(dev, iommu);
 
+	/* Check device reported capabilities. */
+	iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
+
+	/* The PCI driver only uses MSIs, make sure the IOMMU supports this */
+	switch (FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap)) {
+	case RISCV_IOMMU_CAP_IGS_MSI:
+	case RISCV_IOMMU_CAP_IGS_BOTH:
+		break;
+	default:
+		dev_err(dev, "unable to use message-signaled interrupts\n");
+		ret = -ENODEV;
+		goto fail;
+	}
+
 	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
 	pci_set_master(pdev);
 
+	/* Allocate and assign IRQ vectors for the various events */
+	ret = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(dev, "unable to allocate irq vectors\n");
+		goto fail;
+	}
+
+	ret = -ENODEV;
+
+	iommu->irq_cmdq = msi_get_virq(dev, RISCV_IOMMU_INTR_CQ);
+	if (!iommu->irq_cmdq) {
+		dev_warn(dev, "no MSI vector %d for the command queue\n",
+			 RISCV_IOMMU_INTR_CQ);
+		goto fail;
+	}
+
+	iommu->irq_fltq = msi_get_virq(dev, RISCV_IOMMU_INTR_FQ);
+	if (!iommu->irq_fltq) {
+		dev_warn(dev, "no MSI vector %d for the fault/event queue\n",
+			 RISCV_IOMMU_INTR_FQ);
+		goto fail;
+	}
+
+	if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
+		iommu->irq_pm = msi_get_virq(dev, RISCV_IOMMU_INTR_PM);
+		if (!iommu->irq_pm) {
+			dev_warn(dev,
+				 "no MSI vector %d for performance monitoring\n",
+				 RISCV_IOMMU_INTR_PM);
+			goto fail;
+		}
+	}
+
+	if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
+		iommu->irq_priq = msi_get_virq(dev, RISCV_IOMMU_INTR_PQ);
+		if (!iommu->irq_priq) {
+			dev_warn(dev,
+				 "no MSI vector %d for page-request queue\n",
+				 RISCV_IOMMU_INTR_PQ);
+			goto fail;
+		}
+	}
+
+	/* Set simple 1:1 mapping for MSI vectors */
+	icvec = FIELD_PREP(RISCV_IOMMU_IVEC_CIV, RISCV_IOMMU_INTR_CQ) |
+	    FIELD_PREP(RISCV_IOMMU_IVEC_FIV, RISCV_IOMMU_INTR_FQ);
+
+	if (iommu->cap & RISCV_IOMMU_CAP_HPM)
+		icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, RISCV_IOMMU_INTR_PM);
+
+	if (iommu->cap & RISCV_IOMMU_CAP_ATS)
+		icvec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, RISCV_IOMMU_INTR_PQ);
+
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IVEC, icvec);
+
 	ret = riscv_iommu_init(iommu);
 	if (!ret)
 		return ret;
 
  fail:
+	pci_free_irq_vectors(pdev);
 	pci_clear_master(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
@@ -85,6 +156,7 @@  static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_i
 static void riscv_iommu_pci_remove(struct pci_dev *pdev)
 {
 	riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
+	pci_free_irq_vectors(pdev);
 	pci_clear_master(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
index e4e8ca6711e7..35935d3c7ef4 100644
--- a/drivers/iommu/riscv/iommu-platform.c
+++ b/drivers/iommu/riscv/iommu-platform.c
@@ -20,6 +20,8 @@  static int riscv_iommu_platform_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct riscv_iommu_device *iommu = NULL;
 	struct resource *res = NULL;
+	u32 fctl = 0;
+	int irq = 0;
 	int ret = 0;
 
 	iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
@@ -53,6 +55,70 @@  static int riscv_iommu_platform_probe(struct platform_device *pdev)
 		goto fail;
 	}
 
+	iommu->cap = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAP);
+
+	/* For now we only support WSIs until we have AIA support */
+	ret = FIELD_GET(RISCV_IOMMU_CAP_IGS, iommu->cap);
+	if (ret == RISCV_IOMMU_CAP_IGS_MSI) {
+		dev_err(dev, "IOMMU only supports MSIs\n");
+		goto fail;
+	}
+
+	/* Parse IRQ assignment */
+	irq = platform_get_irq_byname_optional(pdev, "cmdq");
+	if (irq > 0)
+		iommu->irq_cmdq = irq;
+	else {
+		dev_err(dev, "no IRQ provided for the command queue\n");
+		goto fail;
+	}
+
+	irq = platform_get_irq_byname_optional(pdev, "fltq");
+	if (irq > 0)
+		iommu->irq_fltq = irq;
+	else {
+		dev_err(dev, "no IRQ provided for the fault/event queue\n");
+		goto fail;
+	}
+
+	if (iommu->cap & RISCV_IOMMU_CAP_HPM) {
+		irq = platform_get_irq_byname_optional(pdev, "pm");
+		if (irq > 0)
+			iommu->irq_pm = irq;
+		else {
+			dev_err(dev, "no IRQ provided for performance monitoring\n");
+			goto fail;
+		}
+	}
+
+	if (iommu->cap & RISCV_IOMMU_CAP_ATS) {
+		irq = platform_get_irq_byname_optional(pdev, "priq");
+		if (irq > 0)
+			iommu->irq_priq = irq;
+		else {
+			dev_err(dev, "no IRQ provided for the page-request queue\n");
+			goto fail;
+		}
+	}
+
+	/* Make sure fctl.WSI is set */
+	fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+	fctl |= RISCV_IOMMU_FCTL_WSI;
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, fctl);
+
+	/* Parse queue lengths */
+	ret = of_property_read_u32(pdev->dev.of_node, "cmdq_len", &iommu->cmdq_len);
+	if (!ret)
+		dev_info(dev, "command queue length set to %i\n", iommu->cmdq_len);
+
+	ret = of_property_read_u32(pdev->dev.of_node, "fltq_len", &iommu->fltq_len);
+	if (!ret)
+		dev_info(dev, "fault/event queue length set to %i\n", iommu->fltq_len);
+
+	ret = of_property_read_u32(pdev->dev.of_node, "priq_len", &iommu->priq_len);
+	if (!ret)
+		dev_info(dev, "page request queue length set to %i\n", iommu->priq_len);
+
 	dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
 
 	return riscv_iommu_init(iommu);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 31dc3c458e13..5c4cf9875302 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -45,6 +45,18 @@  static int ddt_mode = RISCV_IOMMU_DDTP_MODE_BARE;
 module_param(ddt_mode, int, 0644);
 MODULE_PARM_DESC(ddt_mode, "Device Directory Table mode.");
 
+static int cmdq_length = 1024;
+module_param(cmdq_length, int, 0644);
+MODULE_PARM_DESC(cmdq_length, "Command queue length.");
+
+static int fltq_length = 1024;
+module_param(fltq_length, int, 0644);
+MODULE_PARM_DESC(fltq_length, "Fault queue length.");
+
+static int priq_length = 1024;
+module_param(priq_length, int, 0644);
+MODULE_PARM_DESC(priq_length, "Page request interface queue length.");
+
 /* IOMMU PSCID allocation namespace. */
 #define RISCV_IOMMU_MAX_PSCID	(1U << 20)
 static DEFINE_IDA(riscv_iommu_pscids);
@@ -65,6 +77,497 @@  static DEFINE_IDA(riscv_iommu_pscids);
 static const struct iommu_domain_ops riscv_iommu_domain_ops;
 static const struct iommu_ops riscv_iommu_ops;
 
+/*
+ * Common queue management routines
+ */
+
+/* Note: offsets are the same for all queues */
+#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
+#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
+
+static unsigned riscv_iommu_queue_consume(struct riscv_iommu_device *iommu,
+					  struct riscv_iommu_queue *q, unsigned *ready)
+{
+	u32 tail = riscv_iommu_readl(iommu, Q_TAIL(q));
+	*ready = q->lui;
+
+	BUG_ON(q->cnt <= tail);
+	if (q->lui <= tail)
+		return tail - q->lui;
+	return q->cnt - q->lui;
+}
+
+static void riscv_iommu_queue_release(struct riscv_iommu_device *iommu,
+				      struct riscv_iommu_queue *q, unsigned count)
+{
+	q->lui = (q->lui + count) & (q->cnt - 1);
+	riscv_iommu_writel(iommu, Q_HEAD(q), q->lui);
+}
+
+static u32 riscv_iommu_queue_ctrl(struct riscv_iommu_device *iommu,
+				  struct riscv_iommu_queue *q, u32 val)
+{
+	cycles_t end_cycles = RISCV_IOMMU_TIMEOUT + get_cycles();
+
+	riscv_iommu_writel(iommu, q->qcr, val);
+	do {
+		val = riscv_iommu_readl(iommu, q->qcr);
+		if (!(val & RISCV_IOMMU_QUEUE_BUSY))
+			break;
+		cpu_relax();
+	} while (get_cycles() < end_cycles);
+
+	return val;
+}
+
+static void riscv_iommu_queue_free(struct riscv_iommu_device *iommu,
+				   struct riscv_iommu_queue *q)
+{
+	size_t size = q->len * q->cnt;
+
+	riscv_iommu_queue_ctrl(iommu, q, 0);
+
+	if (q->base) {
+		if (q->in_iomem)
+			iounmap(q->base);
+		else
+			dmam_free_coherent(iommu->dev, size, q->base, q->base_dma);
+	}
+	if (q->irq)
+		free_irq(q->irq, q);
+}
+
+static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data);
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data);
+static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data);
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data);
+static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data);
+static irqreturn_t riscv_iommu_priq_process(int irq, void *data);
+
+static int riscv_iommu_queue_init(struct riscv_iommu_device *iommu, int queue_id)
+{
+	struct device *dev = iommu->dev;
+	struct riscv_iommu_queue *q = NULL;
+	size_t queue_size = 0;
+	irq_handler_t irq_check;
+	irq_handler_t irq_process;
+	const char *name;
+	int count = 0;
+	int irq = 0;
+	unsigned order = 0;
+	u64 qbr_val = 0;
+	u64 qbr_readback = 0;
+	u64 qbr_paddr = 0;
+	int ret = 0;
+
+	switch (queue_id) {
+	case RISCV_IOMMU_COMMAND_QUEUE:
+		q = &iommu->cmdq;
+		q->len = sizeof(struct riscv_iommu_command);
+		count = iommu->cmdq_len;
+		irq = iommu->irq_cmdq;
+		irq_check = riscv_iommu_cmdq_irq_check;
+		irq_process = riscv_iommu_cmdq_process;
+		q->qbr = RISCV_IOMMU_REG_CQB;
+		q->qcr = RISCV_IOMMU_REG_CQCSR;
+		name = "cmdq";
+		break;
+	case RISCV_IOMMU_FAULT_QUEUE:
+		q = &iommu->fltq;
+		q->len = sizeof(struct riscv_iommu_fq_record);
+		count = iommu->fltq_len;
+		irq = iommu->irq_fltq;
+		irq_check = riscv_iommu_fltq_irq_check;
+		irq_process = riscv_iommu_fltq_process;
+		q->qbr = RISCV_IOMMU_REG_FQB;
+		q->qcr = RISCV_IOMMU_REG_FQCSR;
+		name = "fltq";
+		break;
+	case RISCV_IOMMU_PAGE_REQUEST_QUEUE:
+		q = &iommu->priq;
+		q->len = sizeof(struct riscv_iommu_pq_record);
+		count = iommu->priq_len;
+		irq = iommu->irq_priq;
+		irq_check = riscv_iommu_priq_irq_check;
+		irq_process = riscv_iommu_priq_process;
+		q->qbr = RISCV_IOMMU_REG_PQB;
+		q->qcr = RISCV_IOMMU_REG_PQCSR;
+		name = "priq";
+		break;
+	default:
+		dev_err(dev, "invalid queue interrupt index in queue_init!\n");
+		return -EINVAL;
+	}
+
+	/* Polling not implemented */
+	if (!irq)
+		return -ENODEV;
+
+	/* Allocate queue in memory and set the base register */
+	order = ilog2(count);
+	do {
+		queue_size = q->len * (1ULL << order);
+		q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
+		if (q->base || queue_size < PAGE_SIZE)
+			break;
+
+		order--;
+	} while (1);
+
+	if (!q->base) {
+		dev_err(dev, "failed to allocate %s queue (cnt: %u)\n", name, count);
+		return -ENOMEM;
+	}
+
+	q->cnt = 1ULL << order;
+
+	qbr_val = phys_to_ppn(q->base_dma) |
+	    FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
+
+	riscv_iommu_writeq(iommu, q->qbr, qbr_val);
+
+	/*
+	 * Queue base registers are WARL, so it's possible that whatever we wrote
+	 * there was illegal/not supported by the hw in which case we need to make
+	 * sure we set a supported PPN and/or queue size.
+	 */
+	qbr_readback = riscv_iommu_readq(iommu, q->qbr);
+	if (qbr_readback == qbr_val)
+		goto irq;
+
+	dmam_free_coherent(dev, queue_size, q->base, q->base_dma);
+
+	/* Get supported queue size */
+	order = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qbr_readback) + 1;
+	q->cnt = 1ULL << order;
+	queue_size = q->len * q->cnt;
+
+	/*
+	 * In case we also failed to set PPN, it means the field is hardcoded and the
+	 * queue resides in I/O memory instead, so get its physical address and
+	 * ioremap it.
+	 */
+	qbr_paddr = ppn_to_phys(qbr_readback);
+	if (qbr_paddr != q->base_dma) {
+		dev_info(dev,
+			 "hardcoded ppn in %s base register, using io memory for the queue\n",
+			 name);
+		dev_info(dev, "queue length for %s set to %i\n", name, q->cnt);
+		q->in_iomem = true;
+		q->base = ioremap(qbr_paddr, queue_size);
+		if (!q->base) {
+			dev_err(dev, "failed to map %s queue (cnt: %u)\n", name, q->cnt);
+			return -ENOMEM;
+		}
+		q->base_dma = qbr_paddr;
+	} else {
+		/*
+		 * We only failed to set the queue size, re-try to allocate memory with
+		 * the queue size supported by the hw.
+		 */
+		dev_info(dev, "hardcoded queue size in %s base register\n", name);
+		dev_info(dev, "retrying with queue length: %i\n", q->cnt);
+		q->base = dmam_alloc_coherent(dev, queue_size, &q->base_dma, GFP_KERNEL);
+		if (!q->base) {
+			dev_err(dev, "failed to allocate %s queue (cnt: %u)\n",
+				name, q->cnt);
+			return -ENOMEM;
+		}
+	}
+
+	qbr_val = phys_to_ppn(q->base_dma) |
+	    FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, order - 1);
+	riscv_iommu_writeq(iommu, q->qbr, qbr_val);
+
+	/* Final check to make sure hw accepted our write */
+	qbr_readback = riscv_iommu_readq(iommu, q->qbr);
+	if (qbr_readback != qbr_val) {
+		dev_err(dev, "failed to set base register for %s\n", name);
+		goto fail;
+	}
+
+ irq:
+	if (request_threaded_irq(irq, irq_check, irq_process, IRQF_ONESHOT | IRQF_SHARED,
+				 dev_name(dev), q)) {
+		dev_err(dev, "fail to request irq %d for %s\n", irq, name);
+		goto fail;
+	}
+
+	q->irq = irq;
+
+	/* Note: All RIO_xQ_EN/IE fields are in the same offsets */
+	ret =
+	    riscv_iommu_queue_ctrl(iommu, q,
+				   RISCV_IOMMU_QUEUE_ENABLE |
+				   RISCV_IOMMU_QUEUE_INTR_ENABLE);
+	if (ret & RISCV_IOMMU_QUEUE_BUSY) {
+		dev_err(dev, "%s init timeout\n", name);
+		ret = -EBUSY;
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	riscv_iommu_queue_free(iommu, q);
+	return 0;
+}
+
+/*
+ * I/O MMU Command queue chapter 3.1
+ */
+
+static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 =
+	    FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
+		       RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
+								     RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
+						  u64 addr)
+{
+	cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
+	cmd->dword1 = addr;
+}
+
+static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
+						   unsigned pscid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
+	    RISCV_IOMMU_CMD_IOTINVAL_PSCV;
+}
+
+static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
+						   unsigned gscid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
+	    RISCV_IOMMU_CMD_IOTINVAL_GV;
+}
+
+static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
+						  u64 addr, u32 data)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
+	    FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | RISCV_IOMMU_CMD_IOFENCE_AV;
+	cmd->dword1 = (addr >> 2);
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+	    FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
+						 unsigned devid)
+{
+	cmd->dword0 |=
+	    FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | RISCV_IOMMU_CMD_IODIR_DV;
+}
+
+/* TODO: Convert into lock-less MPSC implementation. */
+static bool riscv_iommu_post_sync(struct riscv_iommu_device *iommu,
+				  struct riscv_iommu_command *cmd, bool sync)
+{
+	u32 head, tail, next, last;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->cq_lock, flags);
+	head = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) & (iommu->cmdq.cnt - 1);
+	tail = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
+	last = iommu->cmdq.lui;
+	if (tail != last) {
+		spin_unlock_irqrestore(&iommu->cq_lock, flags);
+		/*
+		 * FIXME: This is a workaround for dropped MMIO writes/reads on QEMU platform.
+		 *        While debugging of the problem is still ongoing, this provides
+		 *        a simple implementation of a try-again policy.
+		 *        Will be changed to a lock-less algorithm in the future.
+		 */
+		dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (1st)\n", last, tail);
+		spin_lock_irqsave(&iommu->cq_lock, flags);
+		tail =
+		    riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQT) & (iommu->cmdq.cnt - 1);
+		last = iommu->cmdq.lui;
+		if (tail != last) {
+			spin_unlock_irqrestore(&iommu->cq_lock, flags);
+			dev_dbg(iommu->dev, "IOMMU CQT: %x != %x (2nd)\n", last, tail);
+			spin_lock_irqsave(&iommu->cq_lock, flags);
+		}
+	}
+
+	next = (last + 1) & (iommu->cmdq.cnt - 1);
+	if (next != head) {
+		struct riscv_iommu_command *ptr = iommu->cmdq.base;
+		ptr[last] = *cmd;
+		wmb();
+		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQT, next);
+		iommu->cmdq.lui = next;
+	}
+
+	spin_unlock_irqrestore(&iommu->cq_lock, flags);
+
+	if (sync && head != next) {
+		cycles_t start_time = get_cycles();
+		while (1) {
+			last = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQH) &
+			    (iommu->cmdq.cnt - 1);
+			if (head < next && last >= next)
+				break;
+			if (head > next && last < head && last >= next)
+				break;
+			if (RISCV_IOMMU_TIMEOUT < (get_cycles() - start_time)) {
+				dev_err(iommu->dev, "IOFENCE TIMEOUT\n");
+				return false;
+			}
+			cpu_relax();
+		}
+	}
+
+	return next != head;
+}
+
+static bool riscv_iommu_post(struct riscv_iommu_device *iommu,
+			     struct riscv_iommu_command *cmd)
+{
+	return riscv_iommu_post_sync(iommu, cmd, false);
+}
+
+static bool riscv_iommu_iofence_sync(struct riscv_iommu_device *iommu)
+{
+	struct riscv_iommu_command cmd;
+	riscv_iommu_cmd_iofence(&cmd);
+	return riscv_iommu_post_sync(iommu, &cmd, true);
+}
+
+/* Command queue primary interrupt handler */
+static irqreturn_t riscv_iommu_cmdq_irq_check(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu =
+	    container_of(q, struct riscv_iommu_device, cmdq);
+	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
+	if (ipsr & RISCV_IOMMU_IPSR_CIP)
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
+}
+
+/* Command queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu;
+	unsigned ctrl;
+
+	iommu = container_of(q, struct riscv_iommu_device, cmdq);
+
+	/* Error reporting, clear error reports if any. */
+	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_CQCSR);
+	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF |
+		    RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL)) {
+		riscv_iommu_queue_ctrl(iommu, &iommu->cmdq, ctrl);
+		dev_warn_ratelimited(iommu->dev,
+				     "Command queue error: fault: %d tout: %d err: %d\n",
+				     !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
+				     !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
+				     !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL));
+	}
+
+	/* Clear fault interrupt pending. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_CIP);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Fault/event queue, chapter 3.2
+ */
+
+static void riscv_iommu_fault_report(struct riscv_iommu_device *iommu,
+				     struct riscv_iommu_fq_record *event)
+{
+	unsigned err, devid;
+
+	err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+	devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
+
+	dev_warn_ratelimited(iommu->dev,
+			     "Fault %d devid: %d iotval: %llx iotval2: %llx\n", err,
+			     devid, event->iotval, event->iotval2);
+}
+
+/* Fault/event queue primary interrupt handler */
+static irqreturn_t riscv_iommu_fltq_irq_check(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu =
+	    container_of(q, struct riscv_iommu_device, fltq);
+	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
+	if (ipsr & RISCV_IOMMU_IPSR_FIP)
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
+}
+
+/* Fault queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu;
+	struct riscv_iommu_fq_record *events;
+	unsigned cnt, len, idx, ctrl;
+
+	iommu = container_of(q, struct riscv_iommu_device, fltq);
+	events = (struct riscv_iommu_fq_record *)q->base;
+
+	/* Error reporting, clear error reports if any. */
+	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FQCSR);
+	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
+		riscv_iommu_queue_ctrl(iommu, &iommu->fltq, ctrl);
+		dev_warn_ratelimited(iommu->dev,
+				     "Fault queue error: fault: %d full: %d\n",
+				     !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
+				     !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
+	}
+
+	/* Clear fault interrupt pending. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_FIP);
+
+	/* Report fault events. */
+	do {
+		cnt = riscv_iommu_queue_consume(iommu, q, &idx);
+		if (!cnt)
+			break;
+		for (len = 0; len < cnt; idx++, len++)
+			riscv_iommu_fault_report(iommu, &events[idx]);
+		riscv_iommu_queue_release(iommu, q, cnt);
+	} while (1);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Page request queue, chapter 3.3
+ */
+
 /*
  * Register device for IOMMU tracking.
  */
@@ -97,6 +600,54 @@  static void riscv_iommu_add_device(struct riscv_iommu_device *iommu, struct devi
 	mutex_unlock(&iommu->eps_mutex);
 }
 
+/* Page request interface queue primary interrupt handler */
+static irqreturn_t riscv_iommu_priq_irq_check(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu =
+	    container_of(q, struct riscv_iommu_device, priq);
+	u32 ipsr = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_IPSR);
+	if (ipsr & RISCV_IOMMU_IPSR_PIP)
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
+}
+
+/* Page request interface queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu;
+	struct riscv_iommu_pq_record *requests;
+	unsigned cnt, idx, ctrl;
+
+	iommu = container_of(q, struct riscv_iommu_device, priq);
+	requests = (struct riscv_iommu_pq_record *)q->base;
+
+	/* Error reporting, clear error reports if any. */
+	ctrl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_PQCSR);
+	if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
+		riscv_iommu_queue_ctrl(iommu, &iommu->priq, ctrl);
+		dev_warn_ratelimited(iommu->dev,
+				     "Page request queue error: fault: %d full: %d\n",
+				     !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
+				     !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
+	}
+
+	/* Clear page request interrupt pending. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PIP);
+
+	/* Process page requests. */
+	do {
+		cnt = riscv_iommu_queue_consume(iommu, q, &idx);
+		if (!cnt)
+			break;
+		dev_warn(iommu->dev, "unexpected %u page requests\n", cnt);
+		riscv_iommu_queue_release(iommu, q, cnt);
+	} while (1);
+
+	return IRQ_HANDLED;
+}
+
 /*
  * Endpoint management
  */
@@ -350,7 +901,29 @@  static void riscv_iommu_flush_iotlb_range(struct iommu_domain *iommu_domain,
 					  unsigned long *start, unsigned long *end,
 					  size_t *pgsize)
 {
-	/* Command interface not implemented */
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	struct riscv_iommu_command cmd;
+	unsigned long iova;
+
+	if (domain->mode == RISCV_IOMMU_DC_FSC_MODE_BARE)
+		return;
+
+	/* Domain not attached to an IOMMU! */
+	BUG_ON(!domain->iommu);
+
+	riscv_iommu_cmd_inval_vma(&cmd);
+	riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+
+	if (start && end && pgsize) {
+		/* Cover only the range that is needed */
+		for (iova = *start; iova <= *end; iova += *pgsize) {
+			riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+			riscv_iommu_post(domain->iommu, &cmd);
+		}
+	} else {
+		riscv_iommu_post(domain->iommu, &cmd);
+	}
+	riscv_iommu_iofence_sync(domain->iommu);
 }
 
 static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
@@ -610,6 +1183,9 @@  void riscv_iommu_remove(struct riscv_iommu_device *iommu)
 	iommu_device_unregister(&iommu->iommu);
 	iommu_device_sysfs_remove(&iommu->iommu);
 	riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
+	riscv_iommu_queue_free(iommu, &iommu->cmdq);
+	riscv_iommu_queue_free(iommu, &iommu->fltq);
+	riscv_iommu_queue_free(iommu, &iommu->priq);
 }
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu)
@@ -632,6 +1208,16 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	}
 #endif
 
+	/*
+	 * Assign queue lengths from module parameters if not already
+	 * set on the device tree.
+	 */
+	if (!iommu->cmdq_len)
+		iommu->cmdq_len = cmdq_length;
+	if (!iommu->fltq_len)
+		iommu->fltq_len = fltq_length;
+	if (!iommu->priq_len)
+		iommu->priq_len = priq_length;
 	/* Clear any pending interrupt flag. */
 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR,
 			   RISCV_IOMMU_IPSR_CIP |
@@ -639,7 +1225,20 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 			   RISCV_IOMMU_IPSR_PMIP | RISCV_IOMMU_IPSR_PIP);
 	spin_lock_init(&iommu->cq_lock);
 	mutex_init(&iommu->eps_mutex);
+	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_COMMAND_QUEUE);
+	if (ret)
+		goto fail;
+	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_FAULT_QUEUE);
+	if (ret)
+		goto fail;
+	if (!(iommu->cap & RISCV_IOMMU_CAP_ATS))
+		goto no_ats;
+
+	ret = riscv_iommu_queue_init(iommu, RISCV_IOMMU_PAGE_REQUEST_QUEUE);
+	if (ret)
+		goto fail;
 
+ no_ats:
 	ret = riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_BARE);
 
 	if (ret) {
@@ -663,5 +1262,8 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	return 0;
  fail:
 	riscv_iommu_enable(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
+	riscv_iommu_queue_free(iommu, &iommu->priq);
+	riscv_iommu_queue_free(iommu, &iommu->fltq);
+	riscv_iommu_queue_free(iommu, &iommu->cmdq);
 	return ret;
 }
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 7dc9baa59a50..04148a2a8ffd 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -28,6 +28,24 @@ 
 #define IOMMU_PAGE_SIZE_1G	BIT_ULL(30)
 #define IOMMU_PAGE_SIZE_512G	BIT_ULL(39)
 
+struct riscv_iommu_queue {
+	dma_addr_t base_dma;	/* ring buffer bus address */
+	void *base;		/* ring buffer pointer */
+	size_t len;		/* single item length */
+	u32 cnt;		/* items count */
+	u32 lui;		/* last used index, consumer/producer share */
+	unsigned qbr;		/* queue base register offset */
+	unsigned qcr;		/* queue control and status register offset */
+	int irq;		/* registered interrupt number */
+	bool in_iomem;		/* indicates queue data are in I/O memory  */
+};
+
+enum riscv_queue_ids {
+	RISCV_IOMMU_COMMAND_QUEUE	= 0,
+	RISCV_IOMMU_FAULT_QUEUE		= 1,
+	RISCV_IOMMU_PAGE_REQUEST_QUEUE	= 2
+};
+
 struct riscv_iommu_device {
 	struct iommu_device iommu;	/* iommu core interface */
 	struct device *dev;		/* iommu hardware */
@@ -42,6 +60,11 @@  struct riscv_iommu_device {
 	int irq_pm;
 	int irq_priq;
 
+	/* Queue lengths */
+	int cmdq_len;
+	int fltq_len;
+	int priq_len;
+
 	/* supported and enabled hardware capabilities */
 	u64 cap;
 
@@ -53,6 +76,11 @@  struct riscv_iommu_device {
 	unsigned ddt_mode;
 	bool ddtp_in_iomem;
 
+	/* hardware queues */
+	struct riscv_iommu_queue cmdq;
+	struct riscv_iommu_queue fltq;
+	struct riscv_iommu_queue priq;
+
 	/* Connected end-points */
 	struct rb_root eps;
 	struct mutex eps_mutex;