diff mbox series

[v4,6/7] iommu/riscv: Command and fault queue support

Message ID f63635cd72fd0d1b0026a6839dbfeaa2cd154eaa.1714752293.git.tjeznach@rivosinc.com (mailing list archive)
State Superseded
Headers show
Series Linux RISC-V IOMMU Support | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-6-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-6-test-2 fail .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-6-test-3 fail .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-6-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-6-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-6-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-6-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-6-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-6-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-6-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-6-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-6-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Tomasz Jeznach May 3, 2024, 4:12 p.m. UTC
Introduce device command submission and fault reporting queues,
as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture
Specification.

Command and fault queues are instantiated in contiguous system memory
local to IOMMU device domain, or mapped from fixed I/O space provided
by the hardware implementation. Detection of the location and maximum
allowed size of the queue utilize WARL properties of queue base control
register. Driver implementation will try to allocate up to 128KB of
system memory, while respecting hardware supported maximum queue size.

Interrupts allocation is based on interrupt vectors availability and
distributed to all queues in simple round-robin fashion. For hardware
Implementation with fixed event type to interrupt vector assignment
IVEC WARL property is used to discover such mappings.

Address translation, command and queue fault handling in this change
is limited to simple fault reporting without taking any action.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/iommu-bits.h |  75 +++++
 drivers/iommu/riscv/iommu.c      | 496 ++++++++++++++++++++++++++++++-
 drivers/iommu/riscv/iommu.h      |  21 ++
 3 files changed, 590 insertions(+), 2 deletions(-)

Comments

Zong Li May 8, 2024, 3:38 p.m. UTC | #1
On Sat, May 4, 2024 at 12:13 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> Introduce device command submission and fault reporting queues,
> as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture
> Specification.
>
> Command and fault queues are instantiated in contiguous system memory
> local to IOMMU device domain, or mapped from fixed I/O space provided
> by the hardware implementation. Detection of the location and maximum
> allowed size of the queue utilize WARL properties of queue base control
> register. Driver implementation will try to allocate up to 128KB of
> system memory, while respecting hardware supported maximum queue size.
>
> Interrupts allocation is based on interrupt vectors availability and
> distributed to all queues in simple round-robin fashion. For hardware
> Implementation with fixed event type to interrupt vector assignment
> IVEC WARL property is used to discover such mappings.
>
> Address translation, command and queue fault handling in this change
> is limited to simple fault reporting without taking any action.
>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  drivers/iommu/riscv/iommu-bits.h |  75 +++++
>  drivers/iommu/riscv/iommu.c      | 496 ++++++++++++++++++++++++++++++-
>  drivers/iommu/riscv/iommu.h      |  21 ++
>  3 files changed, 590 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> index ba093c29de9f..40c379222821 100644
> --- a/drivers/iommu/riscv/iommu-bits.h
> +++ b/drivers/iommu/riscv/iommu-bits.h
> @@ -704,4 +704,79 @@ struct riscv_iommu_msi_pte {
>  #define RISCV_IOMMU_MSI_MRIF_NPPN      RISCV_IOMMU_PPN_FIELD
>  #define RISCV_IOMMU_MSI_MRIF_NID_MSB   BIT_ULL(60)
>
> +/* Helper functions: command structure builders. */
> +
> +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> +                                                 u64 addr)
> +{
> +       cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
> +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> +                                                  int pscid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> +                      RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> +}
> +
> +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> +                                                  int gscid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> +                      RISCV_IOMMU_CMD_IOTINVAL_GV;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> +                     RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> +                                                 u64 addr, u32 data)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
> +                     RISCV_IOMMU_CMD_IOFENCE_AV;
> +       cmd->dword1 = addr >> 2;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> +{
> +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> +       cmd->dword1 = 0;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> +                                                unsigned int devid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
> +                      RISCV_IOMMU_CMD_IODIR_DV;
> +}
> +
> +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
> +                                                unsigned int pasid)
> +{
> +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
> +}
> +
>  #endif /* _RISCV_IOMMU_BITS_H_ */
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 71b7903d83d4..4349ac8a3990 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -25,7 +25,14 @@
>  #include "iommu.h"
>
>  /* Timeouts in [us] */
> -#define RISCV_IOMMU_DDTP_TIMEOUT       50000
> +#define RISCV_IOMMU_QCSR_TIMEOUT       150000
> +#define RISCV_IOMMU_QUEUE_TIMEOUT      150000
> +#define RISCV_IOMMU_DDTP_TIMEOUT       10000000
> +#define RISCV_IOMMU_IOTINVAL_TIMEOUT   90000000
> +
> +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
> +#define RISCV_IOMMU_DEF_CQ_COUNT       8192
> +#define RISCV_IOMMU_DEF_FQ_COUNT       4096
>
>  /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
>  #define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> @@ -89,6 +96,432 @@ static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
>                        riscv_iommu_devres_pages_match, &devres);
>  }
>
> +/*
> + * Hardware queue allocation and management.
> + */
> +
> +/* Setup queue base, control registers and default queue length */
> +#define RISCV_IOMMU_QUEUE_INIT(q, name) do {                           \
> +       struct riscv_iommu_queue *_q = q;                               \
> +       _q->qid = RISCV_IOMMU_INTR_ ## name;                            \
> +       _q->qbr = RISCV_IOMMU_REG_ ## name ## B;                        \
> +       _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;                      \
> +       _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
> +} while (0)
> +
> +/* Note: offsets are the same for all queues */
> +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> +#define Q_ITEM(q, index) ((q)->mask & (index))
> +#define Q_IPSR(q) BIT((q)->qid)
> +
> +/*
> + * Discover queue ring buffer hardware configuration, allocate in-memory
> + * ring buffer or use fixed I/O memory location, configure queue base register.
> + * Must be called before hardware queue is enabled.
> + *
> + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
> + * @entry_size - queue single element size in bytes.
> + */
> +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
> +                                  struct riscv_iommu_queue *queue,
> +                                  size_t entry_size)
> +{
> +       unsigned int logsz;
> +       u64 qb, rb;
> +
> +       /*
> +        * Use WARL base register property to discover maximum allowed
> +        * number of entries and optional fixed IO address for queue location.
> +        */
> +       riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOGSZ_FIELD);
> +       qb = riscv_iommu_readq(iommu, queue->qbr);
> +
> +       /*
> +        * Calculate and verify hardware supported queue length, as reported
> +        * by the field LOGSZ, where max queue length is equal to 2^(LOGSZ + 1).
> +        * Update queue size based on hardware supported value.
> +        */
> +       logsz = ilog2(queue->mask);
> +       if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb))
> +               logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb);
> +
> +       /*
> +        * Use WARL base register property to discover an optional fixed IO
> +        * address for queue ring buffer location. Otherwise allocate contigus
> +        * system memory.
> +        */
> +       if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
> +               const size_t queue_size = entry_size << (logsz + 1);
> +
> +               queue->phys = ppn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
> +               queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
> +       } else {
> +               do {
> +                       const size_t queue_size = entry_size << (logsz + 1);
> +                       const int order = get_order(queue_size);
> +
> +                       queue->base = riscv_iommu_get_pages(iommu, order);
> +                       queue->phys = __pa(queue->base);
> +               } while (!queue->base && logsz-- > 0);
> +       }
> +
> +       if (!queue->base)
> +               return -ENOMEM;
> +
> +       qb = phys_to_ppn(queue->phys) |
> +            FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, logsz);
> +
> +       /* Update base register and read back to verify hw accepted our write */
> +       riscv_iommu_writeq(iommu, queue->qbr, qb);
> +       rb = riscv_iommu_readq(iommu, queue->qbr);
> +       if (rb != qb) {
> +               dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
> +               return -ENODEV;
> +       }
> +
> +       /* Update actual queue mask */
> +       queue->mask = (2U << logsz) - 1;
> +
> +       dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
> +               queue->qid, logsz + 1);
> +
> +       return 0;
> +}
> +
> +/* Check interrupt queue status, IPSR */
> +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> +
> +       if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
> +               return IRQ_WAKE_THREAD;
> +
> +       return IRQ_NONE;
> +}
> +
> +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
> +{
> +       /* Reuse IVEC.CIV mask for all interrupt vectors mapping. */
> +       return (iommu->ivec >> (n * 4)) & RISCV_IOMMU_IVEC_CIV;
> +}
> +
> +/*
> + * Enable queue processing in the hardware, register interrupt handler.
> + *
> + * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
> + * @irq_handler - threaded interrupt handler.
> + */
> +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
> +                                   struct riscv_iommu_queue *queue,
> +                                   irq_handler_t irq_handler)
> +{
> +       const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
> +       u32 csr;
> +       int rc;
> +
> +       if (queue->iommu)
> +               return -EBUSY;
> +
> +       /* Polling not implemented */
> +       if (!irq)
> +               return -ENODEV;
> +
> +       queue->iommu = iommu;
> +       rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
> +                                 IRQF_ONESHOT | IRQF_SHARED,
> +                                 dev_name(iommu->dev), queue);
> +       if (rc) {
> +               queue->iommu = NULL;
> +               return rc;
> +       }
> +
> +       /*
> +        * Enable queue with interrupts, clear any memory fault if any.
> +        * Wait for the hardware to acknowledge request and activate queue
> +        * processing.
> +        * Note: All CSR bitfields are in the same offsets for all queues.
> +        */
> +       riscv_iommu_writel(iommu, queue->qcr,
> +                          RISCV_IOMMU_QUEUE_ENABLE |
> +                          RISCV_IOMMU_QUEUE_INTR_ENABLE |
> +                          RISCV_IOMMU_QUEUE_MEM_FAULT);
> +
> +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> +
> +       if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
> +                                               RISCV_IOMMU_QUEUE_BUSY |
> +                                               RISCV_IOMMU_QUEUE_MEM_FAULT))) {
> +               /* Best effort to stop and disable failing hardware queue. */
> +               riscv_iommu_writel(iommu, queue->qcr, 0);
> +               free_irq(irq, queue);
> +               queue->iommu = NULL;
> +               dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
> +               return -EBUSY;
> +       }
> +
> +       /* Clear any pending interrupt flag. */
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> +
> +       return 0;
> +}
> +
> +/*
> + * Disable queue. Wait for the hardware to acknowledge request and
> + * stop processing enqueued requests. Report errors but continue.
> + */
> +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
> +{
> +       struct riscv_iommu_device *iommu = queue->iommu;
> +       u32 csr;
> +
> +       if (!iommu)
> +               return;
> +
> +       free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
> +       riscv_iommu_writel(iommu, queue->qcr, 0);
> +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> +
> +       if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
> +               dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
> +                       queue->qid, csr);
> +
> +       queue->iommu = NULL;
> +}
> +
> +/*
> + * Returns number of available valid queue entries and the first item index.
> + * Update shadow producer index if necessary.
> + */
> +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
> +                                    unsigned int *index)
> +{
> +       unsigned int head = atomic_read(&queue->head);
> +       unsigned int tail = atomic_read(&queue->tail);
> +       unsigned int last = Q_ITEM(queue, tail);
> +       int available = (int)(tail - head);
> +
> +       *index = head;
> +
> +       if (available > 0)
> +               return available;
> +
> +       /* read hardware producer index, check reserved register bits are not set. */
> +       if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
> +                                     tail, (tail & ~queue->mask) == 0,
> +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
> +               dev_err_once(queue->iommu->dev,
> +                            "Hardware error: queue access timeout\n");
> +               return 0;
> +       }
> +
> +       if (tail == last)
> +               return 0;
> +
> +       /* update shadow producer index */
> +       return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
> +}
> +
> +/*
> + * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
> + */
> +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
> +{
> +       const unsigned int head = atomic_add_return(count, &queue->head);
> +
> +       riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
> +}
> +
> +/* Return actual consumer index based on hardware reported queue head index. */
> +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
> +{
> +       const unsigned int cons = atomic_read(&queue->head);
> +       const unsigned int last = Q_ITEM(queue, cons);
> +       unsigned int head;
> +
> +       if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> +                                     !(head & ~queue->mask),
> +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT))
> +               return cons;
> +
> +       return cons + ((head - last) & queue->mask);
> +}
> +
> +/* Wait for submitted item to be processed. */
> +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
> +                                 unsigned int index,
> +                                 unsigned int timeout_us)
> +{
> +       unsigned int cons = atomic_read(&queue->head);
> +
> +       /* Already processed by the consumer */
> +       if ((int)(cons - index) > 0)
> +               return 0;
> +
> +       /* Monitor consumer index */
> +       return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
> +                                (int)(cons - index) > 0, 0, timeout_us);
> +}

Apart from iofence command, it seems to me that we might not be able
to use the head pointer to determine whether the command is executed,
because spec mentioned that the IOMMU advancing cqh is not a guarantee
that the commands fetched by the IOMMU have been executed or
committed. For iofence completion, perhaps using ADD and AV to write
memory would be better as well.

> +
> +/* Enqueue an entry and wait to be processed if timeout_us > 0
> + *
> + * Error handling for IOMMU hardware not responding in reasonable time
> + * will be added as separate patch series along with other RAS features.
> + * For now, only report hardware failure and continue.
> + */
> +static void riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
> +                                  void *entry, size_t entry_size,
> +                                  unsigned int timeout_us)
> +{
> +       unsigned int prod;
> +       unsigned int head;
> +       unsigned int tail;
> +       unsigned long flags;
> +
> +       /* Do not preempt submission flow. */
> +       local_irq_save(flags);
> +
> +       /* 1. Allocate some space in the queue */
> +       prod = atomic_inc_return(&queue->prod) - 1;
> +       head = atomic_read(&queue->head);
> +
> +       /* 2. Wait for space availability. */
> +       if ((prod - head) > queue->mask) {
> +               if (readx_poll_timeout(atomic_read, &queue->head,
> +                                      head, (prod - head) < queue->mask,
> +                                      0, RISCV_IOMMU_QUEUE_TIMEOUT))
> +                       goto err_busy;
> +       } else if ((prod - head) == queue->mask) {
> +               const unsigned int last = Q_ITEM(queue, head);
> +
> +               if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> +                                             !(head & ~queue->mask) && head != last,
> +                                             0, RISCV_IOMMU_QUEUE_TIMEOUT))
> +                       goto err_busy;
> +               atomic_add((head - last) & queue->mask, &queue->head);
> +       }
> +
> +       /* 3. Store entry in the ring buffer. */
> +       memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
> +
> +       /* 4. Wait for all previous entries to be ready */
> +       if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
> +                              0, RISCV_IOMMU_QUEUE_TIMEOUT))
> +               goto err_busy;
> +
> +       /* 5. Complete submission and restore local interrupts */
> +       dma_wmb();
> +       riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
> +       atomic_inc(&queue->tail);
> +       local_irq_restore(flags);
> +
> +       if (timeout_us && riscv_iommu_queue_wait(queue, prod, timeout_us))
> +               dev_err_once(queue->iommu->dev,
> +                            "Hardware error: command execution timeout\n");
> +
> +       return;
> +
> +err_busy:
> +       local_irq_restore(flags);
> +       dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
> +}
> +
> +/*
> + * IOMMU Command queue chapter 3.1
> + */
> +
> +/* Command queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> +{
> +       const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> +       unsigned int ctrl;
> +
> +       /* Clear MF/CQ errors, complete error recovery to be implemented. */
> +       ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
> +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
> +               riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);

For RISCV_IOMMU_CQCSR_CMD_ILL and RISCV_IOMMU_CQCSR_CMD_TO, I think we
need to adjust the head/tail pointer, otherwise, IOMMU might keep
trying to execute the problematic command.

> +               dev_warn(queue->iommu->dev,
> +                        "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",

It might be a bit weird if we view fence_w_ip as queue error and print
the message as error?


> +                        queue->qid,
> +                        !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
> +                        !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
> +       }
> +
> +       /* Placeholder for command queue interrupt notifiers */
> +
> +       /* Clear command interrupt pending. */
> +       riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> +
> +       return IRQ_HANDLED;
> +}
> +
> +/* Send command to the IOMMU command queue */
> +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
> +                                struct riscv_iommu_command *cmd,
> +                                unsigned int timeout_us)
> +{
> +       riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd), timeout_us);
> +}
> +
> +/*
> + * IOMMU Fault/Event queue chapter 3.2
> + */
> +
> +static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
> +                             struct riscv_iommu_fq_record *event)
> +{
> +       unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> +       unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> +
> +       /* Placeholder for future fault handling implementation, report only. */
> +       if (err)
> +               dev_warn_ratelimited(iommu->dev,
> +                                    "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
> +                                    err, devid, event->iotval, event->iotval2);
> +}
> +
> +/* Fault queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> +{
> +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> +       struct riscv_iommu_device *iommu = queue->iommu;
> +       struct riscv_iommu_fq_record *events;
> +       unsigned int ctrl, idx;
> +       int cnt, len;
> +
> +       events = (struct riscv_iommu_fq_record *)queue->base;
> +
> +       /* Clear fault interrupt pending and process all received fault events. */
> +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> +
> +       do {
> +               cnt = riscv_iommu_queue_consume(queue, &idx);
> +               for (len = 0; len < cnt; idx++, len++)
> +                       riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
> +               riscv_iommu_queue_release(queue, cnt);
> +       } while (cnt > 0);
> +
> +       /* Clear MF/OF errors, complete error recovery to be implemented. */
> +       ctrl = riscv_iommu_readl(iommu, queue->qcr);
> +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> +               riscv_iommu_writel(iommu, queue->qcr, ctrl);
> +               dev_warn(iommu->dev,
> +                        "Queue #%u error; memory fault:%d overflow:%d\n",
> +                        queue->qid,
> +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> +       }
> +
> +       return IRQ_HANDLED;
> +}
> +
>  /* Lookup and initialize device context info structure. */
>  static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
>                                                  unsigned int devid)
> @@ -250,6 +683,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>         struct device *dev = iommu->dev;
>         u64 ddtp, rq_ddtp;
>         unsigned int mode, rq_mode = ddtp_mode;
> +       struct riscv_iommu_command cmd;
>
>         ddtp = riscv_iommu_read_ddtp(iommu);
>         if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> @@ -317,6 +751,18 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>         if (mode != ddtp_mode)
>                 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
>
> +       /* Invalidate device context cache */
> +       riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> +
> +       /* Invalidate address translation cache */
> +       riscv_iommu_cmd_inval_vma(&cmd);
> +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> +
> +       /* IOFENCE.C */
> +       riscv_iommu_cmd_iofence(&cmd);
> +       riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> +
>         return 0;
>  }
>
> @@ -492,6 +938,26 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
>                         return -EINVAL;
>         }
>
> +       /*
> +        * Distribute interrupt vectors, always use first vector for CIV.
> +        * At least one interrupt is required. Read back and verify.
> +        */
> +       if (!iommu->irqs_count)
> +               return -EINVAL;
> +
> +       iommu->ivec = 0;
> +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_FIV, 1 % iommu->irqs_count);
> +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, 2 % iommu->irqs_count);
> +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, 3 % iommu->irqs_count);
> +       riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_IVEC, iommu->ivec);
> +
> +       iommu->ivec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_IVEC);
> +       if (riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_CIV) >= RISCV_IOMMU_INTR_COUNT ||
> +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_FIV) >= RISCV_IOMMU_INTR_COUNT ||
> +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PIV) >= RISCV_IOMMU_INTR_COUNT ||
> +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PMIV) >= RISCV_IOMMU_INTR_COUNT)
> +               return -EINVAL;
> +
>         return 0;
>  }
>
> @@ -500,12 +966,17 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>         iommu_device_unregister(&iommu->iommu);
>         iommu_device_sysfs_remove(&iommu->iommu);
>         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +       riscv_iommu_queue_disable(&iommu->cmdq);
> +       riscv_iommu_queue_disable(&iommu->fltq);
>  }
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu)
>  {
>         int rc;
>
> +       RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
> +       RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
> +
>         rc = riscv_iommu_init_check(iommu);
>         if (rc)
>                 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> @@ -514,10 +985,28 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>         if (rc)
>                 return rc;
>
> -       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +       rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
> +                                    sizeof(struct riscv_iommu_command));
> +       if (rc)
> +               return rc;
> +
> +       rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
> +                                    sizeof(struct riscv_iommu_fq_record));
> +       if (rc)
> +               return rc;
> +
> +       rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
>         if (rc)
>                 return rc;
>
> +       rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
> +       if (rc)
> +               goto err_queue_disable;
> +
> +       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +       if (rc)
> +               goto err_queue_disable;
> +
>         rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
>                                     dev_name(iommu->dev));
>         if (rc) {
> @@ -537,5 +1026,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>         iommu_device_sysfs_remove(&iommu->iommu);
>  err_iodir_off:
>         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +err_queue_disable:
> +       riscv_iommu_queue_disable(&iommu->fltq);
> +       riscv_iommu_queue_disable(&iommu->cmdq);
>         return rc;
>  }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index f1696926582c..03e0c45bc7e1 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -17,6 +17,22 @@
>
>  #include "iommu-bits.h"
>
> +struct riscv_iommu_device;
> +
> +struct riscv_iommu_queue {
> +       atomic_t prod;                          /* unbounded producer allocation index */
> +       atomic_t head;                          /* unbounded shadow ring buffer consumer index */
> +       atomic_t tail;                          /* unbounded shadow ring buffer producer index */
> +       unsigned int mask;                      /* index mask, queue length - 1 */
> +       unsigned int irq;                       /* allocated interrupt number */
> +       struct riscv_iommu_device *iommu;       /* iommu device handling the queue when active */
> +       void *base;                             /* ring buffer kernel pointer */
> +       dma_addr_t phys;                        /* ring buffer physical address */
> +       u16 qbr;                                /* base register offset, head and tail reference */
> +       u16 qcr;                                /* control and status register offset */
> +       u8 qid;                                 /* queue identifier, same as RISCV_IOMMU_INTR_XX */
> +};
> +
>  struct riscv_iommu_device {
>         /* iommu core interface */
>         struct iommu_device iommu;
> @@ -34,6 +50,11 @@ struct riscv_iommu_device {
>         /* available interrupt numbers, MSI or WSI */
>         unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
>         unsigned int irqs_count;
> +       unsigned int ivec;
> +
> +       /* hardware queues */
> +       struct riscv_iommu_queue cmdq;
> +       struct riscv_iommu_queue fltq;
>
>         /* device directory */
>         unsigned int ddt_mode;
> --
> 2.34.1
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Tomasz Jeznach May 8, 2024, 4:03 p.m. UTC | #2
On Wed, May 8, 2024 at 8:39 AM Zong Li <zong.li@sifive.com> wrote:
>
> On Sat, May 4, 2024 at 12:13 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >
> > Introduce device command submission and fault reporting queues,
> > as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture
> > Specification.
> >
> > Command and fault queues are instantiated in contiguous system memory
> > local to IOMMU device domain, or mapped from fixed I/O space provided
> > by the hardware implementation. Detection of the location and maximum
> > allowed size of the queue utilize WARL properties of queue base control
> > register. Driver implementation will try to allocate up to 128KB of
> > system memory, while respecting hardware supported maximum queue size.
> >
> > Interrupts allocation is based on interrupt vectors availability and
> > distributed to all queues in simple round-robin fashion. For hardware
> > Implementation with fixed event type to interrupt vector assignment
> > IVEC WARL property is used to discover such mappings.
> >
> > Address translation, command and queue fault handling in this change
> > is limited to simple fault reporting without taking any action.
> >
> > Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > ---
> >  drivers/iommu/riscv/iommu-bits.h |  75 +++++
> >  drivers/iommu/riscv/iommu.c      | 496 ++++++++++++++++++++++++++++++-
> >  drivers/iommu/riscv/iommu.h      |  21 ++
> >  3 files changed, 590 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> > index ba093c29de9f..40c379222821 100644
> > --- a/drivers/iommu/riscv/iommu-bits.h
> > +++ b/drivers/iommu/riscv/iommu-bits.h
> > @@ -704,4 +704,79 @@ struct riscv_iommu_msi_pte {
> >  #define RISCV_IOMMU_MSI_MRIF_NPPN      RISCV_IOMMU_PPN_FIELD
> >  #define RISCV_IOMMU_MSI_MRIF_NID_MSB   BIT_ULL(60)
> >
> > +/* Helper functions: command structure builders. */
> > +
> > +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > +                                                 u64 addr)
> > +{
> > +       cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
> > +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> > +                                                  int pscid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> > +                      RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> > +                                                  int gscid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> > +                      RISCV_IOMMU_CMD_IOTINVAL_GV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > +                     RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> > +                                                 u64 addr, u32 data)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
> > +                     RISCV_IOMMU_CMD_IOFENCE_AV;
> > +       cmd->dword1 = addr >> 2;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> > +{
> > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> > +       cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> > +                                                unsigned int devid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
> > +                      RISCV_IOMMU_CMD_IODIR_DV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
> > +                                                unsigned int pasid)
> > +{
> > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
> > +}
> > +
> >  #endif /* _RISCV_IOMMU_BITS_H_ */
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index 71b7903d83d4..4349ac8a3990 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -25,7 +25,14 @@
> >  #include "iommu.h"
> >
> >  /* Timeouts in [us] */
> > -#define RISCV_IOMMU_DDTP_TIMEOUT       50000
> > +#define RISCV_IOMMU_QCSR_TIMEOUT       150000
> > +#define RISCV_IOMMU_QUEUE_TIMEOUT      150000
> > +#define RISCV_IOMMU_DDTP_TIMEOUT       10000000
> > +#define RISCV_IOMMU_IOTINVAL_TIMEOUT   90000000
> > +
> > +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
> > +#define RISCV_IOMMU_DEF_CQ_COUNT       8192
> > +#define RISCV_IOMMU_DEF_FQ_COUNT       4096
> >
> >  /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> >  #define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> > @@ -89,6 +96,432 @@ static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
> >                        riscv_iommu_devres_pages_match, &devres);
> >  }
> >
> > +/*
> > + * Hardware queue allocation and management.
> > + */
> > +
> > +/* Setup queue base, control registers and default queue length */
> > +#define RISCV_IOMMU_QUEUE_INIT(q, name) do {                           \
> > +       struct riscv_iommu_queue *_q = q;                               \
> > +       _q->qid = RISCV_IOMMU_INTR_ ## name;                            \
> > +       _q->qbr = RISCV_IOMMU_REG_ ## name ## B;                        \
> > +       _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;                      \
> > +       _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
> > +} while (0)
> > +
> > +/* Note: offsets are the same for all queues */
> > +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> > +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> > +#define Q_ITEM(q, index) ((q)->mask & (index))
> > +#define Q_IPSR(q) BIT((q)->qid)
> > +
> > +/*
> > + * Discover queue ring buffer hardware configuration, allocate in-memory
> > + * ring buffer or use fixed I/O memory location, configure queue base register.
> > + * Must be called before hardware queue is enabled.
> > + *
> > + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
> > + * @entry_size - queue single element size in bytes.
> > + */
> > +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
> > +                                  struct riscv_iommu_queue *queue,
> > +                                  size_t entry_size)
> > +{
> > +       unsigned int logsz;
> > +       u64 qb, rb;
> > +
> > +       /*
> > +        * Use WARL base register property to discover maximum allowed
> > +        * number of entries and optional fixed IO address for queue location.
> > +        */
> > +       riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOGSZ_FIELD);
> > +       qb = riscv_iommu_readq(iommu, queue->qbr);
> > +
> > +       /*
> > +        * Calculate and verify hardware supported queue length, as reported
> > +        * by the field LOGSZ, where max queue length is equal to 2^(LOGSZ + 1).
> > +        * Update queue size based on hardware supported value.
> > +        */
> > +       logsz = ilog2(queue->mask);
> > +       if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb))
> > +               logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb);
> > +
> > +       /*
> > +        * Use WARL base register property to discover an optional fixed IO
> > +        * address for queue ring buffer location. Otherwise allocate contigus
> > +        * system memory.
> > +        */
> > +       if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
> > +               const size_t queue_size = entry_size << (logsz + 1);
> > +
> > +               queue->phys = ppn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
> > +               queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
> > +       } else {
> > +               do {
> > +                       const size_t queue_size = entry_size << (logsz + 1);
> > +                       const int order = get_order(queue_size);
> > +
> > +                       queue->base = riscv_iommu_get_pages(iommu, order);
> > +                       queue->phys = __pa(queue->base);
> > +               } while (!queue->base && logsz-- > 0);
> > +       }
> > +
> > +       if (!queue->base)
> > +               return -ENOMEM;
> > +
> > +       qb = phys_to_ppn(queue->phys) |
> > +            FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, logsz);
> > +
> > +       /* Update base register and read back to verify hw accepted our write */
> > +       riscv_iommu_writeq(iommu, queue->qbr, qb);
> > +       rb = riscv_iommu_readq(iommu, queue->qbr);
> > +       if (rb != qb) {
> > +               dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
> > +               return -ENODEV;
> > +       }
> > +
> > +       /* Update actual queue mask */
> > +       queue->mask = (2U << logsz) - 1;
> > +
> > +       dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
> > +               queue->qid, logsz + 1);
> > +
> > +       return 0;
> > +}
> > +
> > +/* Check interrupt queue status, IPSR */
> > +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > +
> > +       if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
> > +               return IRQ_WAKE_THREAD;
> > +
> > +       return IRQ_NONE;
> > +}
> > +
> > +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
> > +{
> > +       /* Reuse IVEC.CIV mask for all interrupt vectors mapping. */
> > +       return (iommu->ivec >> (n * 4)) & RISCV_IOMMU_IVEC_CIV;
> > +}
> > +
> > +/*
> > + * Enable queue processing in the hardware, register interrupt handler.
> > + *
> > + * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
> > + * @irq_handler - threaded interrupt handler.
> > + */
> > +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
> > +                                   struct riscv_iommu_queue *queue,
> > +                                   irq_handler_t irq_handler)
> > +{
> > +       const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
> > +       u32 csr;
> > +       int rc;
> > +
> > +       if (queue->iommu)
> > +               return -EBUSY;
> > +
> > +       /* Polling not implemented */
> > +       if (!irq)
> > +               return -ENODEV;
> > +
> > +       queue->iommu = iommu;
> > +       rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
> > +                                 IRQF_ONESHOT | IRQF_SHARED,
> > +                                 dev_name(iommu->dev), queue);
> > +       if (rc) {
> > +               queue->iommu = NULL;
> > +               return rc;
> > +       }
> > +
> > +       /*
> > +        * Enable queue with interrupts, clear any memory fault if any.
> > +        * Wait for the hardware to acknowledge request and activate queue
> > +        * processing.
> > +        * Note: All CSR bitfields are in the same offsets for all queues.
> > +        */
> > +       riscv_iommu_writel(iommu, queue->qcr,
> > +                          RISCV_IOMMU_QUEUE_ENABLE |
> > +                          RISCV_IOMMU_QUEUE_INTR_ENABLE |
> > +                          RISCV_IOMMU_QUEUE_MEM_FAULT);
> > +
> > +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> > +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > +
> > +       if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
> > +                                               RISCV_IOMMU_QUEUE_BUSY |
> > +                                               RISCV_IOMMU_QUEUE_MEM_FAULT))) {
> > +               /* Best effort to stop and disable failing hardware queue. */
> > +               riscv_iommu_writel(iommu, queue->qcr, 0);
> > +               free_irq(irq, queue);
> > +               queue->iommu = NULL;
> > +               dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
> > +               return -EBUSY;
> > +       }
> > +
> > +       /* Clear any pending interrupt flag. */
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > +       return 0;
> > +}
> > +
> > +/*
> > + * Disable queue. Wait for the hardware to acknowledge request and
> > + * stop processing enqueued requests. Report errors but continue.
> > + */
> > +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
> > +{
> > +       struct riscv_iommu_device *iommu = queue->iommu;
> > +       u32 csr;
> > +
> > +       if (!iommu)
> > +               return;
> > +
> > +       free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
> > +       riscv_iommu_writel(iommu, queue->qcr, 0);
> > +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> > +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > +
> > +       if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
> > +               dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
> > +                       queue->qid, csr);
> > +
> > +       queue->iommu = NULL;
> > +}
> > +
> > +/*
> > + * Returns number of available valid queue entries and the first item index.
> > + * Update shadow producer index if necessary.
> > + */
> > +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
> > +                                    unsigned int *index)
> > +{
> > +       unsigned int head = atomic_read(&queue->head);
> > +       unsigned int tail = atomic_read(&queue->tail);
> > +       unsigned int last = Q_ITEM(queue, tail);
> > +       int available = (int)(tail - head);
> > +
> > +       *index = head;
> > +
> > +       if (available > 0)
> > +               return available;
> > +
> > +       /* read hardware producer index, check reserved register bits are not set. */
> > +       if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
> > +                                     tail, (tail & ~queue->mask) == 0,
> > +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
> > +               dev_err_once(queue->iommu->dev,
> > +                            "Hardware error: queue access timeout\n");
> > +               return 0;
> > +       }
> > +
> > +       if (tail == last)
> > +               return 0;
> > +
> > +       /* update shadow producer index */
> > +       return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
> > +}
> > +
> > +/*
> > + * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
> > + */
> > +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
> > +{
> > +       const unsigned int head = atomic_add_return(count, &queue->head);
> > +
> > +       riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
> > +}
> > +
> > +/* Return actual consumer index based on hardware reported queue head index. */
> > +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
> > +{
> > +       const unsigned int cons = atomic_read(&queue->head);
> > +       const unsigned int last = Q_ITEM(queue, cons);
> > +       unsigned int head;
> > +
> > +       if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > +                                     !(head & ~queue->mask),
> > +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > +               return cons;
> > +
> > +       return cons + ((head - last) & queue->mask);
> > +}
> > +
> > +/* Wait for submitted item to be processed. */
> > +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
> > +                                 unsigned int index,
> > +                                 unsigned int timeout_us)
> > +{
> > +       unsigned int cons = atomic_read(&queue->head);
> > +
> > +       /* Already processed by the consumer */
> > +       if ((int)(cons - index) > 0)
> > +               return 0;
> > +
> > +       /* Monitor consumer index */
> > +       return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
> > +                                (int)(cons - index) > 0, 0, timeout_us);
> > +}
>
> Apart from iofence command, it seems to me that we might not be able
> to use the head pointer to determine whether the command is executed,
> because spec mentioned that the IOMMU advancing cqh is not a guarantee
> that the commands fetched by the IOMMU have been executed or
> committed. For iofence completion, perhaps using ADD and AV to write
> memory would be better as well.
>

I'll update the comment to the function that this is *only* allowed to
track IOFENCE.C completion. Call to riscv_iommu_queue_wait() is used
only for this purpose (unless specification will be extended to track
other commands with head/tail updates).

Previous version of the command tracking used address/AV notifier, but
after several iterations and testing we've considered tracking
head/tail as best approach.

> > +
> > +/* Enqueue an entry and wait to be processed if timeout_us > 0
> > + *
> > + * Error handling for IOMMU hardware not responding in reasonable time
> > + * will be added as separate patch series along with other RAS features.
> > + * For now, only report hardware failure and continue.
> > + */
> > +static void riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
> > +                                  void *entry, size_t entry_size,
> > +                                  unsigned int timeout_us)
> > +{
> > +       unsigned int prod;
> > +       unsigned int head;
> > +       unsigned int tail;
> > +       unsigned long flags;
> > +
> > +       /* Do not preempt submission flow. */
> > +       local_irq_save(flags);
> > +
> > +       /* 1. Allocate some space in the queue */
> > +       prod = atomic_inc_return(&queue->prod) - 1;
> > +       head = atomic_read(&queue->head);
> > +
> > +       /* 2. Wait for space availability. */
> > +       if ((prod - head) > queue->mask) {
> > +               if (readx_poll_timeout(atomic_read, &queue->head,
> > +                                      head, (prod - head) < queue->mask,
> > +                                      0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > +                       goto err_busy;
> > +       } else if ((prod - head) == queue->mask) {
> > +               const unsigned int last = Q_ITEM(queue, head);
> > +
> > +               if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > +                                             !(head & ~queue->mask) && head != last,
> > +                                             0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > +                       goto err_busy;
> > +               atomic_add((head - last) & queue->mask, &queue->head);
> > +       }
> > +
> > +       /* 3. Store entry in the ring buffer. */
> > +       memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
> > +
> > +       /* 4. Wait for all previous entries to be ready */
> > +       if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
> > +                              0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > +               goto err_busy;
> > +
> > +       /* 5. Complete submission and restore local interrupts */
> > +       dma_wmb();
> > +       riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
> > +       atomic_inc(&queue->tail);
> > +       local_irq_restore(flags);
> > +
> > +       if (timeout_us && riscv_iommu_queue_wait(queue, prod, timeout_us))
> > +               dev_err_once(queue->iommu->dev,
> > +                            "Hardware error: command execution timeout\n");
> > +
> > +       return;
> > +
> > +err_busy:
> > +       local_irq_restore(flags);
> > +       dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
> > +}
> > +
> > +/*
> > + * IOMMU Command queue chapter 3.1
> > + */
> > +
> > +/* Command queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> > +{
> > +       const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > +       unsigned int ctrl;
> > +
> > +       /* Clear MF/CQ errors, complete error recovery to be implemented. */
> > +       ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
> > +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
> > +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
> > +               riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
>
> For RISCV_IOMMU_CQCSR_CMD_ILL and RISCV_IOMMU_CQCSR_CMD_TO, I think we
> need to adjust the head/tail pointer, otherwise, IOMMU might keep
> trying to execute the problematic command.
>
> > +               dev_warn(queue->iommu->dev,
> > +                        "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
>
> It might be a bit weird if we view fence_w_ip as queue error and print
> the message as error?

Driver does not set WSI bit in IOFENCE.C command structure, so it's
unexpected to see fence_w_ip set. This message is warning only about
unexpected command queue interrupt, listing state of possible
triggers.

>
> > +                        queue->qid,
> > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
> > +                        !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
> > +       }
> > +
> > +       /* Placeholder for command queue interrupt notifiers */
> > +
> > +       /* Clear command interrupt pending. */
> > +       riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> > +/* Send command to the IOMMU command queue */
> > +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
> > +                                struct riscv_iommu_command *cmd,
> > +                                unsigned int timeout_us)
> > +{
> > +       riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd), timeout_us);
> > +}
> > +
> > +/*
> > + * IOMMU Fault/Event queue chapter 3.2
> > + */
> > +
> > +static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
> > +                             struct riscv_iommu_fq_record *event)
> > +{
> > +       unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> > +       unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> > +
> > +       /* Placeholder for future fault handling implementation, report only. */
> > +       if (err)
> > +               dev_warn_ratelimited(iommu->dev,
> > +                                    "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
> > +                                    err, devid, event->iotval, event->iotval2);
> > +}
> > +
> > +/* Fault queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> > +{
> > +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > +       struct riscv_iommu_device *iommu = queue->iommu;
> > +       struct riscv_iommu_fq_record *events;
> > +       unsigned int ctrl, idx;
> > +       int cnt, len;
> > +
> > +       events = (struct riscv_iommu_fq_record *)queue->base;
> > +
> > +       /* Clear fault interrupt pending and process all received fault events. */
> > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > +       do {
> > +               cnt = riscv_iommu_queue_consume(queue, &idx);
> > +               for (len = 0; len < cnt; idx++, len++)
> > +                       riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
> > +               riscv_iommu_queue_release(queue, cnt);
> > +       } while (cnt > 0);
> > +
> > +       /* Clear MF/OF errors, complete error recovery to be implemented. */
> > +       ctrl = riscv_iommu_readl(iommu, queue->qcr);
> > +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> > +               riscv_iommu_writel(iommu, queue->qcr, ctrl);
> > +               dev_warn(iommu->dev,
> > +                        "Queue #%u error; memory fault:%d overflow:%d\n",
> > +                        queue->qid,
> > +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> > +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> > +       }
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> >  /* Lookup and initialize device context info structure. */
> >  static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
> >                                                  unsigned int devid)
> > @@ -250,6 +683,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> >         struct device *dev = iommu->dev;
> >         u64 ddtp, rq_ddtp;
> >         unsigned int mode, rq_mode = ddtp_mode;
> > +       struct riscv_iommu_command cmd;
> >
> >         ddtp = riscv_iommu_read_ddtp(iommu);
> >         if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> > @@ -317,6 +751,18 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> >         if (mode != ddtp_mode)
> >                 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
> >
> > +       /* Invalidate device context cache */
> > +       riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> > +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +
> > +       /* Invalidate address translation cache */
> > +       riscv_iommu_cmd_inval_vma(&cmd);
> > +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +
> > +       /* IOFENCE.C */
> > +       riscv_iommu_cmd_iofence(&cmd);
> > +       riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> > +
> >         return 0;
> >  }
> >
> > @@ -492,6 +938,26 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> >                         return -EINVAL;
> >         }
> >
> > +       /*
> > +        * Distribute interrupt vectors, always use first vector for CIV.
> > +        * At least one interrupt is required. Read back and verify.
> > +        */
> > +       if (!iommu->irqs_count)
> > +               return -EINVAL;
> > +
> > +       iommu->ivec = 0;
> > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_FIV, 1 % iommu->irqs_count);
> > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, 2 % iommu->irqs_count);
> > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, 3 % iommu->irqs_count);
> > +       riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_IVEC, iommu->ivec);
> > +
> > +       iommu->ivec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_IVEC);
> > +       if (riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_CIV) >= RISCV_IOMMU_INTR_COUNT ||
> > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_FIV) >= RISCV_IOMMU_INTR_COUNT ||
> > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PIV) >= RISCV_IOMMU_INTR_COUNT ||
> > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PMIV) >= RISCV_IOMMU_INTR_COUNT)
> > +               return -EINVAL;
> > +
> >         return 0;
> >  }
> >
> > @@ -500,12 +966,17 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> >         iommu_device_unregister(&iommu->iommu);
> >         iommu_device_sysfs_remove(&iommu->iommu);
> >         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +       riscv_iommu_queue_disable(&iommu->cmdq);
> > +       riscv_iommu_queue_disable(&iommu->fltq);
> >  }
> >
> >  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >  {
> >         int rc;
> >
> > +       RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
> > +       RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
> > +
> >         rc = riscv_iommu_init_check(iommu);
> >         if (rc)
> >                 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> > @@ -514,10 +985,28 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >         if (rc)
> >                 return rc;
> >
> > -       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > +       rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
> > +                                    sizeof(struct riscv_iommu_command));
> > +       if (rc)
> > +               return rc;
> > +
> > +       rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
> > +                                    sizeof(struct riscv_iommu_fq_record));
> > +       if (rc)
> > +               return rc;
> > +
> > +       rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
> >         if (rc)
> >                 return rc;
> >
> > +       rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
> > +       if (rc)
> > +               goto err_queue_disable;
> > +
> > +       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > +       if (rc)
> > +               goto err_queue_disable;
> > +
> >         rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
> >                                     dev_name(iommu->dev));
> >         if (rc) {
> > @@ -537,5 +1026,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >         iommu_device_sysfs_remove(&iommu->iommu);
> >  err_iodir_off:
> >         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +err_queue_disable:
> > +       riscv_iommu_queue_disable(&iommu->fltq);
> > +       riscv_iommu_queue_disable(&iommu->cmdq);
> >         return rc;
> >  }
> > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > index f1696926582c..03e0c45bc7e1 100644
> > --- a/drivers/iommu/riscv/iommu.h
> > +++ b/drivers/iommu/riscv/iommu.h
> > @@ -17,6 +17,22 @@
> >
> >  #include "iommu-bits.h"
> >
> > +struct riscv_iommu_device;
> > +
> > +struct riscv_iommu_queue {
> > +       atomic_t prod;                          /* unbounded producer allocation index */
> > +       atomic_t head;                          /* unbounded shadow ring buffer consumer index */
> > +       atomic_t tail;                          /* unbounded shadow ring buffer producer index */
> > +       unsigned int mask;                      /* index mask, queue length - 1 */
> > +       unsigned int irq;                       /* allocated interrupt number */
> > +       struct riscv_iommu_device *iommu;       /* iommu device handling the queue when active */
> > +       void *base;                             /* ring buffer kernel pointer */
> > +       dma_addr_t phys;                        /* ring buffer physical address */
> > +       u16 qbr;                                /* base register offset, head and tail reference */
> > +       u16 qcr;                                /* control and status register offset */
> > +       u8 qid;                                 /* queue identifier, same as RISCV_IOMMU_INTR_XX */
> > +};
> > +
> >  struct riscv_iommu_device {
> >         /* iommu core interface */
> >         struct iommu_device iommu;
> > @@ -34,6 +50,11 @@ struct riscv_iommu_device {
> >         /* available interrupt numbers, MSI or WSI */
> >         unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
> >         unsigned int irqs_count;
> > +       unsigned int ivec;
> > +
> > +       /* hardware queues */
> > +       struct riscv_iommu_queue cmdq;
> > +       struct riscv_iommu_queue fltq;
> >
> >         /* device directory */
> >         unsigned int ddt_mode;
> > --
> > 2.34.1
> >

Best,
- Tomasz

> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
Zong Li May 9, 2024, 1:57 a.m. UTC | #3
On Thu, May 9, 2024 at 12:03 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> On Wed, May 8, 2024 at 8:39 AM Zong Li <zong.li@sifive.com> wrote:
> >
> > On Sat, May 4, 2024 at 12:13 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> > >
> > > Introduce device command submission and fault reporting queues,
> > > as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture
> > > Specification.
> > >
> > > Command and fault queues are instantiated in contiguous system memory
> > > local to IOMMU device domain, or mapped from fixed I/O space provided
> > > by the hardware implementation. Detection of the location and maximum
> > > allowed size of the queue utilize WARL properties of queue base control
> > > register. Driver implementation will try to allocate up to 128KB of
> > > system memory, while respecting hardware supported maximum queue size.
> > >
> > > Interrupts allocation is based on interrupt vectors availability and
> > > distributed to all queues in simple round-robin fashion. For hardware
> > > Implementation with fixed event type to interrupt vector assignment
> > > IVEC WARL property is used to discover such mappings.
> > >
> > > Address translation, command and queue fault handling in this change
> > > is limited to simple fault reporting without taking any action.
> > >
> > > Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> > > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > > ---
> > >  drivers/iommu/riscv/iommu-bits.h |  75 +++++
> > >  drivers/iommu/riscv/iommu.c      | 496 ++++++++++++++++++++++++++++++-
> > >  drivers/iommu/riscv/iommu.h      |  21 ++
> > >  3 files changed, 590 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> > > index ba093c29de9f..40c379222821 100644
> > > --- a/drivers/iommu/riscv/iommu-bits.h
> > > +++ b/drivers/iommu/riscv/iommu-bits.h
> > > @@ -704,4 +704,79 @@ struct riscv_iommu_msi_pte {
> > >  #define RISCV_IOMMU_MSI_MRIF_NPPN      RISCV_IOMMU_PPN_FIELD
> > >  #define RISCV_IOMMU_MSI_MRIF_NID_MSB   BIT_ULL(60)
> > >
> > > +/* Helper functions: command structure builders. */
> > > +
> > > +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > > +                                                 u64 addr)
> > > +{
> > > +       cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
> > > +       cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> > > +                                                  int pscid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> > > +                      RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> > > +                                                  int gscid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> > > +                      RISCV_IOMMU_CMD_IOTINVAL_GV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > > +                     RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> > > +                                                 u64 addr, u32 data)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
> > > +                     RISCV_IOMMU_CMD_IOFENCE_AV;
> > > +       cmd->dword1 = addr >> 2;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> > > +{
> > > +       cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > > +                     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> > > +       cmd->dword1 = 0;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> > > +                                                unsigned int devid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
> > > +                      RISCV_IOMMU_CMD_IODIR_DV;
> > > +}
> > > +
> > > +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
> > > +                                                unsigned int pasid)
> > > +{
> > > +       cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
> > > +}
> > > +
> > >  #endif /* _RISCV_IOMMU_BITS_H_ */
> > > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > > index 71b7903d83d4..4349ac8a3990 100644
> > > --- a/drivers/iommu/riscv/iommu.c
> > > +++ b/drivers/iommu/riscv/iommu.c
> > > @@ -25,7 +25,14 @@
> > >  #include "iommu.h"
> > >
> > >  /* Timeouts in [us] */
> > > -#define RISCV_IOMMU_DDTP_TIMEOUT       50000
> > > +#define RISCV_IOMMU_QCSR_TIMEOUT       150000
> > > +#define RISCV_IOMMU_QUEUE_TIMEOUT      150000
> > > +#define RISCV_IOMMU_DDTP_TIMEOUT       10000000
> > > +#define RISCV_IOMMU_IOTINVAL_TIMEOUT   90000000
> > > +
> > > +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
> > > +#define RISCV_IOMMU_DEF_CQ_COUNT       8192
> > > +#define RISCV_IOMMU_DEF_FQ_COUNT       4096
> > >
> > >  /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> > >  #define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> > > @@ -89,6 +96,432 @@ static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
> > >                        riscv_iommu_devres_pages_match, &devres);
> > >  }
> > >
> > > +/*
> > > + * Hardware queue allocation and management.
> > > + */
> > > +
> > > +/* Setup queue base, control registers and default queue length */
> > > +#define RISCV_IOMMU_QUEUE_INIT(q, name) do {                           \
> > > +       struct riscv_iommu_queue *_q = q;                               \
> > > +       _q->qid = RISCV_IOMMU_INTR_ ## name;                            \
> > > +       _q->qbr = RISCV_IOMMU_REG_ ## name ## B;                        \
> > > +       _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;                      \
> > > +       _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
> > > +} while (0)
> > > +
> > > +/* Note: offsets are the same for all queues */
> > > +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> > > +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> > > +#define Q_ITEM(q, index) ((q)->mask & (index))
> > > +#define Q_IPSR(q) BIT((q)->qid)
> > > +
> > > +/*
> > > + * Discover queue ring buffer hardware configuration, allocate in-memory
> > > + * ring buffer or use fixed I/O memory location, configure queue base register.
> > > + * Must be called before hardware queue is enabled.
> > > + *
> > > + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
> > > + * @entry_size - queue single element size in bytes.
> > > + */
> > > +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
> > > +                                  struct riscv_iommu_queue *queue,
> > > +                                  size_t entry_size)
> > > +{
> > > +       unsigned int logsz;
> > > +       u64 qb, rb;
> > > +
> > > +       /*
> > > +        * Use WARL base register property to discover maximum allowed
> > > +        * number of entries and optional fixed IO address for queue location.
> > > +        */
> > > +       riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOGSZ_FIELD);
> > > +       qb = riscv_iommu_readq(iommu, queue->qbr);
> > > +
> > > +       /*
> > > +        * Calculate and verify hardware supported queue length, as reported
> > > +        * by the field LOGSZ, where max queue length is equal to 2^(LOGSZ + 1).
> > > +        * Update queue size based on hardware supported value.
> > > +        */
> > > +       logsz = ilog2(queue->mask);
> > > +       if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb))
> > > +               logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb);
> > > +
> > > +       /*
> > > +        * Use WARL base register property to discover an optional fixed IO
> > > +        * address for queue ring buffer location. Otherwise allocate contigus
> > > +        * system memory.
> > > +        */
> > > +       if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
> > > +               const size_t queue_size = entry_size << (logsz + 1);
> > > +
> > > +               queue->phys = ppn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
> > > +               queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
> > > +       } else {
> > > +               do {
> > > +                       const size_t queue_size = entry_size << (logsz + 1);
> > > +                       const int order = get_order(queue_size);
> > > +
> > > +                       queue->base = riscv_iommu_get_pages(iommu, order);
> > > +                       queue->phys = __pa(queue->base);
> > > +               } while (!queue->base && logsz-- > 0);
> > > +       }
> > > +
> > > +       if (!queue->base)
> > > +               return -ENOMEM;
> > > +
> > > +       qb = phys_to_ppn(queue->phys) |
> > > +            FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, logsz);
> > > +
> > > +       /* Update base register and read back to verify hw accepted our write */
> > > +       riscv_iommu_writeq(iommu, queue->qbr, qb);
> > > +       rb = riscv_iommu_readq(iommu, queue->qbr);
> > > +       if (rb != qb) {
> > > +               dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
> > > +               return -ENODEV;
> > > +       }
> > > +
> > > +       /* Update actual queue mask */
> > > +       queue->mask = (2U << logsz) - 1;
> > > +
> > > +       dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
> > > +               queue->qid, logsz + 1);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +/* Check interrupt queue status, IPSR */
> > > +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > > +
> > > +       if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
> > > +               return IRQ_WAKE_THREAD;
> > > +
> > > +       return IRQ_NONE;
> > > +}
> > > +
> > > +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
> > > +{
> > > +       /* Reuse IVEC.CIV mask for all interrupt vectors mapping. */
> > > +       return (iommu->ivec >> (n * 4)) & RISCV_IOMMU_IVEC_CIV;
> > > +}
> > > +
> > > +/*
> > > + * Enable queue processing in the hardware, register interrupt handler.
> > > + *
> > > + * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
> > > + * @irq_handler - threaded interrupt handler.
> > > + */
> > > +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
> > > +                                   struct riscv_iommu_queue *queue,
> > > +                                   irq_handler_t irq_handler)
> > > +{
> > > +       const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
> > > +       u32 csr;
> > > +       int rc;
> > > +
> > > +       if (queue->iommu)
> > > +               return -EBUSY;
> > > +
> > > +       /* Polling not implemented */
> > > +       if (!irq)
> > > +               return -ENODEV;
> > > +
> > > +       queue->iommu = iommu;
> > > +       rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
> > > +                                 IRQF_ONESHOT | IRQF_SHARED,
> > > +                                 dev_name(iommu->dev), queue);
> > > +       if (rc) {
> > > +               queue->iommu = NULL;
> > > +               return rc;
> > > +       }
> > > +
> > > +       /*
> > > +        * Enable queue with interrupts, clear any memory fault if any.
> > > +        * Wait for the hardware to acknowledge request and activate queue
> > > +        * processing.
> > > +        * Note: All CSR bitfields are in the same offsets for all queues.
> > > +        */
> > > +       riscv_iommu_writel(iommu, queue->qcr,
> > > +                          RISCV_IOMMU_QUEUE_ENABLE |
> > > +                          RISCV_IOMMU_QUEUE_INTR_ENABLE |
> > > +                          RISCV_IOMMU_QUEUE_MEM_FAULT);
> > > +
> > > +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> > > +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > > +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > > +
> > > +       if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
> > > +                                               RISCV_IOMMU_QUEUE_BUSY |
> > > +                                               RISCV_IOMMU_QUEUE_MEM_FAULT))) {
> > > +               /* Best effort to stop and disable failing hardware queue. */
> > > +               riscv_iommu_writel(iommu, queue->qcr, 0);
> > > +               free_irq(irq, queue);
> > > +               queue->iommu = NULL;
> > > +               dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
> > > +               return -EBUSY;
> > > +       }
> > > +
> > > +       /* Clear any pending interrupt flag. */
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +/*
> > > + * Disable queue. Wait for the hardware to acknowledge request and
> > > + * stop processing enqueued requests. Report errors but continue.
> > > + */
> > > +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
> > > +{
> > > +       struct riscv_iommu_device *iommu = queue->iommu;
> > > +       u32 csr;
> > > +
> > > +       if (!iommu)
> > > +               return;
> > > +
> > > +       free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
> > > +       riscv_iommu_writel(iommu, queue->qcr, 0);
> > > +       riscv_iommu_readl_timeout(iommu, queue->qcr,
> > > +                                 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > > +                                 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > > +
> > > +       if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
> > > +               dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
> > > +                       queue->qid, csr);
> > > +
> > > +       queue->iommu = NULL;
> > > +}
> > > +
> > > +/*
> > > + * Returns number of available valid queue entries and the first item index.
> > > + * Update shadow producer index if necessary.
> > > + */
> > > +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
> > > +                                    unsigned int *index)
> > > +{
> > > +       unsigned int head = atomic_read(&queue->head);
> > > +       unsigned int tail = atomic_read(&queue->tail);
> > > +       unsigned int last = Q_ITEM(queue, tail);
> > > +       int available = (int)(tail - head);
> > > +
> > > +       *index = head;
> > > +
> > > +       if (available > 0)
> > > +               return available;
> > > +
> > > +       /* read hardware producer index, check reserved register bits are not set. */
> > > +       if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
> > > +                                     tail, (tail & ~queue->mask) == 0,
> > > +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
> > > +               dev_err_once(queue->iommu->dev,
> > > +                            "Hardware error: queue access timeout\n");
> > > +               return 0;
> > > +       }
> > > +
> > > +       if (tail == last)
> > > +               return 0;
> > > +
> > > +       /* update shadow producer index */
> > > +       return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
> > > +}
> > > +
> > > +/*
> > > + * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
> > > + */
> > > +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
> > > +{
> > > +       const unsigned int head = atomic_add_return(count, &queue->head);
> > > +
> > > +       riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
> > > +}
> > > +
> > > +/* Return actual consumer index based on hardware reported queue head index. */
> > > +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
> > > +{
> > > +       const unsigned int cons = atomic_read(&queue->head);
> > > +       const unsigned int last = Q_ITEM(queue, cons);
> > > +       unsigned int head;
> > > +
> > > +       if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > > +                                     !(head & ~queue->mask),
> > > +                                     0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > > +               return cons;
> > > +
> > > +       return cons + ((head - last) & queue->mask);
> > > +}
> > > +
> > > +/* Wait for submitted item to be processed. */
> > > +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
> > > +                                 unsigned int index,
> > > +                                 unsigned int timeout_us)
> > > +{
> > > +       unsigned int cons = atomic_read(&queue->head);
> > > +
> > > +       /* Already processed by the consumer */
> > > +       if ((int)(cons - index) > 0)
> > > +               return 0;
> > > +
> > > +       /* Monitor consumer index */
> > > +       return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
> > > +                                (int)(cons - index) > 0, 0, timeout_us);
> > > +}
> >
> > Apart from iofence command, it seems to me that we might not be able
> > to use the head pointer to determine whether the command is executed,
> > because spec mentioned that the IOMMU advancing cqh is not a guarantee
> > that the commands fetched by the IOMMU have been executed or
> > committed. For iofence completion, perhaps using ADD and AV to write
> > memory would be better as well.
> >
>
> I'll update the comment to the function that this is *only* allowed to
> track IOFENCE.C completion. Call to riscv_iommu_queue_wait() is used
> only for this purpose (unless specification will be extended to track
> other commands with head/tail updates).

I'm not sure whether it is a good idea if we check the opcode around
there, but adding comment is ok to me. Thanks

>
> Previous version of the command tracking used address/AV notifier, but
> after several iterations and testing we've considered tracking
> head/tail as best approach.

Thanks for bringing the information back here again.

>
> > > +
> > > +/* Enqueue an entry and wait to be processed if timeout_us > 0
> > > + *
> > > + * Error handling for IOMMU hardware not responding in reasonable time
> > > + * will be added as separate patch series along with other RAS features.
> > > + * For now, only report hardware failure and continue.
> > > + */
> > > +static void riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
> > > +                                  void *entry, size_t entry_size,
> > > +                                  unsigned int timeout_us)
> > > +{
> > > +       unsigned int prod;
> > > +       unsigned int head;
> > > +       unsigned int tail;
> > > +       unsigned long flags;
> > > +
> > > +       /* Do not preempt submission flow. */
> > > +       local_irq_save(flags);
> > > +
> > > +       /* 1. Allocate some space in the queue */
> > > +       prod = atomic_inc_return(&queue->prod) - 1;
> > > +       head = atomic_read(&queue->head);
> > > +
> > > +       /* 2. Wait for space availability. */
> > > +       if ((prod - head) > queue->mask) {
> > > +               if (readx_poll_timeout(atomic_read, &queue->head,
> > > +                                      head, (prod - head) < queue->mask,
> > > +                                      0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > > +                       goto err_busy;
> > > +       } else if ((prod - head) == queue->mask) {
> > > +               const unsigned int last = Q_ITEM(queue, head);
> > > +
> > > +               if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > > +                                             !(head & ~queue->mask) && head != last,
> > > +                                             0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > > +                       goto err_busy;
> > > +               atomic_add((head - last) & queue->mask, &queue->head);
> > > +       }
> > > +
> > > +       /* 3. Store entry in the ring buffer. */
> > > +       memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
> > > +
> > > +       /* 4. Wait for all previous entries to be ready */
> > > +       if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
> > > +                              0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > > +               goto err_busy;
> > > +
> > > +       /* 5. Complete submission and restore local interrupts */
> > > +       dma_wmb();
> > > +       riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
> > > +       atomic_inc(&queue->tail);
> > > +       local_irq_restore(flags);
> > > +
> > > +       if (timeout_us && riscv_iommu_queue_wait(queue, prod, timeout_us))
> > > +               dev_err_once(queue->iommu->dev,
> > > +                            "Hardware error: command execution timeout\n");
> > > +
> > > +       return;
> > > +
> > > +err_busy:
> > > +       local_irq_restore(flags);
> > > +       dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
> > > +}
> > > +
> > > +/*
> > > + * IOMMU Command queue chapter 3.1
> > > + */
> > > +
> > > +/* Command queue interrupt handler thread function */
> > > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> > > +{
> > > +       const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > > +       unsigned int ctrl;
> > > +
> > > +       /* Clear MF/CQ errors, complete error recovery to be implemented. */
> > > +       ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
> > > +       if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
> > > +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
> > > +               riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
> >
> > For RISCV_IOMMU_CQCSR_CMD_ILL and RISCV_IOMMU_CQCSR_CMD_TO, I think we
> > need to adjust the head/tail pointer, otherwise, IOMMU might keep
> > trying to execute the problematic command.
> >
> > > +               dev_warn(queue->iommu->dev,
> > > +                        "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
> >
> > It might be a bit weird if we view fence_w_ip as queue error and print
> > the message as error?
>
> Driver does not set WSI bit in IOFENCE.C command structure, so it's
> unexpected to see fence_w_ip set. This message is warning only about
> unexpected command queue interrupt, listing state of possible
> triggers.

Okay, perhaps we can adjust it in the future when we actually enable
the fence_w_ip. Thanks

>
> >
> > > +                        queue->qid,
> > > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> > > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> > > +                        !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
> > > +                        !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
> > > +       }
> > > +
> > > +       /* Placeholder for command queue interrupt notifiers */
> > > +
> > > +       /* Clear command interrupt pending. */
> > > +       riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > > +/* Send command to the IOMMU command queue */
> > > +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
> > > +                                struct riscv_iommu_command *cmd,
> > > +                                unsigned int timeout_us)
> > > +{
> > > +       riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd), timeout_us);
> > > +}
> > > +
> > > +/*
> > > + * IOMMU Fault/Event queue chapter 3.2
> > > + */
> > > +
> > > +static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
> > > +                             struct riscv_iommu_fq_record *event)
> > > +{
> > > +       unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> > > +       unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> > > +
> > > +       /* Placeholder for future fault handling implementation, report only. */
> > > +       if (err)
> > > +               dev_warn_ratelimited(iommu->dev,
> > > +                                    "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
> > > +                                    err, devid, event->iotval, event->iotval2);
> > > +}
> > > +
> > > +/* Fault queue interrupt handler thread function */
> > > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> > > +{
> > > +       struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > > +       struct riscv_iommu_device *iommu = queue->iommu;
> > > +       struct riscv_iommu_fq_record *events;
> > > +       unsigned int ctrl, idx;
> > > +       int cnt, len;
> > > +
> > > +       events = (struct riscv_iommu_fq_record *)queue->base;
> > > +
> > > +       /* Clear fault interrupt pending and process all received fault events. */
> > > +       riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > > +
> > > +       do {
> > > +               cnt = riscv_iommu_queue_consume(queue, &idx);
> > > +               for (len = 0; len < cnt; idx++, len++)
> > > +                       riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
> > > +               riscv_iommu_queue_release(queue, cnt);
> > > +       } while (cnt > 0);
> > > +
> > > +       /* Clear MF/OF errors, complete error recovery to be implemented. */
> > > +       ctrl = riscv_iommu_readl(iommu, queue->qcr);
> > > +       if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> > > +               riscv_iommu_writel(iommu, queue->qcr, ctrl);
> > > +               dev_warn(iommu->dev,
> > > +                        "Queue #%u error; memory fault:%d overflow:%d\n",
> > > +                        queue->qid,
> > > +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> > > +                        !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> > > +       }
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > >  /* Lookup and initialize device context info structure. */
> > >  static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
> > >                                                  unsigned int devid)
> > > @@ -250,6 +683,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> > >         struct device *dev = iommu->dev;
> > >         u64 ddtp, rq_ddtp;
> > >         unsigned int mode, rq_mode = ddtp_mode;
> > > +       struct riscv_iommu_command cmd;
> > >
> > >         ddtp = riscv_iommu_read_ddtp(iommu);
> > >         if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> > > @@ -317,6 +751,18 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> > >         if (mode != ddtp_mode)
> > >                 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
> > >
> > > +       /* Invalidate device context cache */
> > > +       riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> > > +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> > > +
> > > +       /* Invalidate address translation cache */
> > > +       riscv_iommu_cmd_inval_vma(&cmd);
> > > +       riscv_iommu_cmd_send(iommu, &cmd, 0);
> > > +
> > > +       /* IOFENCE.C */
> > > +       riscv_iommu_cmd_iofence(&cmd);
> > > +       riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> > > +
> > >         return 0;
> > >  }
> > >
> > > @@ -492,6 +938,26 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> > >                         return -EINVAL;
> > >         }
> > >
> > > +       /*
> > > +        * Distribute interrupt vectors, always use first vector for CIV.
> > > +        * At least one interrupt is required. Read back and verify.
> > > +        */
> > > +       if (!iommu->irqs_count)
> > > +               return -EINVAL;
> > > +
> > > +       iommu->ivec = 0;
> > > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_FIV, 1 % iommu->irqs_count);
> > > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, 2 % iommu->irqs_count);
> > > +       iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, 3 % iommu->irqs_count);
> > > +       riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_IVEC, iommu->ivec);
> > > +
> > > +       iommu->ivec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_IVEC);
> > > +       if (riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_CIV) >= RISCV_IOMMU_INTR_COUNT ||
> > > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_FIV) >= RISCV_IOMMU_INTR_COUNT ||
> > > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PIV) >= RISCV_IOMMU_INTR_COUNT ||
> > > +           riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PMIV) >= RISCV_IOMMU_INTR_COUNT)
> > > +               return -EINVAL;
> > > +
> > >         return 0;
> > >  }
> > >
> > > @@ -500,12 +966,17 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> > >         iommu_device_unregister(&iommu->iommu);
> > >         iommu_device_sysfs_remove(&iommu->iommu);
> > >         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > > +       riscv_iommu_queue_disable(&iommu->cmdq);
> > > +       riscv_iommu_queue_disable(&iommu->fltq);
> > >  }
> > >
> > >  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >  {
> > >         int rc;
> > >
> > > +       RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
> > > +       RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
> > > +
> > >         rc = riscv_iommu_init_check(iommu);
> > >         if (rc)
> > >                 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> > > @@ -514,10 +985,28 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >         if (rc)
> > >                 return rc;
> > >
> > > -       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > > +       rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
> > > +                                    sizeof(struct riscv_iommu_command));
> > > +       if (rc)
> > > +               return rc;
> > > +
> > > +       rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
> > > +                                    sizeof(struct riscv_iommu_fq_record));
> > > +       if (rc)
> > > +               return rc;
> > > +
> > > +       rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
> > >         if (rc)
> > >                 return rc;
> > >
> > > +       rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
> > > +       if (rc)
> > > +               goto err_queue_disable;
> > > +
> > > +       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > > +       if (rc)
> > > +               goto err_queue_disable;
> > > +
> > >         rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
> > >                                     dev_name(iommu->dev));
> > >         if (rc) {
> > > @@ -537,5 +1026,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > >         iommu_device_sysfs_remove(&iommu->iommu);
> > >  err_iodir_off:
> > >         riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > > +err_queue_disable:
> > > +       riscv_iommu_queue_disable(&iommu->fltq);
> > > +       riscv_iommu_queue_disable(&iommu->cmdq);
> > >         return rc;
> > >  }
> > > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > > index f1696926582c..03e0c45bc7e1 100644
> > > --- a/drivers/iommu/riscv/iommu.h
> > > +++ b/drivers/iommu/riscv/iommu.h
> > > @@ -17,6 +17,22 @@
> > >
> > >  #include "iommu-bits.h"
> > >
> > > +struct riscv_iommu_device;
> > > +
> > > +struct riscv_iommu_queue {
> > > +       atomic_t prod;                          /* unbounded producer allocation index */
> > > +       atomic_t head;                          /* unbounded shadow ring buffer consumer index */
> > > +       atomic_t tail;                          /* unbounded shadow ring buffer producer index */
> > > +       unsigned int mask;                      /* index mask, queue length - 1 */
> > > +       unsigned int irq;                       /* allocated interrupt number */
> > > +       struct riscv_iommu_device *iommu;       /* iommu device handling the queue when active */
> > > +       void *base;                             /* ring buffer kernel pointer */
> > > +       dma_addr_t phys;                        /* ring buffer physical address */
> > > +       u16 qbr;                                /* base register offset, head and tail reference */
> > > +       u16 qcr;                                /* control and status register offset */
> > > +       u8 qid;                                 /* queue identifier, same as RISCV_IOMMU_INTR_XX */
> > > +};
> > > +
> > >  struct riscv_iommu_device {
> > >         /* iommu core interface */
> > >         struct iommu_device iommu;
> > > @@ -34,6 +50,11 @@ struct riscv_iommu_device {
> > >         /* available interrupt numbers, MSI or WSI */
> > >         unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
> > >         unsigned int irqs_count;
> > > +       unsigned int ivec;
> > > +
> > > +       /* hardware queues */
> > > +       struct riscv_iommu_queue cmdq;
> > > +       struct riscv_iommu_queue fltq;
> > >
> > >         /* device directory */
> > >         unsigned int ddt_mode;
> > > --
> > > 2.34.1
> > >
>
> Best,
> - Tomasz
>
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > linux-riscv@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
index ba093c29de9f..40c379222821 100644
--- a/drivers/iommu/riscv/iommu-bits.h
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -704,4 +704,79 @@  struct riscv_iommu_msi_pte {
 #define RISCV_IOMMU_MSI_MRIF_NPPN	RISCV_IOMMU_PPN_FIELD
 #define RISCV_IOMMU_MSI_MRIF_NID_MSB	BIT_ULL(60)
 
+/* Helper functions: command structure builders. */
+
+static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
+						  u64 addr)
+{
+	cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
+	cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
+}
+
+static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
+						   int pscid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
+		       RISCV_IOMMU_CMD_IOTINVAL_PSCV;
+}
+
+static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
+						   int gscid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
+		       RISCV_IOMMU_CMD_IOTINVAL_GV;
+}
+
+static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
+		      RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
+						  u64 addr, u32 data)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
+		      RISCV_IOMMU_CMD_IOFENCE_AV;
+	cmd->dword1 = addr >> 2;
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
+{
+	cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
+	cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
+						 unsigned int devid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
+		       RISCV_IOMMU_CMD_IODIR_DV;
+}
+
+static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
+						 unsigned int pasid)
+{
+	cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
+}
+
 #endif /* _RISCV_IOMMU_BITS_H_ */
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 71b7903d83d4..4349ac8a3990 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -25,7 +25,14 @@ 
 #include "iommu.h"
 
 /* Timeouts in [us] */
-#define RISCV_IOMMU_DDTP_TIMEOUT	50000
+#define RISCV_IOMMU_QCSR_TIMEOUT	150000
+#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
+#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
+#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000
+
+/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
+#define RISCV_IOMMU_DEF_CQ_COUNT	8192
+#define RISCV_IOMMU_DEF_FQ_COUNT	4096
 
 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
 #define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
@@ -89,6 +96,432 @@  static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
 		       riscv_iommu_devres_pages_match, &devres);
 }
 
+/*
+ * Hardware queue allocation and management.
+ */
+
+/* Setup queue base, control registers and default queue length */
+#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
+	struct riscv_iommu_queue *_q = q;				\
+	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
+	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
+	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
+	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
+} while (0)
+
+/* Note: offsets are the same for all queues */
+#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
+#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
+#define Q_ITEM(q, index) ((q)->mask & (index))
+#define Q_IPSR(q) BIT((q)->qid)
+
+/*
+ * Discover queue ring buffer hardware configuration, allocate in-memory
+ * ring buffer or use fixed I/O memory location, configure queue base register.
+ * Must be called before hardware queue is enabled.
+ *
+ * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
+ * @entry_size - queue single element size in bytes.
+ */
+static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
+				   struct riscv_iommu_queue *queue,
+				   size_t entry_size)
+{
+	unsigned int logsz;
+	u64 qb, rb;
+
+	/*
+	 * Use WARL base register property to discover maximum allowed
+	 * number of entries and optional fixed IO address for queue location.
+	 */
+	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOGSZ_FIELD);
+	qb = riscv_iommu_readq(iommu, queue->qbr);
+
+	/*
+	 * Calculate and verify hardware supported queue length, as reported
+	 * by the field LOGSZ, where max queue length is equal to 2^(LOGSZ + 1).
+	 * Update queue size based on hardware supported value.
+	 */
+	logsz = ilog2(queue->mask);
+	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb))
+		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb);
+
+	/*
+	 * Use WARL base register property to discover an optional fixed IO
+	 * address for queue ring buffer location. Otherwise allocate contigus
+	 * system memory.
+	 */
+	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
+		const size_t queue_size = entry_size << (logsz + 1);
+
+		queue->phys = ppn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
+		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
+	} else {
+		do {
+			const size_t queue_size = entry_size << (logsz + 1);
+			const int order = get_order(queue_size);
+
+			queue->base = riscv_iommu_get_pages(iommu, order);
+			queue->phys = __pa(queue->base);
+		} while (!queue->base && logsz-- > 0);
+	}
+
+	if (!queue->base)
+		return -ENOMEM;
+
+	qb = phys_to_ppn(queue->phys) |
+	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, logsz);
+
+	/* Update base register and read back to verify hw accepted our write */
+	riscv_iommu_writeq(iommu, queue->qbr, qb);
+	rb = riscv_iommu_readq(iommu, queue->qbr);
+	if (rb != qb) {
+		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
+		return -ENODEV;
+	}
+
+	/* Update actual queue mask */
+	queue->mask = (2U << logsz) - 1;
+
+	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
+		queue->qid, logsz + 1);
+
+	return 0;
+}
+
+/* Check interrupt queue status, IPSR */
+static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
+{
+	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+
+	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
+		return IRQ_WAKE_THREAD;
+
+	return IRQ_NONE;
+}
+
+static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
+{
+	/* Reuse IVEC.CIV mask for all interrupt vectors mapping. */
+	return (iommu->ivec >> (n * 4)) & RISCV_IOMMU_IVEC_CIV;
+}
+
+/*
+ * Enable queue processing in the hardware, register interrupt handler.
+ *
+ * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
+ * @irq_handler - threaded interrupt handler.
+ */
+static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
+				    struct riscv_iommu_queue *queue,
+				    irq_handler_t irq_handler)
+{
+	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
+	u32 csr;
+	int rc;
+
+	if (queue->iommu)
+		return -EBUSY;
+
+	/* Polling not implemented */
+	if (!irq)
+		return -ENODEV;
+
+	queue->iommu = iommu;
+	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
+				  IRQF_ONESHOT | IRQF_SHARED,
+				  dev_name(iommu->dev), queue);
+	if (rc) {
+		queue->iommu = NULL;
+		return rc;
+	}
+
+	/*
+	 * Enable queue with interrupts, clear any memory fault if any.
+	 * Wait for the hardware to acknowledge request and activate queue
+	 * processing.
+	 * Note: All CSR bitfields are in the same offsets for all queues.
+	 */
+	riscv_iommu_writel(iommu, queue->qcr,
+			   RISCV_IOMMU_QUEUE_ENABLE |
+			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
+			   RISCV_IOMMU_QUEUE_MEM_FAULT);
+
+	riscv_iommu_readl_timeout(iommu, queue->qcr,
+				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+				  10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
+						RISCV_IOMMU_QUEUE_BUSY |
+						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
+		/* Best effort to stop and disable failing hardware queue. */
+		riscv_iommu_writel(iommu, queue->qcr, 0);
+		free_irq(irq, queue);
+		queue->iommu = NULL;
+		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
+		return -EBUSY;
+	}
+
+	/* Clear any pending interrupt flag. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+	return 0;
+}
+
+/*
+ * Disable queue. Wait for the hardware to acknowledge request and
+ * stop processing enqueued requests. Report errors but continue.
+ */
+static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
+{
+	struct riscv_iommu_device *iommu = queue->iommu;
+	u32 csr;
+
+	if (!iommu)
+		return;
+
+	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
+	riscv_iommu_writel(iommu, queue->qcr, 0);
+	riscv_iommu_readl_timeout(iommu, queue->qcr,
+				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+				  10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
+		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
+			queue->qid, csr);
+
+	queue->iommu = NULL;
+}
+
+/*
+ * Returns number of available valid queue entries and the first item index.
+ * Update shadow producer index if necessary.
+ */
+static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
+				     unsigned int *index)
+{
+	unsigned int head = atomic_read(&queue->head);
+	unsigned int tail = atomic_read(&queue->tail);
+	unsigned int last = Q_ITEM(queue, tail);
+	int available = (int)(tail - head);
+
+	*index = head;
+
+	if (available > 0)
+		return available;
+
+	/* read hardware producer index, check reserved register bits are not set. */
+	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
+				      tail, (tail & ~queue->mask) == 0,
+				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
+		dev_err_once(queue->iommu->dev,
+			     "Hardware error: queue access timeout\n");
+		return 0;
+	}
+
+	if (tail == last)
+		return 0;
+
+	/* update shadow producer index */
+	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
+}
+
+/*
+ * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
+ */
+static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
+{
+	const unsigned int head = atomic_add_return(count, &queue->head);
+
+	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
+}
+
+/* Return actual consumer index based on hardware reported queue head index. */
+static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
+{
+	const unsigned int cons = atomic_read(&queue->head);
+	const unsigned int last = Q_ITEM(queue, cons);
+	unsigned int head;
+
+	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+				      !(head & ~queue->mask),
+				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
+		return cons;
+
+	return cons + ((head - last) & queue->mask);
+}
+
+/* Wait for submitted item to be processed. */
+static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
+				  unsigned int index,
+				  unsigned int timeout_us)
+{
+	unsigned int cons = atomic_read(&queue->head);
+
+	/* Already processed by the consumer */
+	if ((int)(cons - index) > 0)
+		return 0;
+
+	/* Monitor consumer index */
+	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
+				 (int)(cons - index) > 0, 0, timeout_us);
+}
+
+/* Enqueue an entry and wait to be processed if timeout_us > 0
+ *
+ * Error handling for IOMMU hardware not responding in reasonable time
+ * will be added as separate patch series along with other RAS features.
+ * For now, only report hardware failure and continue.
+ */
+static void riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
+				   void *entry, size_t entry_size,
+				   unsigned int timeout_us)
+{
+	unsigned int prod;
+	unsigned int head;
+	unsigned int tail;
+	unsigned long flags;
+
+	/* Do not preempt submission flow. */
+	local_irq_save(flags);
+
+	/* 1. Allocate some space in the queue */
+	prod = atomic_inc_return(&queue->prod) - 1;
+	head = atomic_read(&queue->head);
+
+	/* 2. Wait for space availability. */
+	if ((prod - head) > queue->mask) {
+		if (readx_poll_timeout(atomic_read, &queue->head,
+				       head, (prod - head) < queue->mask,
+				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
+			goto err_busy;
+	} else if ((prod - head) == queue->mask) {
+		const unsigned int last = Q_ITEM(queue, head);
+
+		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+					      !(head & ~queue->mask) && head != last,
+					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
+			goto err_busy;
+		atomic_add((head - last) & queue->mask, &queue->head);
+	}
+
+	/* 3. Store entry in the ring buffer. */
+	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
+
+	/* 4. Wait for all previous entries to be ready */
+	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
+			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
+		goto err_busy;
+
+	/* 5. Complete submission and restore local interrupts */
+	dma_wmb();
+	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
+	atomic_inc(&queue->tail);
+	local_irq_restore(flags);
+
+	if (timeout_us && riscv_iommu_queue_wait(queue, prod, timeout_us))
+		dev_err_once(queue->iommu->dev,
+			     "Hardware error: command execution timeout\n");
+
+	return;
+
+err_busy:
+	local_irq_restore(flags);
+	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
+}
+
+/*
+ * IOMMU Command queue chapter 3.1
+ */
+
+/* Command queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
+{
+	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+	unsigned int ctrl;
+
+	/* Clear MF/CQ errors, complete error recovery to be implemented. */
+	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
+	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
+		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
+		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
+		dev_warn(queue->iommu->dev,
+			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
+			 queue->qid,
+			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
+			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
+			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
+			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
+	}
+
+	/* Placeholder for command queue interrupt notifiers */
+
+	/* Clear command interrupt pending. */
+	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+	return IRQ_HANDLED;
+}
+
+/* Send command to the IOMMU command queue */
+static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
+				 struct riscv_iommu_command *cmd,
+				 unsigned int timeout_us)
+{
+	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd), timeout_us);
+}
+
+/*
+ * IOMMU Fault/Event queue chapter 3.2
+ */
+
+static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
+			      struct riscv_iommu_fq_record *event)
+{
+	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
+
+	/* Placeholder for future fault handling implementation, report only. */
+	if (err)
+		dev_warn_ratelimited(iommu->dev,
+				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
+				     err, devid, event->iotval, event->iotval2);
+}
+
+/* Fault queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
+{
+	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+	struct riscv_iommu_device *iommu = queue->iommu;
+	struct riscv_iommu_fq_record *events;
+	unsigned int ctrl, idx;
+	int cnt, len;
+
+	events = (struct riscv_iommu_fq_record *)queue->base;
+
+	/* Clear fault interrupt pending and process all received fault events. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+	do {
+		cnt = riscv_iommu_queue_consume(queue, &idx);
+		for (len = 0; len < cnt; idx++, len++)
+			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
+		riscv_iommu_queue_release(queue, cnt);
+	} while (cnt > 0);
+
+	/* Clear MF/OF errors, complete error recovery to be implemented. */
+	ctrl = riscv_iommu_readl(iommu, queue->qcr);
+	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
+		riscv_iommu_writel(iommu, queue->qcr, ctrl);
+		dev_warn(iommu->dev,
+			 "Queue #%u error; memory fault:%d overflow:%d\n",
+			 queue->qid,
+			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
+			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
+	}
+
+	return IRQ_HANDLED;
+}
+
 /* Lookup and initialize device context info structure. */
 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
 						 unsigned int devid)
@@ -250,6 +683,7 @@  static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
 	struct device *dev = iommu->dev;
 	u64 ddtp, rq_ddtp;
 	unsigned int mode, rq_mode = ddtp_mode;
+	struct riscv_iommu_command cmd;
 
 	ddtp = riscv_iommu_read_ddtp(iommu);
 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
@@ -317,6 +751,18 @@  static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
 	if (mode != ddtp_mode)
 		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
 
+	/* Invalidate device context cache */
+	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+	riscv_iommu_cmd_send(iommu, &cmd, 0);
+
+	/* Invalidate address translation cache */
+	riscv_iommu_cmd_inval_vma(&cmd);
+	riscv_iommu_cmd_send(iommu, &cmd, 0);
+
+	/* IOFENCE.C */
+	riscv_iommu_cmd_iofence(&cmd);
+	riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
 	return 0;
 }
 
@@ -492,6 +938,26 @@  static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
 			return -EINVAL;
 	}
 
+	/*
+	 * Distribute interrupt vectors, always use first vector for CIV.
+	 * At least one interrupt is required. Read back and verify.
+	 */
+	if (!iommu->irqs_count)
+		return -EINVAL;
+
+	iommu->ivec = 0;
+	iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_FIV, 1 % iommu->irqs_count);
+	iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, 2 % iommu->irqs_count);
+	iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, 3 % iommu->irqs_count);
+	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_IVEC, iommu->ivec);
+
+	iommu->ivec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_IVEC);
+	if (riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_CIV) >= RISCV_IOMMU_INTR_COUNT ||
+	    riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_FIV) >= RISCV_IOMMU_INTR_COUNT ||
+	    riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PIV) >= RISCV_IOMMU_INTR_COUNT ||
+	    riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PMIV) >= RISCV_IOMMU_INTR_COUNT)
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -500,12 +966,17 @@  void riscv_iommu_remove(struct riscv_iommu_device *iommu)
 	iommu_device_unregister(&iommu->iommu);
 	iommu_device_sysfs_remove(&iommu->iommu);
 	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
+	riscv_iommu_queue_disable(&iommu->cmdq);
+	riscv_iommu_queue_disable(&iommu->fltq);
 }
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu)
 {
 	int rc;
 
+	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
+	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
+
 	rc = riscv_iommu_init_check(iommu);
 	if (rc)
 		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
@@ -514,10 +985,28 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	if (rc)
 		return rc;
 
-	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
+	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
+				     sizeof(struct riscv_iommu_command));
+	if (rc)
+		return rc;
+
+	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
+				     sizeof(struct riscv_iommu_fq_record));
+	if (rc)
+		return rc;
+
+	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
 	if (rc)
 		return rc;
 
+	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
+	if (rc)
+		goto err_queue_disable;
+
+	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
+	if (rc)
+		goto err_queue_disable;
+
 	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
 				    dev_name(iommu->dev));
 	if (rc) {
@@ -537,5 +1026,8 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	iommu_device_sysfs_remove(&iommu->iommu);
 err_iodir_off:
 	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
+err_queue_disable:
+	riscv_iommu_queue_disable(&iommu->fltq);
+	riscv_iommu_queue_disable(&iommu->cmdq);
 	return rc;
 }
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index f1696926582c..03e0c45bc7e1 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -17,6 +17,22 @@ 
 
 #include "iommu-bits.h"
 
+struct riscv_iommu_device;
+
+struct riscv_iommu_queue {
+	atomic_t prod;				/* unbounded producer allocation index */
+	atomic_t head;				/* unbounded shadow ring buffer consumer index */
+	atomic_t tail;				/* unbounded shadow ring buffer producer index */
+	unsigned int mask;			/* index mask, queue length - 1 */
+	unsigned int irq;			/* allocated interrupt number */
+	struct riscv_iommu_device *iommu;	/* iommu device handling the queue when active */
+	void *base;				/* ring buffer kernel pointer */
+	dma_addr_t phys;			/* ring buffer physical address */
+	u16 qbr;				/* base register offset, head and tail reference */
+	u16 qcr;				/* control and status register offset */
+	u8 qid;					/* queue identifier, same as RISCV_IOMMU_INTR_XX */
+};
+
 struct riscv_iommu_device {
 	/* iommu core interface */
 	struct iommu_device iommu;
@@ -34,6 +50,11 @@  struct riscv_iommu_device {
 	/* available interrupt numbers, MSI or WSI */
 	unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
 	unsigned int irqs_count;
+	unsigned int ivec;
+
+	/* hardware queues */
+	struct riscv_iommu_queue cmdq;
+	struct riscv_iommu_queue fltq;
 
 	/* device directory */
 	unsigned int ddt_mode;