diff mbox series

[RFC,1/2] hw/riscv: rivos-iommu: Baseline implementation of RIVOS IOMMU.

Message ID 20220316222116.2492777-2-tjeznach@rivosinc.com (mailing list archive)
State New, archived
Headers show
Series hw/riscv: Baseline QEMU support for RISC-V IOMMU (draft) | expand

Commit Message

Tomasz Jeznach March 16, 2022, 10:21 p.m. UTC
The patch introduces a baseline implementation of a draft proposal
of the RISC-V IOMMU specification as discussed in the RISC-V Forum [1] [2].

The implementation follows a draft version of the specification published
at [3] including all updates available on 2022/03/10.

This patch covers baseline features proposed in the specification:
- Two stage address translation, with Sv32, Sv39, Sv48, Sv57 modes.
- Multilevel device directory tree.
- Cache management command interface.
- Fault reporting interface.

References:
 [1] https://lists.riscv.org/g/tech-privileged/message/875
 [2] https://lists.riscv.org/g/tech-iommu/message/3
 [3] https://docs.google.com/document/d/1ytBZ6eDk1pAeBlZjDvm6_qqJbKQ0fMYKedyx0uoAGB0/view

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 hw/riscv/Kconfig               |    3 +
 hw/riscv/meson.build           |    1 +
 hw/riscv/rivos_iommu.c         | 1350 ++++++++++++++++++++++++++++++++
 hw/riscv/trace-events          |    7 +
 hw/riscv/trace.h               |    2 +
 include/hw/pci/pci_ids.h       |    1 +
 include/hw/riscv/rivos_iommu.h |   80 ++
 meson.build                    |    1 +
 8 files changed, 1445 insertions(+)
 create mode 100644 hw/riscv/rivos_iommu.c
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/rivos_iommu.h

Comments

Alistair Francis April 21, 2022, 6:13 a.m. UTC | #1
On Thu, Mar 17, 2022 at 8:25 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> The patch introduces baseline implementation of a draft proposal
> of RISC-V IOMMU specification as discussed in the RISC-V Forum [1] [2].
>
> The implementation follows a draft version of the specification published
> at [3] including all updates available on 2022/03/10.
>
> This patch covers baseline features proposed in the specification:
> - Two stage address translation, with Sv32, Sv39, Sv48, Sv57 modes.
> - Multilevel device directory tree.
> - Cache management command interface.
> - Fault reporting interface.
>
> References:
>  [1] https://lists.riscv.org/g/tech-privileged/message/875
>  [2] https://lists.riscv.org/g/tech-iommu/message/3
>  [3] https://docs.google.com/document/d/1ytBZ6eDk1pAeBlZjDvm6_qqJbKQ0fMYKedyx0uoAGB0/view

Thanks for the patch!

Overall it looks good. It looks like it's on the right track, we
probably just need the spec to stabilise a bit more before we accept
it though.

>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  hw/riscv/Kconfig               |    3 +
>  hw/riscv/meson.build           |    1 +
>  hw/riscv/rivos_iommu.c         | 1350 ++++++++++++++++++++++++++++++++
>  hw/riscv/trace-events          |    7 +
>  hw/riscv/trace.h               |    2 +
>  include/hw/pci/pci_ids.h       |    1 +
>  include/hw/riscv/rivos_iommu.h |   80 ++
>  meson.build                    |    1 +
>  8 files changed, 1445 insertions(+)
>  create mode 100644 hw/riscv/rivos_iommu.c
>  create mode 100644 hw/riscv/trace-events
>  create mode 100644 hw/riscv/trace.h
>  create mode 100644 include/hw/riscv/rivos_iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index 91bb9d21c4..c6cbd7b42c 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -4,6 +4,9 @@ config RISCV_NUMA
>  config IBEX
>      bool
>
> +config RIVOS_IOMMU
> +    bool
> +
>  config MICROCHIP_PFSOC
>      bool
>      select CADENCE_SDHCI
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index ab6cae57ea..a2aeb5fab4 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -9,5 +9,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_E', if_true: files('sifive_e.c'))
>  riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
> +riscv_ss.add(when: 'CONFIG_RIVOS_IOMMU', if_true: files('rivos_iommu.c'))
>
>  hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/rivos_iommu.c b/hw/riscv/rivos_iommu.c
> new file mode 100644
> index 0000000000..f043a6864a
> --- /dev/null
> +++ b/hw/riscv/rivos_iommu.c
> @@ -0,0 +1,1350 @@
> +/*
> + * QEMU emulation of an RISC-V RIVOS-IOMMU
> + *
> + * Copyright (C) 2022 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "hw/riscv/rivos_iommu.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +
> +#include "trace.h"
> +
> +
> +/* Based on Rivos RISC-V IOMMU Specification, Mar 10, 2022 */
> +
> +/* Rivos I/O programming interface registers */
> +#define RIO_REG_CAP             0x0000  /* Supported capabilities  */
> +#define RIO_REG_DDTP            0x0010  /* Device Directory Table Pointer */
> +#define RIO_REG_CQ_BASE         0x0018  /* Command queue base/head/tail */
> +#define RIO_REG_CQ_HEAD         0x0020
> +#define RIO_REG_CQ_TAIL         0x0024
> +#define RIO_REG_FQ_BASE         0x0028  /* Fault queue base/head/tail */
> +#define RIO_REG_FQ_HEAD         0x0030
> +#define RIO_REG_FQ_TAIL         0x0034
> +#define RIO_REG_PQ_BASE         0x0038  /* Page request queue base/head/tail */
> +#define RIO_REG_PQ_HEAD         0x0040
> +#define RIO_REG_PQ_TAIL         0x0044
> +#define RIO_REG_CQ_CONTROL      0x0048  /* Command queue control */
> +#define RIO_REG_FQ_CONTROL      0x004C  /* Fault queue control */
> +#define RIO_REG_PQ_CONTROL      0x0050  /* Page request queue control */
> +#define RIO_REG_IPSR            0x0054  /* Interrupt pending status  */
> +#define RIO_REG_IOCNTOVF        0x0058
> +#define RIO_REG_IOCNTINH        0x005C
> +#define RIO_REG_IOHPMCYCLES     0x0060
> +#define RIO_REG_IOHPMCTR_BASE   0x0068
> +#define RIO_REG_IOHPMEVT_BASE   0x0160
> +#define RIO_REG_IOCNTSEC        0x0258
> +#define RIO_REG_IVEC            0x02F8  /* Interrupt cause to vector mapping */
> +#define RIO_REG_MSI_ADDR_BASE   0x0300  /* MSI address for vector #0 */
> +#define RIO_REG_MSI_DATA_BASE   0x0308  /* MSI data for vector #0 */
> +#define RIO_REG_MSI_CTRL_BASE   0x030C  /* MSI control for vector #0 */
> +#define RIO_REG_MSI_PBA_BASE    0x0400  /* MSI Pending Bit Array */
> +
> +/* Capabilities supported by the IOMMU, RIO_REG_CAP */
> +#define RIO_CAP_REVISION_MASK   0x00FF
> +#define RIO_CAP_STAGE_ONE      (1ULL << 8)
> +#define RIO_CAP_STAGE_TWO      (1ULL << 9)
> +#define RIO_CAP_MSI            (1ULL << 10)
> +#define RIO_CAP_MRIF           (1ULL << 11)
> +#define RIO_CAP_ATS            (1ULL << 12)
> +#define RIO_CAP_AMO            (1ULL << 13)
> +
> +/* Device directory table pointer */
> +#define RIO_DDTP_BUSY          (1ULL << 59)
> +
> +#define RIO_DDTP_MASK_PPN       0x00000FFFFFFFFFFFULL
> +#define RIO_DDTP_MASK_MODE      0xF000000000000000ULL
> +#define RIO_DDTE_MASK_PPN       0x00FFFFFFFFFFF000ULL
> +
> +/* Device directory mode values, within RIO_DDTP_MASK_MODE */
> +#define RIO_DDTP_MODE_OFF       0
> +#define RIO_DDTP_MODE_BARE      1
> +#define RIO_DDTP_MODE_3LVL      2
> +#define RIO_DDTP_MODE_2LVL      3
> +#define RIO_DDTP_MODE_1LVL      4
> +#define RIO_DDTP_MODE_MAX       RIO_DDTP_MODE_1LVL
> +
> +/* Command queue base register */
> +#define RIO_CQ_MASK_LOG2SZ      0x000000000000001FULL
> +#define RIO_CQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
> +
> +/* Command queue control and status register */
> +#define RIO_CQ_ENABLE          (1 << 0)
> +#define RIO_CQ_IRQ_ENABLE      (1 << 1)
> +#define RIO_CQ_FAULT           (1 << 8)
> +#define RIO_CQ_TIMEOUT         (1 << 9)
> +#define RIO_CQ_ERROR           (1 << 10)
> +#define RIO_CQ_ACTIVE          (1 << 16)
> +#define RIO_CQ_BUSY            (1 << 17)
> +
> +/* Fault queue base register */
> +#define RIO_FQ_MASK_LOG2SZ      0x000000000000001FULL
> +#define RIO_FQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
> +
> +/* Fault queue control and status register */
> +#define RIO_FQ_ENABLE          (1 << 0)
> +#define RIO_FQ_IRQ_ENABLE      (1 << 1)
> +#define RIO_FQ_FAULT           (1 << 8)
> +#define RIO_FQ_FULL            (1 << 9)
> +#define RIO_FQ_ACTIVE          (1 << 16)
> +#define RIO_FQ_BUSY            (1 << 17)
> +
> +/* Page request queue base register */
> +#define RIO_PQ_MASK_LOG2SZ      0x000000000000001FULL
> +#define RIO_PQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
> +
> +/* Page request queue control and status register */
> +#define RIO_PQ_ENABLE          (1 << 0)
> +#define RIO_PQ_IRQ_ENABLE      (1 << 1)
> +#define RIO_PQ_FAULT           (1 << 8)
> +#define RIO_PQ_FULL            (1 << 9)
> +#define RIO_PQ_ACTIVE          (1 << 16)
> +#define RIO_PQ_BUSY            (1 << 17)
> +
> +/* Interrupt Sources, used for IPSR and IVEC indexing. */
> +#define RIO_INT_CQ              0
> +#define RIO_INT_FQ              1
> +#define RIO_INT_PM              2
> +#define RIO_INT_PQ              3
> +#define RIO_INT_COUNT           4
> +
> +/* Device Context */
> +typedef struct RivosIOMMUDeviceContext {
> +    uint64_t  tc;          /* Translation Control */
> +    uint64_t  gatp;        /* IO Hypervisor Guest Address Translation */
> +    uint64_t  satp;        /* IO SATP or IO vSATP or PDTP */
> +    uint64_t  pscid;       /* Process soft-context ID */
> +    uint64_t  msiptp;      /* MSI Page Table Pointer (extended context) */
> +    uint64_t  msi_addr_mask;
> +    uint64_t  msi_addr_pattern;
> +    uint64_t  _reserved;
> +} RivosIOMMUDeviceContext;
> +
> +#define RIO_DCTC_VALID            (1ULL << 0)
> +#define RIO_DCTC_EN_ATS           (1ULL << 1)
> +#define RIO_DCTC_EN_PRI           (1ULL << 2)
> +#define RIO_DCTC_T2GPA            (1ULL << 3)
> +#define RIO_DCTC_DIS_TRANS_FAULT  (1ULL << 4)
> +#define RIO_DCTC_PDTV             (1ULL << 5)
> +
> +/* Shared MODE:ASID:PPN masks for GATP, SATP */
> +#define RIO_ATP_MASK_PPN           SATP64_PPN
> +#define RIO_ATP_MASK_GSCID         SATP64_ASID
> +#define RIO_ATP_MASK_MODE          SATP64_MODE
> +
> +#define RIO_ATP_MODE_SV32          VM_1_10_SV32
> +#define RIO_ATP_MODE_SV39          VM_1_10_SV39
> +#define RIO_ATP_MODE_SV48          VM_1_10_SV48
> +#define RIO_ATP_MODE_SV57          VM_1_10_SV57
> +#define RIO_ATP_MODE_BARE          VM_1_10_MBARE
> +
> +/* satp.mode when tc.RIO_DCTC_PDTV is set */
> +#define RIO_PDTP_MODE_BARE         0
> +#define RIO_PDTP_MODE_PD20         1
> +#define RIO_PDTP_MODE_PD17         2
> +#define RIO_PDTP_MODE_PD8          3
> +
> +#define RIO_DCMSI_VALID            1
> +#define RIO_DCMSI_MASK_PPN         0x0FFFFFFFFFFFFFFEULL
> +#define RIO_DCMSI_MASK_MODE        0xF000000000000000ULL
> +
> +#define RIO_DCMSI_MODE_BARE        0
> +#define RIO_DCMSI_MODE_FLAT        1
> +
> +/* I/O Management Unit Command format */
> +typedef struct RivosIOMMUCommand {
> +    uint64_t request;
> +    uint64_t address;
> +} RivosIOMMUCommand;
> +
> +/* RivosIOMMUCommand.request opcode and function mask */
> +#define RIO_CMD_MASK_FUN_OP        0x00000000000003FFULL
> +
> +/* opcode == IOTINVAL.* */
> +#define RIO_CMD_IOTINVAL_VMA       0x001
> +#define RIO_CMD_IOTINVAL_GVMA      0x081
> +#define RIO_CMD_IOTINVAL_MSI       0x101
> +
> +#define RIO_IOTINVAL_PSCID_VALID   0x0000000000000400ULL
> +#define RIO_IOTINVAL_ADDR_VALID    0x0000000000000800ULL
> +#define RIO_IOTINVAL_GSCID_VALID   0x0000000000001000ULL
> +#define RIO_IOTINVAL_ADDR_NAPOT    0x0000000000002000ULL
> +#define RIO_IOTINVAL_MASK_PSCID    0x0000000FFFFF0000ULL
> +#define RIO_IOTINVAL_MASK_GSCID    0x00FFFF0000000000ULL
> +
> +/* opcode == IODIR.* */
> +#define RIO_CMD_IODIR_INV_DDT      0x002
> +#define RIO_CMD_IODIR_PRE_DDT      0x082
> +#define RIO_CMD_IODIR_INV_PDT      0x102
> +#define RIO_CMD_IODIR_PRE_PDT      0x182
> +
> +#define RIO_IODIR_DID_VALID        0x0000000000000400ULL
> +#define RIO_IODIR_MASK_PID         0x0000000FFFFF0000ULL
> +#define RIO_IODIR_MASK_DID         0xFFFFFF0000000000ULL
> +
> +/* opcode == IOFENCE.* */
> +#define RIO_CMD_IOFENCE_C          0x003
> +
> +#define RIO_IOFENCE_PR             0x0000000000000400ULL
> +#define RIO_IOFENCE_PW             0x0000000000000800ULL
> +#define RIO_IOFENCE_AV             0x0000000000001000ULL
> +#define RIO_IOFENCE_MASK_DATA      0xFFFFFFFF00000000ULL
> +
> +/* opcode == ATS */
> +#define RIO_CMD_ATS_INVAL          0x004
> +#define RIO_CMD_ATS_PRGR           0x084
> +
> +/* Fault Queue element */
> +typedef struct RivosIOMMUEvent {
> +    uint64_t reason;
> +    uint64_t _rsrvd;
> +    uint64_t iova;
> +    uint64_t phys;
> +} RivosIOMMUEvent;
> +
> +/* Event reason */
> +#define RIO_EVENT_MASK_DID         0x0000000000FFFFFFULL
> +#define RIO_EVENT_MASK_PID         0x00000FFFFF000000ULL
> +#define RIO_EVENT_PV               0x0000100000000000ULL
> +#define RIO_EVENT_PRIV             0x0000200000000000ULL
> +#define RIO_EVENT_MASK_TTYP        0x000FC00000000000ULL
> +#define RIO_EVENT_MASK_CAUSE       0xFFF0000000000000ULL
> +
> +#define RIO_TTYP_NONE              0 /* Fault not caused by an inbound trx */
> +#define RIO_TTYP_URX               1 /* Untranslated read for execute trx */
> +#define RIO_TTYP_URD               2 /* Untranslated read transaction */
> +#define RIO_TTYP_UWR               3 /* Untranslated write/AMO transaction */
> +#define RIO_TTYP_TRX               4 /* Translated read for execute trx */
> +#define RIO_TTYP_TRD               5 /* Translated read transaction */
> +#define RIO_TTYP_TWR               6 /* Translated write/AMO transaction */
> +#define RIO_TTYP_ATS               7 /* PCIe ATS Translation Request */
> +#define RIO_TTYP_MRQ               8 /* Message Request */
> +
> +#define RIO_ERRC_I_ALIGN           0 /* Instruction address misaligned */
> +#define RIO_ERRC_I_FAULT           1 /* Instruction access fault */
> +#define RIO_ERRC_RD_ALIGN          4 /* Read address misaligned */
> +#define RIO_ERRC_RD_FAULT          5 /* Read access fault */
> +#define RIO_ERRC_WR_ALIGN          6 /* Write/AMO address misaligned */
> +#define RIO_ERRC_WR_FAULT          7 /* Write/AMO access fault */
> +#define RIO_ERRC_PGFAULT_I        12 /* Instruction page fault */
> +#define RIO_ERRC_PGFAULT_RD       13 /* Read page fault */
> +#define RIO_ERRC_PGFAULT_WR       15 /* Write/AMO page fault */
> +#define RIO_ERRC_GPGFAULT_I       20 /* Instruction guest page fault */
> +#define RIO_ERRC_GPGFAULT_RD      21 /* Read guest-page fault */
> +#define RIO_ERRC_GPGFAULT_WR      23 /* Write/AMO guest-page fault */
> +#define RIO_ERRC_DMA_DISABLED    256 /* Inbound transactions disallowed */
> +#define RIO_ERRC_DDT_FAULT       257 /* DDT entry load access fault */
> +#define RIO_ERRC_DDT_INVALID     258 /* DDT entry not valid */
> +#define RIO_ERRC_DDT_UNSUPPORTED 259 /* DDT entry misconfigured */
> +#define RIO_ERRC_REQ_INVALID     260 /* Transaction type disallowed */
> +#define RIO_ERRC_PDT_FAULT       261 /* PDT entry load access fault. */
> +#define RIO_ERRC_PDT_INVALID     262 /* PDT entry not valid */
> +#define RIO_ERRC_PDT_UNSUPPORTED 263 /* PDT entry misconfigured */
> +#define RIO_ERRC_MSI_FAULT       264 /* MSI PTE load access fault */
> +#define RIO_ERRC_MSI_INVALID     265 /* MSI PTE not valid */
> +#define RIO_ERRC_MRIF_FAULT      266 /* MRIF access fault */
> +
> +
> +/*
> + * Rivos Inc. I/O Management Unit PCIe Device Emulation
> + */
> +
> +#ifndef PCI_VENDOR_ID_RIVOS
> +#define PCI_VENDOR_ID_RIVOS           0x1efd
> +#endif
> +
> +#ifndef PCI_DEVICE_ID_RIVOS_IOMMU
> +#define PCI_DEVICE_ID_RIVOS_IOMMU     0x8001
> +#endif
> +
> +/* Programming interface revision */
> +#define RIO_CAP_REVISION              0x0002
> +
> +#define RIO_REG_MMIO_SIZE             0x0300
> +
> +#define RIO_ERR_NONE                  0
> +#define RIO_ERR_ANY                   1
> +
> +#define RIO_ERR(cause)                \
> +    (RIO_ERR_ANY | (((cause) & 0x0fff) << 16))
> +
> +#define RIO_ERR_IO(cause, ttyp)       \
> +    (RIO_ERR_ANY | (((cause) & 0x0fff) << 16) | (((ttyp) & 0x3f) << 8))
> +
> +#define RIO_ERR_CAUSE(err)            (((err) >> 16) & 0xfff)
> +#define RIO_ERR_TTYP(err)             (((err) >> 8) & 0x3f)

You could probably move this to a header file.

Also have a look at the REG32/REG64 and FIELD macros, they are a nice
way to define registers.

> +
> +
> +/* IO virtual address space wrapper for attached PCI devices */
> +struct RivosIOMMUSpace {
> +    IOMMUMemoryRegion             mr;
> +    AddressSpace                  as;
> +    RivosIOMMUState              *iommu;
> +    RivosIOMMUDeviceContext       dc;
> +    bool                          dc_valid;
> +    uint32_t                      devid;
> +    QLIST_ENTRY(RivosIOMMUSpace)  list;
> +};
> +
> +
> +static uint32_t rivos_iommu_reg_mod(RivosIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_mutex_lock(&s->core_lock);
> +    val = ldl_le_p(&s->regs_rw[idx]);
> +    stl_le_p(&s->regs_rw[idx], set | (val & ~clr));
> +    qemu_mutex_unlock(&s->core_lock);
> +    return val;
> +}
> +
> +static unsigned rivos_iommu_irq_vector(RivosIOMMUState *s, int source)
> +{
> +    const uint32_t ivec = ldl_le_p(&s->regs_rw[RIO_REG_IVEC]);
> +    return (ivec >> (source * 4)) & 0x0F;
> +}
> +
> +static void rivos_iommu_irq_use(RivosIOMMUState *s, int source)
> +{
> +    msix_vector_use(&(s->pci), rivos_iommu_irq_vector(s, source));
> +}
> +
> +static void rivos_iommu_irq_unuse(RivosIOMMUState *s, int source)
> +{
> +    msix_vector_unuse(&(s->pci), rivos_iommu_irq_vector(s, source));
> +}
> +
> +static void rivos_iommu_irq_assert(RivosIOMMUState *s, int source)
> +{
> +    uint32_t ipsr = rivos_iommu_reg_mod(s, RIO_REG_IPSR, (1 << source), 0);
> +
> +    if (!(ipsr & (1 << source)) && msix_enabled(&(s->pci))) {
> +        const unsigned vector = rivos_iommu_irq_vector(s, source);
> +        msix_notify(&(s->pci), vector);
> +    }
> +}
> +
> +static void rivos_iommu_fault_iova(RivosIOMMUSpace *as, int err, hwaddr iova,
> +    hwaddr gpa)
> +{
> +    RivosIOMMUState *s = as->iommu;
> +    RivosIOMMUEvent ev;
> +    MemTxResult res;
> +    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
> +    uint32_t head = ldl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD]) & s->fq_mask;
> +    uint32_t next = (s->fq_tail + 1) & s->fq_mask;
> +    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]);
> +    uint32_t ctrl_err = 0;
> +
> +    ev.reason = as->devid;
> +    ev.reason = set_field(ev.reason, RIO_EVENT_MASK_CAUSE, RIO_ERR_CAUSE(err));
> +    ev.reason = set_field(ev.reason, RIO_EVENT_MASK_TTYP, RIO_ERR_TTYP(err));
> +    ev.iova = iova;
> +    ev.phys = gpa;
> +
> +    trace_rivos_iommu_flt(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
> +                          PCI_FUNC(as->devid), RIO_ERR_CAUSE(err), iova);
> +
> +    if (!(ctrl & RIO_FQ_ACTIVE) || !!(ctrl & (RIO_FQ_FULL | RIO_FQ_FAULT))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        ctrl_err = RIO_FQ_FULL;
> +    } else {
> +        dma_addr_t addr = s->fq_base + s->fq_tail * sizeof(RivosIOMMUEvent);
> +        res = dma_memory_write(&address_space_memory, addr, &ev, sizeof(ev),
> +                               ma);
> +        if (res != MEMTX_OK) {
> +            ctrl_err = RIO_FQ_FAULT;
> +        } else {
> +            s->fq_tail = next;
> +        }
> +    }
> +
> +    stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail);
> +
> +    if (ctrl_err) {
> +        rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, ctrl_err, 0);
> +    }
> +
> +    if (ctrl & RIO_FQ_IRQ_ENABLE) {
> +        rivos_iommu_irq_assert(s, RIO_INT_FQ);
> +    }
> +}
> +
> +static void rivos_iommu_fault(RivosIOMMUSpace *as, int cause)
> +{
> +    rivos_iommu_fault_iova(as, cause, 0, 0);
> +}
> +
> +
> +/* Risc-V IOMMU Page Table walker.
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.

I don't love this, it would probably be worth trying to reuse what we can.

Also we probably don't need the RIO_ERR() on every return type, can't
we just return the int and then format it later. That should help with
re-using existing page table code

> + *
> + * Returns RIO_ERR_ with fault code.
> + */
> +static int rivos_iommu_fetch_pa(RivosIOMMUSpace *as,
> +    hwaddr addr, hwaddr *physical, uint64_t gatp, uint64_t satp,
> +    bool first_stage, IOMMUAccessFlags access)
> +{
> +    MemTxResult res;
> +    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
> +    hwaddr base;
> +    int i, levels, ptidxbits, ptshift, ptesize, mode, widened;
> +    uint64_t atp = first_stage ? satp : gatp;
> +
> +    base = (hwaddr) get_field(atp, RIO_ATP_MASK_PPN) << PGSHIFT;
> +    mode = get_field(atp, RIO_ATP_MASK_MODE);
> +
> +    switch (mode) {
> +    case RIO_ATP_MODE_SV32:
> +        levels = 2;
> +        ptidxbits = 10;
> +        ptesize = 4;
> +        break;
> +    case RIO_ATP_MODE_SV39:
> +        levels = 3;
> +        ptidxbits = 9;
> +        ptesize = 8;
> +        break;
> +    case RIO_ATP_MODE_SV48:
> +        levels = 4;
> +        ptidxbits = 9;
> +        ptesize = 8;
> +        break;
> +    case RIO_ATP_MODE_SV57:
> +        levels = 5;
> +        ptidxbits = 9;
> +        ptesize = 8;
> +        break;
> +    case RIO_ATP_MODE_BARE:
> +        if (first_stage) {
> +            return rivos_iommu_fetch_pa(as, addr, physical,
> +                                        gatp, satp, false, access);
> +        }
> +        *physical = addr;
> +        return RIO_ERR_NONE;
> +    default:
> +        return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED);
> +    }
> +
> +    widened = first_stage ? 0 : 2;
> +    ptshift = (levels - 1) * ptidxbits;
> +
> +    /* zero extended address range check */
> +    int va_bits = PGSHIFT + levels * ptidxbits + widened;
> +    uint64_t va_mask = (1ULL << va_bits) - 1;
> +    if ((addr & va_mask) != addr) {
> +        return RIO_ERR(RIO_ERRC_DMA_DISABLED);
> +    }
> +
> +    for (i = 0; i < levels; i++, ptshift -= ptidxbits) {
> +        target_ulong pte;
> +        hwaddr pte_addr;
> +        target_ulong idx;
> +
> +        idx = (addr >> (PGSHIFT + ptshift)) & ((1 << (ptidxbits + widened))-1);
> +        pte_addr = base + idx * ptesize;
> +        widened = 0;
> +
> +        if (ptesize == 4) {
> +            pte = address_space_ldl(&address_space_memory, pte_addr, ma, &res);
> +        } else {
> +            pte = address_space_ldq(&address_space_memory, pte_addr, ma, &res);
> +        }
> +
> +        if (res != MEMTX_OK) {
> +            return RIO_ERR(RIO_ERRC_PDT_FAULT);
> +        }
> +
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            /* Invalid PTE */
> +            return RIO_ERR(RIO_ERRC_PDT_INVALID);
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            /* Inner PTE, continue walking */
> +            base = ppn << PGSHIFT;
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            /* Reserved leaf PTE flags: PTE_W */
> +            return RIO_ERR(RIO_ERRC_PDT_INVALID);
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +            return RIO_ERR(RIO_ERRC_PDT_INVALID);
> +        } else if (ppn & ((1ULL << ptshift) - 1)) {
> +            /* Misaligned PPN */
> +            return RIO_ERR(RIO_ERRC_PDT_INVALID);
> +        } else if ((access & IOMMU_RO) && !(pte & PTE_R)) {
> +            /* Read access check failed */
> +            return first_stage ? RIO_ERR(RIO_ERRC_GPGFAULT_RD)
> +                               : RIO_ERR(RIO_ERRC_PGFAULT_RD);
> +        } else if ((access & IOMMU_WO) && !(pte & PTE_W)) {
> +            /* Write access check failed */
> +            return first_stage ? RIO_ERR(RIO_ERRC_GPGFAULT_WR)
> +                               : RIO_ERR(RIO_ERRC_PGFAULT_WR);
> +        } else {
> +            /* Leaf PTE, update base to translated address. */
> +            target_ulong vpn = addr >> PGSHIFT;
> +            base = ((ppn | (vpn & ((1L << ptshift) - 1))) << PGSHIFT) |
> +                    (addr & ~TARGET_PAGE_MASK);
> +        }
> +
> +        /* Do the second stage translation if enabled. */
> +        if (first_stage) {
> +            hwaddr spa;
> +
> +            int ret = rivos_iommu_fetch_pa(as, base, &spa,
> +                                           gatp, satp, false, access);
> +
> +            /* Report back GPA causing second stage translation fault. */
> +            if (ret) {
> +                *physical = base;
> +                return ret;
> +            }
> +
> +            base = spa;
> +        }
> +
> +        if (pte & (PTE_R | PTE_W | PTE_X)) {
> +            /* Leaf PTE, return translated address */
> +            *physical = base;
> +            return RIO_ERR_NONE;
> +        }
> +    }
> +    return RIO_ERR(RIO_ERRC_PDT_INVALID);
> +}
> +
> +/* Risc-V IOMMU Device Directory Tree walker.
> + *
> + * Returns RIO_ERR_ with fault code.
> + */
> +static int rivos_iommu_fetch_dc(RivosIOMMUState *iommu, uint32_t devid,
> +    RivosIOMMUDeviceContext *dc)
> +{
> +    MemTxResult res;
> +    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
> +    hwaddr addr;
> +    const bool dcbase = !iommu->enable_msi;
> +    const size_t dcsize = sizeof(*dc) >> dcbase;
> +    unsigned int depth = RIO_DDTP_MODE_1LVL - iommu->ddt_mode;
> +
> +    if (depth > 2) {
> +        return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED);
> +    }
> +
> +    /* Check supported device id range. */
> +    if (devid >= (1 << (depth * 9 + 6 + (dcbase && depth != 2)))) {
> +        return RIO_ERR(RIO_ERRC_DDT_INVALID);
> +    }
> +
> +    for (addr = iommu->ddt_base; depth-- > 0; ) {
> +        const int split = depth * 9 + 6 + dcbase;
> +        addr |= ((devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        uint64_t dde = address_space_ldq(&address_space_memory, addr, ma, &res);
> +        if (res != MEMTX_OK) {
> +            return RIO_ERR(RIO_ERRC_DDT_FAULT);
> +        }
> +        if (!(dde & RIO_DCTC_VALID)) {
> +            return RIO_ERR(RIO_ERRC_DDT_INVALID);
> +        }
> +        addr = dde & RIO_DDTE_MASK_PPN;
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (devid * dcsize) & ~TARGET_PAGE_MASK;
> +
> +    memset(dc, 0, sizeof(*dc));
> +    res = dma_memory_read(&address_space_memory, addr, dc, dcsize, ma);
> +
> +    if (res != MEMTX_OK) {
> +        return RIO_ERR(RIO_ERRC_DDT_FAULT);
> +    }
> +
> +    if (!(dc->tc & RIO_DCTC_VALID)) {
> +        return RIO_ERR(RIO_ERRC_DDT_INVALID);
> +    }
> +
> +    return RIO_ERR_NONE;
> +}
> +
> +static void rivos_iommu_translate_tlb(RivosIOMMUSpace *as,
> +    IOMMUAccessFlags flag, IOMMUTLBEntry *tlb)
> +{
> +    RivosIOMMUState *iommu = as->iommu;
> +
> +    if (!as->dc_valid) {
> +        /* Fetch device context if not cached. */
> +        int ret = rivos_iommu_fetch_dc(iommu, as->devid, &as->dc);
> +        if (ret != RIO_ERR_NONE) {
> +            rivos_iommu_fault(as, ret);
> +            return;
> +        } else {
> +            as->dc_valid = true;
> +        }
> +    }
> +
> +    /* MSI window */
> +    if (!(((tlb->iova >> PGSHIFT) ^ as->dc.msi_addr_pattern) &
> +        ~as->dc.msi_addr_mask)) {
> +        if (flag != IOMMU_WO) {
> +            /* only writes are allowed. */
> +            rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT),
> +                                   tlb->iova, 0);
> +            return;
> +        }
> +        if (tlb->iova & ~TARGET_PAGE_MASK) {
> +            /* unaligned access. */
> +            rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT),
> +                                   tlb->iova, 0);
> +            return;
> +        }
> +        if (!(as->dc.msiptp & RIO_DCMSI_VALID)) {
> +            /* MSI remapping not enabled */
> +            rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_INVALID));
> +            return;
> +        }
> +        int mode = get_field(as->dc.msiptp, RIO_DCMSI_MASK_MODE);
> +        switch (mode) {
> +            case RIO_DCMSI_MODE_BARE:
> +                tlb->translated_addr = tlb->iova;
> +                tlb->addr_mask = ((1ULL << PGSHIFT) - 1);
> +                tlb->perm = flag;
> +                break;
> +
> +            case RIO_DCMSI_MODE_FLAT:
> +                /* TODO: not implemented, follow AIA section 9.5 */
> +                rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
> +                return;
> +
> +            default:
> +                rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
> +                return;
> +        }
> +
> +        return;
> +    }
> +
> +    /* Lookup SATP */
> +    if (as->dc.tc & RIO_DCTC_PDTV) {
> +        /* Process directory tree is not supported yet. */
> +        rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_PDT_UNSUPPORTED));
> +        return;
> +    }
> +
> +    /* Lookup IOATC */
> +    /* TODO: merge in IOATC PoC */
> +
> +    /* Memory access */
> +    hwaddr physical;
> +    int err = rivos_iommu_fetch_pa(as, tlb->iova, &physical,
> +                                   as->dc.gatp, as->dc.satp,
> +                                   iommu->enable_stage_one, flag);
> +    if (err == RIO_ERR_NONE) {
> +        tlb->translated_addr = physical;
> +        tlb->addr_mask = ((1ULL << PGSHIFT) - 1);
> +        tlb->perm = flag;
> +    } else if (!(as->dc.tc & RIO_DCTC_DIS_TRANS_FAULT)) {
> +        const int fault = RIO_ERR_IO(RIO_ERR_CAUSE(err),
> +            flag == IOMMU_WO ? RIO_TTYP_UWR : RIO_TTYP_URD);
> +        rivos_iommu_fault_iova(as, fault, tlb->iova, physical);
> +    }
> +
> +    return;
> +}
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/*
> + * Called from RCU critical section.
> + *
> + * IOMMUMemoryRegionClass::translate callback: map @addr for the device
> + * behind @iommu_mr according to the current device directory table mode.
> + * On any failure the returned entry keeps .perm == IOMMU_NONE and a fault
> + * record is queued via rivos_iommu_fault{,_iova}().
> + *
> + * NOTE(review): as->iommu->ddt_mode is read here without core_lock while
> + * rivos_iommu_process_ddtp() updates it from the worker thread -- presumably
> + * benign for a word-sized read, but confirm the intended ordering.
> + */
> +static IOMMUTLBEntry rivos_iommu_translate(IOMMUMemoryRegion *iommu_mr,
> +    hwaddr addr, IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RivosIOMMUSpace *as = container_of(iommu_mr, RivosIOMMUSpace, mr);
> +    const uint32_t ddt_mode = as->iommu->ddt_mode;
> +    IOMMUTLBEntry tlb = {
> +        .iova = addr,
> +        .target_as = &address_space_memory,
> +        .perm = IOMMU_NONE,
> +    };
> +
> +    switch (ddt_mode) {
> +        case RIO_DDTP_MODE_OFF:
> +            /* All translations disabled, power-on state. */
> +            rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_DMA_DISABLED),
> +                                   tlb.iova, 0);
> +            break;
> +
> +        case RIO_DDTP_MODE_BARE:
> +            /* Global passthrough mode enabled for all devices. */
> +            tlb.translated_addr = tlb.iova;
> +            tlb.addr_mask = ~0ULL;
> +            tlb.perm = flag;
> +            break;
> +
> +        case RIO_DDTP_MODE_3LVL:
> +        case RIO_DDTP_MODE_2LVL:
> +        case RIO_DDTP_MODE_1LVL:
> +            /* Translate using device directory information. */
> +            rivos_iommu_translate_tlb(as, flag, &tlb);
> +            break;
> +
> +        default:
> +            /* Invalid device directory tree mode, should never happen. */
> +            rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
> +            break;
> +    }
> +
> +    trace_rivos_iommu_dma(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
> +        PCI_FUNC(as->devid), IOMMU_FLAG_STR[tlb.perm & IOMMU_RW],
> +        tlb.iova, tlb.translated_addr);
> +
> +    return tlb;
> +}
> +
> +/*
> + * IODIR.INVAL_DDT: invalidate cached device contexts.  Marks every known
> + * address space (when @all is set) or just the one matching @devid so that
> + * the device context is re-fetched on the next translation.
> + */
> +static void rivos_iommu_iodir_inval_ddt(RivosIOMMUState *s, bool all,
> +    uint32_t devid)
> +{
> +    RivosIOMMUSpace *as;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (all || (as->devid == devid)) {
> +            as->dc_valid = false;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +}
> +
> +/*
> + * Complete an IOFENCE.C command: when the AV flag was set, store the 32-bit
> + * completion @data to guest memory at @addr; a failed store is reported as
> + * a command-queue memory fault (RIO_CQ_FAULT).
> + *
> + * NOTE(review): @data is written in host byte order via dma_memory_write(),
> + * while register state elsewhere uses the *_le_p helpers -- confirm the
> + * in-memory completion record is meant to be little-endian on BE hosts.
> + */
> +static void rivos_iommu_iofence(RivosIOMMUState *s, bool av, uint64_t addr,
> +    uint32_t data)
> +{
> +    MemTxResult res;
> +    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
> +
> +    if (av) {
> +        res = dma_memory_write(&address_space_memory, addr, &data, sizeof(data),
> +                               ma);
> +        if (res != MEMTX_OK) {
> +            rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, RIO_CQ_FAULT, 0);
> +        }
> +    }
> +}
> +
> +/*
> + * IOMMUMemoryRegionClass::notify_flag_changed callback: reject registration
> + * of device-IOTLB (ATS) unmap notifiers, which this model does not support.
> + */
> +static int rivos_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
> +    IOMMUNotifierFlag old, IOMMUNotifierFlag new, Error **errp)
> +{
> +    if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
> +        error_setg(errp, "rivos-iommu does not support dev-iotlb");
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Drain the command queue: execute commands from cq_head up to the tail
> + * index last written by the guest.  Processing stops at the first command
> + * that faults (memory error) or is illegal, leaving CQ_HEAD pointing at the
> + * offending entry and raising RIO_CQ_FAULT / RIO_CQ_ERROR in CQ_CONTROL.
> + * Runs on the core worker thread, outside core_lock.
> + */
> +static void rivos_iommu_process_cq_tail(RivosIOMMUState *s)
> +{
> +    RivosIOMMUCommand cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
> +    uint32_t tail;
> +    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]);
> +    uint32_t bdf = pci_get_bdf(&s->pci);
> +    uint32_t err = 0;
> +
> +    /* Fetch latest tail position and clear busy marker */
> +    s->cq_tail_db = false;
> +    tail = s->cq_mask & ldl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL]);
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RIO_CQ_ACTIVE) || !!(ctrl & (RIO_CQ_ERROR | RIO_CQ_FAULT)))
> +    {
> +        return;
> +    }
> +
> +    while (tail != s->cq_head) {
> +        /* Fetch the next command descriptor from the guest ring. */
> +        addr = s->cq_base  + s->cq_head * sizeof(cmd);
> +        res = dma_memory_read(&address_space_memory, addr, &cmd, sizeof(cmd),
> +                              ma);
> +
> +        if (res != MEMTX_OK) {
> +            err = RIO_CQ_FAULT;
> +            break;
> +        }
> +
> +        trace_rivos_iommu_cmd(PCI_BUS_NUM(bdf), PCI_SLOT(bdf),
> +                              PCI_FUNC(bdf), cmd.request, cmd.address);
> +
> +        int fun_op = get_field(cmd.request, RIO_CMD_MASK_FUN_OP);
> +
> +        switch(fun_op) {
> +            case RIO_CMD_IOFENCE_C:
> +                rivos_iommu_iofence(s, !!(cmd.request & RIO_IOFENCE_AV),
> +                    cmd.address,
> +                    get_field(cmd.request, RIO_IOFENCE_MASK_DATA));
> +                break;
> +
> +            case RIO_CMD_IOTINVAL_GVMA:
> +                /* IOTLB not implemented */
> +                break;
> +
> +            case RIO_CMD_IOTINVAL_MSI:
> +                /* IOTLB not implemented */
> +                break;
> +
> +            case RIO_CMD_IOTINVAL_VMA:
> +                /* IOTLB not implemented */
> +                break;
> +
> +            case RIO_CMD_IODIR_INV_DDT:
> +                rivos_iommu_iodir_inval_ddt(s,
> +                        !(cmd.request & RIO_IODIR_DID_VALID),
> +                        get_field(cmd.request, RIO_IODIR_MASK_DID));
> +                break;
> +
> +            case RIO_CMD_IODIR_INV_PDT:
> +                /* PDT invalidate not implemented. */
> +                break;
> +
> +            case RIO_CMD_IODIR_PRE_DDT:
> +                /* DDT pre-fetching not implemented. */
> +                break;
> +
> +            case RIO_CMD_IODIR_PRE_PDT:
> +                /* PDT pre-fetching not implemented. */
> +                break;
> +
> +            default:
> +                err = RIO_CQ_ERROR;
> +                break;
> +        }
> +
> +        /* Invalid instruction, keep cq_head at failed instruction index. */
> +        if (err) {
> +            break;
> +        }
> +
> +        s->cq_head = (s->cq_head + 1) & s->cq_mask;
> +    }
> +
> +    stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head);
> +
> +    if (err) {
> +        rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, err, 0);
> +    }
> +
> +    /* NOTE(review): the CQ interrupt is asserted whenever it is enabled,
> +     * even if no command was consumed above -- confirm this is intended. */
> +    if (ctrl & RIO_CQ_IRQ_ENABLE) {
> +        rivos_iommu_irq_assert(s, RIO_INT_CQ);
> +    }
> +}
> +
> +/*
> + * Apply a guest write to DDTP: accept only the legal MODE transitions
> + * listed below, cache ddt_base/ddt_mode on success, and complete the
> + * handshake by storing the (possibly reverted) value back with the BUSY
> + * bit cleared.
> + */
> +static void rivos_iommu_process_ddtp(RivosIOMMUState *s)
> +{
> +    uint64_t base = ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & ~RIO_DDTP_BUSY;
> +    uint32_t mode = get_field(base, RIO_DDTP_MASK_MODE);
> +    bool ok;
> +
> +    /* Allowed DDTP.MODE transitions:
> +     * {OFF, BARE} -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +
> +    if (s->ddt_mode == mode) {
> +        ok = true;
> +    } else if (s->ddt_mode == RIO_DDTP_MODE_OFF ||
> +               s->ddt_mode == RIO_DDTP_MODE_BARE) {
> +        ok = mode == RIO_DDTP_MODE_1LVL ||
> +             mode == RIO_DDTP_MODE_2LVL ||
> +             mode == RIO_DDTP_MODE_3LVL;
> +    } else {
> +        ok = mode == RIO_DDTP_MODE_OFF ||
> +             mode == RIO_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        s->ddt_base = get_field(base, RIO_DDTP_MASK_PPN) << PGSHIFT;
> +        s->ddt_mode = mode;
> +    } else {
> +        /* report back last valid mode and device directory table pointer. */
> +        base = s->ddt_base >> PGSHIFT;
> +        base = set_field(base, RIO_DDTP_MASK_MODE, s->ddt_mode);
> +    }
> +
> +    stq_le_p(&s->regs_rw[RIO_REG_DDTP], base);
> +}
> +
> +/*
> + * Handle a guest write to CQ_CONTROL: on enable, latch the ring geometry
> + * from CQ_BASE, reset head/tail and unmask the CQ interrupt vector; on
> + * disable, tear the ring state down.  Always clears the BUSY bit last.
> + *
> + * NOTE(review): `enable`/`active` are tested with RIO_FQ_ENABLE and
> + * RIO_FQ_ACTIVE rather than the RIO_CQ_* equivalents -- this looks like a
> + * copy/paste slip from rivos_iommu_process_fq_control() and is only
> + * harmless if the bit positions coincide.  Confirm against the register
> + * layout definitions.
> + */
> +static void rivos_iommu_process_cq_control(RivosIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RIO_FQ_ENABLE);
> +    bool active = !!(ctrl_set & RIO_FQ_ACTIVE);
> +
> +    if (enable && !active) {
> +        base = ldq_le_p(&s->regs_rw[RIO_REG_CQ_BASE]);
> +        s->cq_mask = (2ULL << get_field(base, RIO_CQ_MASK_LOG2SZ)) - 1;
> +        s->cq_base = get_field(base, RIO_CQ_MASK_PPN) << PGSHIFT;
> +        s->cq_head = 0;
> +        rivos_iommu_irq_use(s, RIO_INT_CQ);
> +        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head);
> +        stl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL], s->cq_head);
> +        ctrl_set = RIO_CQ_ACTIVE;
> +        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_FAULT | RIO_CQ_ERROR | RIO_CQ_TIMEOUT;
> +    } else if (!enable && active) {
> +        rivos_iommu_irq_unuse(s, RIO_INT_CQ);
> +        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_ACTIVE;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_CQ_BUSY;
> +    }
> +
> +    rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, ctrl_set, ctrl_clr);
> +}
> +
> +/*
> + * Handle a guest write to FQ_CONTROL: activate or deactivate the fault
> + * queue ring and clear the BUSY bit.  Mirrors the CQ/PQ control handlers.
> + */
> +static void rivos_iommu_process_fq_control(RivosIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RIO_FQ_ENABLE);
> +    bool active = !!(ctrl_set & RIO_FQ_ACTIVE);
> +
> +    if (enable && !active) {
> +        base = ldq_le_p(&s->regs_rw[RIO_REG_FQ_BASE]);
> +        s->fq_mask = (2ULL << get_field(base, RIO_FQ_MASK_LOG2SZ)) - 1;
> +        s->fq_base = get_field(base, RIO_FQ_MASK_PPN) << PGSHIFT;
> +        s->fq_tail = 0;
> +        rivos_iommu_irq_use(s, RIO_INT_FQ);
> +        stl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD], s->fq_tail);
> +        stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail);
> +        stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~s->fq_mask);
> +        ctrl_set = RIO_FQ_ACTIVE;
> +        ctrl_clr = RIO_FQ_BUSY | RIO_FQ_FAULT | RIO_FQ_FULL;
> +    } else if (!enable && active) {
> +        rivos_iommu_irq_unuse(s, RIO_INT_FQ);
> +        stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_FQ_BUSY | RIO_FQ_ACTIVE;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_FQ_BUSY;
> +    }
> +
> +    rivos_iommu_reg_mod(s, RIO_REG_FQ_CONTROL, ctrl_set, ctrl_clr);
> +}
> +
> +/*
> + * Handle a guest write to PQ_CONTROL: activate or deactivate the page
> + * request queue ring and clear the BUSY bit.  Mirrors the CQ/FQ handlers.
> + */
> +static void rivos_iommu_process_pq_control(RivosIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RIO_PQ_ENABLE);
> +    bool active = !!(ctrl_set & RIO_PQ_ACTIVE);
> +
> +    if (enable && !active) {
> +        base = ldq_le_p(&s->regs_rw[RIO_REG_PQ_BASE]);
> +        s->pq_mask = (2ULL << get_field(base, RIO_PQ_MASK_LOG2SZ)) - 1;
> +        s->pq_base = get_field(base, RIO_PQ_MASK_PPN) << PGSHIFT;
> +        s->pq_tail = 0;
> +        rivos_iommu_irq_use(s, RIO_INT_PQ);
> +        stl_le_p(&s->regs_rw[RIO_REG_PQ_HEAD], s->pq_tail);
> +        stl_le_p(&s->regs_rw[RIO_REG_PQ_TAIL], s->pq_tail);
> +        stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~s->pq_mask);
> +        ctrl_set = RIO_PQ_ACTIVE;
> +        ctrl_clr = RIO_PQ_BUSY | RIO_PQ_FAULT | RIO_PQ_FULL;
> +    } else if (!enable && active) {
> +        rivos_iommu_irq_unuse(s, RIO_INT_PQ);
> +        stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_PQ_BUSY | RIO_PQ_ACTIVE;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RIO_PQ_BUSY;
> +    }
> +
> +    rivos_iommu_reg_mod(s, RIO_REG_PQ_CONTROL, ctrl_set, ctrl_clr);
> +}
> +
> +/*
> + * Core worker thread body.  Sleeps on core_cond and services, one request
> + * at a time, whichever condition is pending: a CQ doorbell, or a BUSY bit
> + * set in one of the CQ/FQ/PQ control registers or DDTP.  core_lock is
> + * dropped while a request is processed so MMIO writes are never blocked
> + * behind queue processing; all conditions are re-evaluated after the lock
> + * is re-acquired, so no wakeup is lost.  Exits when core_stop is set.
> + */
> +static void *rivos_iommu_core_proc(void* arg)
> +{
> +    RivosIOMMUState *s = arg;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    while (!s->core_stop) {
> +        if (s->cq_tail_db) {
> +            qemu_mutex_unlock(&s->core_lock);
> +            rivos_iommu_process_cq_tail(s);
> +        } else if (ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]) & RIO_CQ_BUSY) {
> +            qemu_mutex_unlock(&s->core_lock);
> +            rivos_iommu_process_cq_control(s);
> +        } else if (ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]) & RIO_FQ_BUSY) {
> +            qemu_mutex_unlock(&s->core_lock);
> +            rivos_iommu_process_fq_control(s);
> +        } else if (ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]) & RIO_PQ_BUSY) {
> +            qemu_mutex_unlock(&s->core_lock);
> +            rivos_iommu_process_pq_control(s);
> +        } else if (ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & RIO_DDTP_BUSY) {
> +            qemu_mutex_unlock(&s->core_lock);
> +            rivos_iommu_process_ddtp(s);
> +        } else {
> +            qemu_cond_wait(&s->core_cond, &s->core_lock);
> +            continue;
> +        }
> +        qemu_mutex_lock(&s->core_lock);
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    return NULL;
> +}
> +
> +/*
> + * Guest MMIO write handler.  Register semantics are implemented with three
> + * shadow arrays: regs_ro masks off read-only bits, regs_wc implements
> + * write-1-to-clear bits, and regs_rw holds the current state.  Writes that
> + * require deferred processing OR a BUSY bit into the stored value and wake
> + * the core worker thread.
> + */
> +static void rivos_iommu_mmio_write(void *opaque, hwaddr addr, uint64_t val,
> +                             unsigned size)
> +{
> +    RivosIOMMUState *s = opaque;
> +    uint64_t busy = 0;
> +    bool wakeup = true;
> +
> +    if (addr + size > sizeof(s->regs_rw)) {
> +        /* unsupported MMIO access location */
> +        return;
> +    }
> +
> +    /* actionable MMIO write. */
> +    switch (addr) {
> +        case RIO_REG_DDTP:
> +            busy = RIO_DDTP_BUSY;
> +            break;
> +
> +        /* upper half DDTP update */
> +        case RIO_REG_DDTP + 4:
> +            busy = RIO_DDTP_BUSY >> 32;
> +            break;
> +
> +        case RIO_REG_CQ_TAIL:
> +            s->cq_tail_db = true;
> +            break;
> +
> +        case RIO_REG_CQ_CONTROL:
> +            busy = RIO_CQ_BUSY;
> +            break;
> +
> +        case RIO_REG_FQ_CONTROL:
> +            busy = RIO_FQ_BUSY;
> +            break;
> +
> +        case RIO_REG_PQ_CONTROL:
> +            busy = RIO_PQ_BUSY;
> +            break;
> +
> +        default:
> +            wakeup = false;
> +            break;
> +    }
> +
> +    /* NOTE(review): `busy` is only folded into the stored value on the
> +     * 4- and 8-byte paths below; 1/2-byte writes to the control registers
> +     * would bypass the BUSY handshake -- confirm sub-word access to those
> +     * registers is meant to be allowed. */
> +    qemu_mutex_lock(&s->core_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (val & ~ro)) & ~(val & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]) | busy;
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]) | busy;
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
> +    }
> +
> +    /* wakeup core processing thread */
> +    if (wakeup) {
> +        qemu_cond_signal(&s->core_cond);
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +}
> +
> +/*
> + * Guest MMIO read handler: load from the little-endian regs_rw shadow.
> + * Out-of-range accesses return all-ones.
> + * NOTE(review): reads are not serialized against the core worker thread's
> + * register updates -- presumably tolerable for word-sized loads; confirm.
> + */
> +static uint64_t rivos_iommu_mmio_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    RivosIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +
> +    if (addr + size > sizeof(s->regs_rw)) {
> +        return (uint64_t)-1;
> +    } else if (size == 1) {
> +        val = (uint64_t) s->regs_rw[addr];
> +    } else if (size == 2) {
> +        val = lduw_le_p(&s->regs_rw[addr]);
> +    } else if (size == 4) {
> +        val = ldl_le_p(&s->regs_rw[addr]);
> +    } else if (size == 8) {
> +        val = ldq_le_p(&s->regs_rw[addr]);

Is there a reason you are using these byte swapping helper functions?

Alistair

> +    }
> +
> +    return val;
> +}
> +
> +/* Register-file MMIO region ops: little-endian, 1..8 byte aligned access. */
> +static const MemoryRegionOps rivos_iommu_mmio_ops = {
> +    .read = rivos_iommu_mmio_read,
> +    .write = rivos_iommu_mmio_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * pci_setup_iommu() callback: return (creating on first use) the IOMMU
> + * address space for the device at @devfn on @bus.  DMA issued by the IOMMU
> + * device itself bypasses translation.
> + *
> + * NOTE(review): the lookup and the insertion are covered by two separate
> + * core_lock critical sections, so two concurrent first lookups for the same
> + * devid could insert duplicate entries -- presumably callers hold the BQL;
> + * confirm.
> + * NOTE(review): OBJECT(as) is passed as owner below, but RivosIOMMUSpace is
> + * a plain g_malloc0() struct rather than a QOM object -- verify this cast
> + * is safe for memory_region_init_iommu().
> + */
> +static AddressSpace *rivos_iommu_dma_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RivosIOMMUState *s = opaque;
> +    RivosIOMMUSpace *as;
> +    char name[64];
> +    uint32_t devid = PCI_BUILD_BDF(pci_bus_num(bus), devfn);
> +    uint32_t iommu_devid = pci_get_bdf(&s->pci);
> +
> +    if (iommu_devid == devid) {
> +        /* No translation for IOMMU device itself. */
> +        return &address_space_memory;
> +    }
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid)
> +            break;
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as == NULL) {
> +        as = g_malloc0(sizeof(RivosIOMMUSpace));
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "rivos-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        memory_region_init_iommu(&as->mr, sizeof(as->mr),
> +            TYPE_RIVOS_IOMMU_MEMORY_REGION,
> +            OBJECT(as), name, UINT64_MAX);
> +
> +        address_space_init(&as->as, MEMORY_REGION(&as->mr),
> +                           TYPE_RIVOS_IOMMU_PCI);
> +
> +        qemu_mutex_lock(&s->core_lock);
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +        qemu_mutex_unlock(&s->core_lock);
> +
> +        trace_rivos_iommu_new(PCI_BUS_NUM(iommu_devid), PCI_SLOT(iommu_devid),
> +            PCI_FUNC(iommu_devid), PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
> +            PCI_FUNC(as->devid));
> +    }
> +
> +    return &as->as;
> +}
> +
> +/*
> + * Establish power-on register state: every register defaults to fully
> + * read-only, then the writable fields, the write-1-to-clear bits and the
> + * capability word (derived from the device properties) are opened up.
> + */
> +static void rivos_iommu_reg_reset(RivosIOMMUState *s)
> +{
> +    const uint64_t cap = (s->version & RIO_CAP_REVISION_MASK) |
> +                   (s->enable_stage_one * RIO_CAP_STAGE_ONE) |
> +                   (s->enable_stage_two * RIO_CAP_STAGE_TWO) |
> +                   (s->enable_msi * RIO_CAP_MSI);
> +
> +    /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, sizeof(s->regs_ro));
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RIO_REG_CAP], cap);
> +    stq_le_p(&s->regs_ro[RIO_REG_DDTP],
> +        ~(RIO_DDTP_MASK_PPN | RIO_DDTP_MASK_MODE));
> +    stq_le_p(&s->regs_ro[RIO_REG_CQ_BASE],
> +        ~(RIO_CQ_MASK_LOG2SZ | RIO_CQ_MASK_PPN));
> +    stq_le_p(&s->regs_ro[RIO_REG_FQ_BASE],
> +        ~(RIO_FQ_MASK_LOG2SZ | RIO_FQ_MASK_PPN));
> +    stq_le_p(&s->regs_ro[RIO_REG_PQ_BASE],
> +        ~(RIO_PQ_MASK_LOG2SZ | RIO_PQ_MASK_PPN));
> +    stl_le_p(&s->regs_wc[RIO_REG_CQ_CONTROL],
> +        RIO_CQ_FAULT | RIO_CQ_TIMEOUT | RIO_CQ_ERROR);
> +    stl_le_p(&s->regs_ro[RIO_REG_CQ_CONTROL], RIO_CQ_ACTIVE | RIO_CQ_BUSY);
> +    stl_le_p(&s->regs_wc[RIO_REG_FQ_CONTROL], RIO_FQ_FAULT | RIO_FQ_FULL);
> +    stl_le_p(&s->regs_ro[RIO_REG_FQ_CONTROL], RIO_FQ_ACTIVE | RIO_FQ_BUSY);
> +    stl_le_p(&s->regs_wc[RIO_REG_PQ_CONTROL], RIO_PQ_FAULT | RIO_PQ_FULL);
> +    stl_le_p(&s->regs_ro[RIO_REG_PQ_CONTROL], RIO_PQ_ACTIVE | RIO_PQ_BUSY);
> +    stl_le_p(&s->regs_wc[RIO_REG_IPSR], ~0);
> +}
> +
> +/*
> + * PCIDevice realize: reset register state, start the core worker thread,
> + * expose the register file (plus MSI-X tables) through BAR0, and install
> + * the per-device IOMMU address-space hook on the root bus.  MSI-X setup
> + * failure is fatal except for -ENOTSUP, which degrades to polling mode.
> + */
> +static void rivos_iommu_realize(PCIDevice *dev, Error **errp)
> +{
> +    DeviceState *d = DEVICE(dev);
> +    RivosIOMMUState *s = RIVOS_IOMMU_PCI(d);
> +    const uint64_t bar_size =
> +        pow2ceil(QEMU_ALIGN_UP(sizeof(s->regs_rw), TARGET_PAGE_SIZE));
> +    Error *err = NULL;
> +
> +    QLIST_INIT(&s->spaces);
> +    qemu_cond_init(&s->core_cond);
> +    qemu_mutex_init(&s->core_lock);
> +    rivos_iommu_reg_reset(s);
> +
> +    qemu_thread_create(&s->core_proc, "rivos-iommu-core",
> +        rivos_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> +
> +    memory_region_init(&s->bar0, OBJECT(s),
> +            "rivos-iommu-bar0", bar_size);
> +    memory_region_init_io(&s->mmio, OBJECT(s), &rivos_iommu_mmio_ops, s,
> +            "rivos-iommu", sizeof(s->regs_rw));
> +    memory_region_add_subregion(&s->bar0, 0, &s->mmio);
> +
> +    pcie_endpoint_cap_init(dev, 0x80);
> +
> +    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
> +            PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0);
> +
> +    int ret = msix_init(dev, RIO_INT_COUNT,
> +                    &s->bar0, 0, RIO_REG_MSI_ADDR_BASE,
> +                    &s->bar0, 0, RIO_REG_MSI_PBA_BASE, 0, &err);
> +
> +    if (ret == -ENOTSUP) {
> +        /* MSI-x is not supported by the platform.
> +         * Driver should use timer/polling based notification handlers.
> +         */
> +        warn_report_err(err);
> +    } else if (ret < 0) {
> +        error_propagate(errp, err);
> +        return;
> +    }
> +
> +    /* TODO: find root port bus ranges and use for FDT/ACPI generation. */
> +    PCIBus *bus = pci_device_root_bus(dev);
> +    if (!bus) {
> +        error_setg(errp, "can't find PCIe root port for %02x:%02x.%x",
> +            pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn),
> +            PCI_FUNC(dev->devfn));
> +        return;
> +    }
> +
> +    pci_setup_iommu(bus, rivos_iommu_dma_as, s);
> +}
> +
> +/*
> + * PCIDevice exit: stop and join the core worker thread and release its
> + * synchronization objects.
> + * NOTE(review): entries on s->spaces (and their AddressSpace/MemoryRegion)
> + * are not torn down here -- acceptable while the device is not
> + * hot-unpluggable (dc->hotpluggable = false), but worth confirming.
> + */
> +static void rivos_iommu_exit(PCIDevice *dev)
> +{
> +    DeviceState *d = DEVICE(dev);
> +    RivosIOMMUState *s = RIVOS_IOMMU_PCI(d);
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    s->core_stop = true;
> +    qemu_cond_signal(&s->core_cond);
> +    qemu_mutex_unlock(&s->core_lock);
> +    qemu_thread_join(&s->core_proc);
> +    qemu_cond_destroy(&s->core_cond);
> +    qemu_mutex_destroy(&s->core_lock);
> +}
> +
> +/* Migration is not supported yet. */
> +static const VMStateDescription rivos_iommu_vmstate = {
> +    .name = "rivos-iommu",
> +    .unmigratable = 1
> +};
> +
> +/* User-tunable capability knobs, reflected into RIO_REG_CAP at reset. */
> +static Property rivos_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RivosIOMMUState, version, RIO_CAP_REVISION),
> +    DEFINE_PROP_BOOL("msi", RivosIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("stage-one", RivosIOMMUState, enable_stage_one, TRUE),
> +    DEFINE_PROP_BOOL("stage-two", RivosIOMMUState, enable_stage_two, TRUE),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +/* QOM class setup for the PCI device type. */
> +static void rivos_iommu_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> +
> +    device_class_set_props(dc, rivos_iommu_properties);
> +    k->realize = rivos_iommu_realize;
> +    k->exit = rivos_iommu_exit;
> +    k->vendor_id = PCI_VENDOR_ID_RIVOS;
> +    k->device_id = PCI_DEVICE_ID_RIVOS_IOMMU;
> +    k->revision = 0;
> +    k->class_id = PCI_CLASS_SYSTEM_IOMMU;
> +    dc->desc = "RIVOS-IOMMU (RIO) DMA Remapping device";
> +    dc->vmsd = &rivos_iommu_vmstate;
> +    dc->hotpluggable = false;
> +    dc->user_creatable = true;
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo rivos_iommu_pci = {
> +    .name = TYPE_RIVOS_IOMMU_PCI,
> +    .parent = TYPE_PCI_DEVICE,
> +    .instance_size = sizeof(RivosIOMMUState),
> +    .class_init = rivos_iommu_class_init,
> +    .interfaces = (InterfaceInfo[]) {
> +        { INTERFACE_PCIE_DEVICE },
> +        { },
> +    },
> +};
> +
> +/* QOM class setup for the IOMMU memory region type (translate callbacks). */
> +static void rivos_iommu_memory_region_class_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = rivos_iommu_translate;
> +    imrc->notify_flag_changed = rivos_iommu_notify_flag_changed;
> +}
> +
> +static const TypeInfo rivos_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RIVOS_IOMMU_MEMORY_REGION,
> +    .class_init = rivos_iommu_memory_region_class_init,
> +};
> +
> +/* Register both QOM types with the type system at startup. */
> +static void rivos_iommu_register_types(void)
> +{
> +    type_register_static(&rivos_iommu_pci);
> +    type_register_static(&rivos_iommu_memory_region_info);
> +}
> +
> +type_init(rivos_iommu_register_types);
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..c3618764ed
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,7 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# rivos-iommu.c
> +rivos_iommu_new(int bus, int slot, int func, int dbus, int dslot, int dfunc) "NEW %04x:%02x.%d attached %04x:%02x.%d"
> +rivos_iommu_flt(int bus, int slot, int func, int cause, uint64_t iova) "FLT %04x:%02x.%d cause: %d iova: 0x%"PRIx64
> +rivos_iommu_dma(int bus, int slot, int func, const char *dir, uint64_t iova, uint64_t phys) "TLB q%04x:%02x.%d %s 0x%"PRIx64" -> 0x%"PRIx64
> +rivos_iommu_cmd(int bus, int slot, int func, uint64_t l, uint64_t u) "CMD %04x:%02x.%d 0x%"PRIx64" 0x%"PRIx64
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..b88504b750
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1,2 @@
> +#include "trace/trace-hw_riscv.h"
> +
> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
> index 11abe22d46..73dad2aced 100644
> --- a/include/hw/pci/pci_ids.h
> +++ b/include/hw/pci/pci_ids.h
> @@ -88,6 +88,7 @@
>  #define PCI_CLASS_SYSTEM_RTC             0x0803
>  #define PCI_CLASS_SYSTEM_PCI_HOTPLUG     0x0804
>  #define PCI_CLASS_SYSTEM_SDHCI           0x0805
> +#define PCI_CLASS_SYSTEM_IOMMU           0x0806
>  #define PCI_CLASS_SYSTEM_OTHER           0x0880
>
>  #define PCI_BASE_CLASS_INPUT             0x09
> diff --git a/include/hw/riscv/rivos_iommu.h b/include/hw/riscv/rivos_iommu.h
> new file mode 100644
> index 0000000000..097086d83e
> --- /dev/null
> +++ b/include/hw/riscv/rivos_iommu.h
> @@ -0,0 +1,80 @@
> +/*
> + * QEMU emulation of an RISC-V RIVOS-IOMMU
> + *
> + * Copyright (C) 2022 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RIVOS_IOMMU_H
> +#define HW_RIVOS_IOMMU_H
> +
> +#include "hw/sysbus.h"
> +#include "hw/pci/pci.h"
> +#include "hw/pci/pci_bus.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RIVOS_IOMMU_PCI "rivos-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RivosIOMMUState, RIVOS_IOMMU_PCI)
> +
> +#define TYPE_RIVOS_IOMMU_MEMORY_REGION "rivos-iommu-memory-region"
> +
> +typedef struct RivosIOMMUState RivosIOMMUState;
> +typedef struct RivosIOMMUSpace RivosIOMMUSpace;
> +
> +#define RIVOS_IOMMU_REGS_SIZE 0x300  /* control registers space        */
> +
> +/*
> + * IO virtual address space remapping device state.
> + */
> +struct RivosIOMMUState {
> +    PCIDevice pci;                   /* Parent PCI device              */
> +
> +    MemoryRegion bar0;
> +    MemoryRegion mmio;
> +    uint8_t regs_rw[RIVOS_IOMMU_REGS_SIZE]; /* MMIO register state     */
> +    uint8_t regs_wc[RIVOS_IOMMU_REGS_SIZE]; /* MMIO write-1-to-clear   */
> +    uint8_t regs_ro[RIVOS_IOMMU_REGS_SIZE]; /* MMIO read/only mask     */
> +
> +    /* IOMMU Properties */
> +    uint32_t version;
> +    bool enable_msi;                 /* Enable MSI translation         */
> +    bool enable_stage_one;           /* Enable IOVA->GPA translation   */
> +    bool enable_stage_two;           /* Enable GPA->SPA translation    */
> +
> +    QemuCond core_cond;
> +    QemuMutex core_lock;
> +    QemuThread core_proc;
> +    bool core_stop;
> +
> +    hwaddr ddt_base;
> +    uint32_t ddt_mode;
> +    int ddt_depth;
> +
> +    hwaddr cq_base;
> +    uint32_t cq_mask;
> +    uint32_t cq_head;
> +    bool cq_tail_db;
> +
> +    hwaddr fq_base;
> +    uint32_t fq_mask;
> +    uint32_t fq_tail;
> +
> +    hwaddr pq_base;
> +    uint32_t pq_mask;
> +    uint32_t pq_tail;
> +
> +    QLIST_HEAD(, RivosIOMMUSpace) spaces;
> +};
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index bae62efc9c..62d2a56326 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2688,6 +2688,7 @@ if have_system
>      'hw/ppc',
>      'hw/rdma',
>      'hw/rdma/vmw',
> +    'hw/riscv',
>      'hw/rtc',
>      'hw/s390x',
>      'hw/scsi',
> --
> 2.25.1
>
>
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 91bb9d21c4..c6cbd7b42c 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -4,6 +4,9 @@  config RISCV_NUMA
 config IBEX
     bool
 
+config RIVOS_IOMMU
+    bool
+
 config MICROCHIP_PFSOC
     bool
     select CADENCE_SDHCI
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index ab6cae57ea..a2aeb5fab4 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -9,5 +9,6 @@  riscv_ss.add(when: 'CONFIG_SIFIVE_E', if_true: files('sifive_e.c'))
 riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
+riscv_ss.add(when: 'CONFIG_RIVOS_IOMMU', if_true: files('rivos_iommu.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/rivos_iommu.c b/hw/riscv/rivos_iommu.c
new file mode 100644
index 0000000000..f043a6864a
--- /dev/null
+++ b/hw/riscv/rivos_iommu.c
@@ -0,0 +1,1350 @@ 
+/*
+ * QEMU emulation of a RISC-V RIVOS-IOMMU
+ *
+ * Copyright (C) 2022 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "hw/riscv/rivos_iommu.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+#include "trace.h"
+
+
+/* Based on Rivos RISC-V IOMMU Specification, Mar 10, 2022 */
+
+/* Rivos I/O programming interface registers */
+#define RIO_REG_CAP             0x0000  /* Supported capabilities  */
+#define RIO_REG_DDTP            0x0010  /* Device Directory Table Pointer */
+#define RIO_REG_CQ_BASE         0x0018  /* Command queue base/head/tail */
+#define RIO_REG_CQ_HEAD         0x0020
+#define RIO_REG_CQ_TAIL         0x0024
+#define RIO_REG_FQ_BASE         0x0028  /* Fault queue base/head/tail */
+#define RIO_REG_FQ_HEAD         0x0030
+#define RIO_REG_FQ_TAIL         0x0034
+#define RIO_REG_PQ_BASE         0x0038  /* Page request queue base/head/tail */
+#define RIO_REG_PQ_HEAD         0x0040
+#define RIO_REG_PQ_TAIL         0x0044
+#define RIO_REG_CQ_CONTROL      0x0048  /* Command queue control */
+#define RIO_REG_FQ_CONTROL      0x004C  /* Fault queue control */
+#define RIO_REG_PQ_CONTROL      0x0050  /* Page request queue control */
+#define RIO_REG_IPSR            0x0054  /* Interrupt pending status  */
+#define RIO_REG_IOCNTOVF        0x0058
+#define RIO_REG_IOCNTINH        0x005C
+#define RIO_REG_IOHPMCYCLES     0x0060
+#define RIO_REG_IOHPMCTR_BASE   0x0068
+#define RIO_REG_IOHPMEVT_BASE   0x0160
+#define RIO_REG_IOCNTSEC        0x0258
+#define RIO_REG_IVEC            0x02F8  /* Interrupt cause to vector mapping */
+#define RIO_REG_MSI_ADDR_BASE   0x0300  /* MSI address for vector #0 */
+#define RIO_REG_MSI_DATA_BASE   0x0308  /* MSI data for vector #0 */
+#define RIO_REG_MSI_CTRL_BASE   0x030C  /* MSI control for vector #0 */
+#define RIO_REG_MSI_PBA_BASE    0x0400  /* MSI Pending Bit Array */
+
+/* Capabilities supported by the IOMMU, RIO_REG_CAP */
+#define RIO_CAP_REVISION_MASK   0x00FF
+#define RIO_CAP_STAGE_ONE      (1ULL << 8)
+#define RIO_CAP_STAGE_TWO      (1ULL << 9)
+#define RIO_CAP_MSI            (1ULL << 10)
+#define RIO_CAP_MRIF           (1ULL << 11)
+#define RIO_CAP_ATS            (1ULL << 12)
+#define RIO_CAP_AMO            (1ULL << 13)
+
+/* Device directory table pointer */
+#define RIO_DDTP_BUSY          (1ULL << 59)
+
+#define RIO_DDTP_MASK_PPN       0x00000FFFFFFFFFFFULL
+#define RIO_DDTP_MASK_MODE      0xF000000000000000ULL
+#define RIO_DDTE_MASK_PPN       0x00FFFFFFFFFFF000ULL
+
+/* Device directory mode values, within RIO_DDTP_MASK_MODE */
+#define RIO_DDTP_MODE_OFF       0
+#define RIO_DDTP_MODE_BARE      1
+#define RIO_DDTP_MODE_3LVL      2
+#define RIO_DDTP_MODE_2LVL      3
+#define RIO_DDTP_MODE_1LVL      4
+#define RIO_DDTP_MODE_MAX       RIO_DDTP_MODE_1LVL
+
+/* Command queue base register */
+#define RIO_CQ_MASK_LOG2SZ      0x000000000000001FULL
+#define RIO_CQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
+
+/* Command queue control and status register */
+#define RIO_CQ_ENABLE          (1 << 0)
+#define RIO_CQ_IRQ_ENABLE      (1 << 1)
+#define RIO_CQ_FAULT           (1 << 8)
+#define RIO_CQ_TIMEOUT         (1 << 9)
+#define RIO_CQ_ERROR           (1 << 10)
+#define RIO_CQ_ACTIVE          (1 << 16)
+#define RIO_CQ_BUSY            (1 << 17)
+
+/* Fault queue base register */
+#define RIO_FQ_MASK_LOG2SZ      0x000000000000001FULL
+#define RIO_FQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
+
+/* Fault queue control and status register */
+#define RIO_FQ_ENABLE          (1 << 0)
+#define RIO_FQ_IRQ_ENABLE      (1 << 1)
+#define RIO_FQ_FAULT           (1 << 8)
+#define RIO_FQ_FULL            (1 << 9)
+#define RIO_FQ_ACTIVE          (1 << 16)
+#define RIO_FQ_BUSY            (1 << 17)
+
+/* Page request queue base register */
+#define RIO_PQ_MASK_LOG2SZ      0x000000000000001FULL
+#define RIO_PQ_MASK_PPN         0x0001FFFFFFFFFFE0ULL
+
+/* Page request queue control and status register */
+#define RIO_PQ_ENABLE          (1 << 0)
+#define RIO_PQ_IRQ_ENABLE      (1 << 1)
+#define RIO_PQ_FAULT           (1 << 8)
+#define RIO_PQ_FULL            (1 << 9)
+#define RIO_PQ_ACTIVE          (1 << 16)
+#define RIO_PQ_BUSY            (1 << 17)
+
+/* Interrupt Sources, used for IPSR and IVEC indexing. */
+#define RIO_INT_CQ              0
+#define RIO_INT_FQ              1
+#define RIO_INT_PM              2
+#define RIO_INT_PQ              3
+#define RIO_INT_COUNT           4
+
+/* Device Context: per-device translation configuration fetched from the
+ * device directory table in guest memory (see rivos_iommu_fetch_dc). */
+typedef struct RivosIOMMUDeviceContext {
+    uint64_t  tc;          /* Translation Control */
+    uint64_t  gatp;        /* IO Hypervisor Guest Address Translation */
+    uint64_t  satp;        /* IO SATP or IO vSATP or PDTP */
+    uint64_t  pscid;       /* Process soft-context ID */
+    uint64_t  msiptp;      /* MSI Page Table Pointer (extended context) */
+    uint64_t  msi_addr_mask;     /* MSI window: page-frame bits to ignore */
+    uint64_t  msi_addr_pattern;  /* MSI window: page-frame match pattern */
+    uint64_t  _reserved;
+} RivosIOMMUDeviceContext;
+
+#define RIO_DCTC_VALID            (1ULL << 0)
+#define RIO_DCTC_EN_ATS           (1ULL << 1)
+#define RIO_DCTC_EN_PRI           (1ULL << 2)
+#define RIO_DCTC_T2GPA            (1ULL << 3)
+#define RIO_DCTC_DIS_TRANS_FAULT  (1ULL << 4)
+#define RIO_DCTC_PDTV             (1ULL << 5)
+
+/* Shared MODE:ASID:PPN masks for GATP, SATP */
+#define RIO_ATP_MASK_PPN           SATP64_PPN
+#define RIO_ATP_MASK_GSCID         SATP64_ASID
+#define RIO_ATP_MASK_MODE          SATP64_MODE
+
+#define RIO_ATP_MODE_SV32          VM_1_10_SV32
+#define RIO_ATP_MODE_SV39          VM_1_10_SV39
+#define RIO_ATP_MODE_SV48          VM_1_10_SV48
+#define RIO_ATP_MODE_SV57          VM_1_10_SV57
+#define RIO_ATP_MODE_BARE          VM_1_10_MBARE
+
+/* satp.mode when tc.RIO_DCTC_PDTV is set */
+#define RIO_PDTP_MODE_BARE         0
+#define RIO_PDTP_MODE_PD20         1
+#define RIO_PDTP_MODE_PD17         2
+#define RIO_PDTP_MODE_PD8          3
+
+#define RIO_DCMSI_VALID            1
+#define RIO_DCMSI_MASK_PPN         0x0FFFFFFFFFFFFFFEULL
+#define RIO_DCMSI_MASK_MODE        0xF000000000000000ULL
+
+#define RIO_DCMSI_MODE_BARE        0
+#define RIO_DCMSI_MODE_FLAT        1
+
+/* I/O Management Unit Command format: one 16-byte command queue entry. */
+typedef struct RivosIOMMUCommand {
+    uint64_t request;      /* opcode/function (RIO_CMD_*) plus flag bits */
+    uint64_t address;      /* command-specific address payload */
+} RivosIOMMUCommand;
+
+/* RivosIOMMUCommand.request opcode and function mask */
+#define RIO_CMD_MASK_FUN_OP        0x00000000000003FFULL
+
+/* opcode == IOTINVAL.* */
+#define RIO_CMD_IOTINVAL_VMA       0x001
+#define RIO_CMD_IOTINVAL_GVMA      0x081
+#define RIO_CMD_IOTINVAL_MSI       0x101
+
+#define RIO_IOTINVAL_PSCID_VALID   0x0000000000000400ULL
+#define RIO_IOTINVAL_ADDR_VALID    0x0000000000000800ULL
+#define RIO_IOTINVAL_GSCID_VALID   0x0000000000001000ULL
+#define RIO_IOTINVAL_ADDR_NAPOT    0x0000000000002000ULL
+#define RIO_IOTINVAL_MASK_PSCID    0x0000000FFFFF0000ULL
+#define RIO_IOTINVAL_MASK_GSCID    0x00FFFF0000000000ULL
+
+/* opcode == IODIR.* */
+#define RIO_CMD_IODIR_INV_DDT      0x002
+#define RIO_CMD_IODIR_PRE_DDT      0x082
+#define RIO_CMD_IODIR_INV_PDT      0x102
+#define RIO_CMD_IODIR_PRE_PDT      0x182
+
+#define RIO_IODIR_DID_VALID        0x0000000000000400ULL
+#define RIO_IODIR_MASK_PID         0x0000000FFFFF0000ULL
+#define RIO_IODIR_MASK_DID         0xFFFFFF0000000000ULL
+
+/* opcode == IOFENCE.* */
+#define RIO_CMD_IOFENCE_C          0x003
+
+#define RIO_IOFENCE_PR             0x0000000000000400ULL
+#define RIO_IOFENCE_PW             0x0000000000000800ULL
+#define RIO_IOFENCE_AV             0x0000000000001000ULL
+#define RIO_IOFENCE_MASK_DATA      0xFFFFFFFF00000000ULL
+
+/* opcode == ATS */
+#define RIO_CMD_ATS_INVAL          0x004
+#define RIO_CMD_ATS_PRGR           0x084
+
+/* Fault Queue element: one 32-byte record written to the in-memory
+ * fault queue by rivos_iommu_fault_iova(). */
+typedef struct RivosIOMMUEvent {
+    uint64_t reason;       /* devid/PID/TTYP/CAUSE bitfields (RIO_EVENT_*) */
+    uint64_t _rsrvd;
+    uint64_t iova;         /* faulting I/O virtual address, if any */
+    uint64_t phys;         /* faulting guest-physical address, if any */
+} RivosIOMMUEvent;
+
+/* Event reason */
+#define RIO_EVENT_MASK_DID         0x0000000000FFFFFFULL
+#define RIO_EVENT_MASK_PID         0x00000FFFFF000000ULL
+#define RIO_EVENT_PV               0x0000100000000000ULL
+#define RIO_EVENT_PRIV             0x0000200000000000ULL
+#define RIO_EVENT_MASK_TTYP        0x000FC00000000000ULL
+#define RIO_EVENT_MASK_CAUSE       0xFFF0000000000000ULL
+
+#define RIO_TTYP_NONE              0 /* Fault not caused by an inbound trx */
+#define RIO_TTYP_URX               1 /* Untranslated read for execute trx */
+#define RIO_TTYP_URD               2 /* Untranslated read transaction */
+#define RIO_TTYP_UWR               3 /* Untranslated write/AMO transaction */
+#define RIO_TTYP_TRX               4 /* Translated read for execute trx */
+#define RIO_TTYP_TRD               5 /* Translated read transaction */
+#define RIO_TTYP_TWR               6 /* Translated write/AMO transaction */
+#define RIO_TTYP_ATS               7 /* PCIe ATS Translation Request */
+#define RIO_TTYP_MRQ               8 /* Message Request */
+
+#define RIO_ERRC_I_ALIGN           0 /* Instruction address misaligned */
+#define RIO_ERRC_I_FAULT           1 /* Instruction access fault */
+#define RIO_ERRC_RD_ALIGN          4 /* Read address misaligned */
+#define RIO_ERRC_RD_FAULT          5 /* Read access fault */
+#define RIO_ERRC_WR_ALIGN          6 /* Write/AMO address misaligned */
+#define RIO_ERRC_WR_FAULT          7 /* Write/AMO access fault */
+#define RIO_ERRC_PGFAULT_I        12 /* Instruction page fault */
+#define RIO_ERRC_PGFAULT_RD       13 /* Read page fault */
+#define RIO_ERRC_PGFAULT_WR       15 /* Write/AMO page fault */
+#define RIO_ERRC_GPGFAULT_I       20 /* Instruction guest page fault */
+#define RIO_ERRC_GPGFAULT_RD      21 /* Read guest-page fault */
+#define RIO_ERRC_GPGFAULT_WR      23 /* Write/AMO guest-page fault */
+#define RIO_ERRC_DMA_DISABLED    256 /* Inbound transactions disallowed */
+#define RIO_ERRC_DDT_FAULT       257 /* DDT entry load access fault */
+#define RIO_ERRC_DDT_INVALID     258 /* DDT entry not valid */
+#define RIO_ERRC_DDT_UNSUPPORTED 259 /* DDT entry misconfigured */
+#define RIO_ERRC_REQ_INVALID     260 /* Transaction type disallowed */
+#define RIO_ERRC_PDT_FAULT       261 /* PDT entry load access fault. */
+#define RIO_ERRC_PDT_INVALID     262 /* PDT entry not valid */
+#define RIO_ERRC_PDT_UNSUPPORTED 263 /* PDT entry misconfigured */
+#define RIO_ERRC_MSI_FAULT       264 /* MSI PTE load access fault */
+#define RIO_ERRC_MSI_INVALID     265 /* MSI PTE not valid */
+#define RIO_ERRC_MRIF_FAULT      266 /* MRIF access fault */
+
+
+/*
+ * Rivos Inc. I/O Management Unit PCIe Device Emulation
+ */
+
+#ifndef PCI_VENDOR_ID_RIVOS
+#define PCI_VENDOR_ID_RIVOS           0x1efd
+#endif
+
+#ifndef PCI_DEVICE_ID_RIVOS_IOMMU
+#define PCI_DEVICE_ID_RIVOS_IOMMU     0x8001
+#endif
+
+/* Programming interface revision */
+#define RIO_CAP_REVISION              0x0002
+
+#define RIO_REG_MMIO_SIZE             0x0300
+
+#define RIO_ERR_NONE                  0
+#define RIO_ERR_ANY                   1
+
+#define RIO_ERR(cause)                \
+    (RIO_ERR_ANY | (((cause) & 0x0fff) << 16))
+
+#define RIO_ERR_IO(cause, ttyp)       \
+    (RIO_ERR_ANY | (((cause) & 0x0fff) << 16) | (((ttyp) & 0x3f) << 8))
+
+#define RIO_ERR_CAUSE(err)            (((err) >> 16) & 0xfff)
+#define RIO_ERR_TTYP(err)             (((err) >> 8) & 0x3f)
+
+
+/* IO virtual address space wrapper for attached PCI devices. */
+struct RivosIOMMUSpace {
+    IOMMUMemoryRegion             mr;        /* translation memory region */
+    AddressSpace                  as;        /* address space backed by mr */
+    RivosIOMMUState              *iommu;     /* owning IOMMU device */
+    RivosIOMMUDeviceContext       dc;        /* cached device context */
+    bool                          dc_valid;  /* dc cache is up to date */
+    uint32_t                      devid;     /* requester id (PCI BDF) */
+    QLIST_ENTRY(RivosIOMMUSpace)  list;      /* entry in iommu->spaces */
+};
+
+
+/* Atomically update a 32-bit MMIO register: new = set | (old & ~clr).
+ * Returns the register value observed before the update. */
+static uint32_t rivos_iommu_reg_mod(RivosIOMMUState *s,
+    unsigned idx, uint32_t set, uint32_t clr)
+{
+    uint32_t old;
+
+    qemu_mutex_lock(&s->core_lock);
+    old = ldl_le_p(&s->regs_rw[idx]);
+    stl_le_p(&s->regs_rw[idx], (old & ~clr) | set);
+    qemu_mutex_unlock(&s->core_lock);
+
+    return old;
+}
+
+/* Look up the MSI-X vector assigned to an interrupt source: the IVEC
+ * register holds one 4-bit vector number per source. */
+static unsigned rivos_iommu_irq_vector(RivosIOMMUState *s, int source)
+{
+    uint32_t ivec = ldl_le_p(&s->regs_rw[RIO_REG_IVEC]);
+
+    ivec >>= source * 4;
+    return ivec & 0x0F;
+}
+
+/* Mark the MSI-X vector mapped to @source as in use. */
+static void rivos_iommu_irq_use(RivosIOMMUState *s, int source)
+{
+    unsigned vector = rivos_iommu_irq_vector(s, source);
+
+    msix_vector_use(&s->pci, vector);
+}
+
+/* Release the MSI-X vector mapped to @source. */
+static void rivos_iommu_irq_unuse(RivosIOMMUState *s, int source)
+{
+    unsigned vector = rivos_iommu_irq_vector(s, source);
+
+    msix_vector_unuse(&s->pci, vector);
+}
+
+/* Raise an interrupt for @source: set its IPSR pending bit and, on a
+ * 0 -> 1 transition, deliver the configured MSI-X vector. */
+static void rivos_iommu_irq_assert(RivosIOMMUState *s, int source)
+{
+    uint32_t ipsr = rivos_iommu_reg_mod(s, RIO_REG_IPSR, (1 << source), 0);
+
+    /* Notify only when the bit was previously clear (edge behavior):
+     * software must clear IPSR before a new MSI is generated. */
+    if (!(ipsr & (1 << source)) && msix_enabled(&(s->pci))) {
+        const unsigned vector = rivos_iommu_irq_vector(s, source);
+        msix_notify(&(s->pci), vector);
+    }
+}
+
+/* Append a fault record for device @as with encoded cause @err to the
+ * in-memory fault queue, then raise the fault-queue interrupt if enabled.
+ * @iova/@gpa are the faulting addresses reported to software (0 when not
+ * applicable).  Records are silently dropped while the queue is inactive
+ * or already latched in an error state. */
+static void rivos_iommu_fault_iova(RivosIOMMUSpace *as, int err, hwaddr iova,
+    hwaddr gpa)
+{
+    RivosIOMMUState *s = as->iommu;
+    RivosIOMMUEvent ev;
+    MemTxResult res;
+    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
+    uint32_t head = ldl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD]) & s->fq_mask;
+    uint32_t next = (s->fq_tail + 1) & s->fq_mask;
+    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]);
+    uint32_t ctrl_err = 0;
+
+    ev.reason = as->devid;
+    ev.reason = set_field(ev.reason, RIO_EVENT_MASK_CAUSE, RIO_ERR_CAUSE(err));
+    ev.reason = set_field(ev.reason, RIO_EVENT_MASK_TTYP, RIO_ERR_TTYP(err));
+    /* Zero the reserved word: the whole struct is DMA-written to guest
+     * memory, so an uninitialized field would leak host stack bytes. */
+    ev._rsrvd = 0;
+    ev.iova = iova;
+    ev.phys = gpa;
+
+    trace_rivos_iommu_flt(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
+                          PCI_FUNC(as->devid), RIO_ERR_CAUSE(err), iova);
+
+    if (!(ctrl & RIO_FQ_ACTIVE) || !!(ctrl & (RIO_FQ_FULL | RIO_FQ_FAULT))) {
+        return;
+    }
+
+    if (head == next) {
+        /* Queue full: latch overflow, the record is lost. */
+        ctrl_err = RIO_FQ_FULL;
+    } else {
+        dma_addr_t addr = s->fq_base + s->fq_tail * sizeof(RivosIOMMUEvent);
+        res = dma_memory_write(&address_space_memory, addr, &ev, sizeof(ev),
+                               ma);
+        if (res != MEMTX_OK) {
+            ctrl_err = RIO_FQ_FAULT;
+        } else {
+            s->fq_tail = next;
+        }
+    }
+
+    stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail);
+
+    if (ctrl_err) {
+        /* Latch error bits in the fault queue's own control register
+         * (previously written to RIO_REG_CQ_CONTROL by mistake). */
+        rivos_iommu_reg_mod(s, RIO_REG_FQ_CONTROL, ctrl_err, 0);
+    }
+
+    if (ctrl & RIO_FQ_IRQ_ENABLE) {
+        rivos_iommu_irq_assert(s, RIO_INT_FQ);
+    }
+}
+
+/* Report a fault event that carries no IOVA/GPA address information. */
+static void rivos_iommu_fault(RivosIOMMUSpace *as, int cause)
+{
+    rivos_iommu_fault_iova(as, cause, 0, 0);
+}
+
+
+/* RISC-V IOMMU page table walker.
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementations can be merged into a single helper function in future.
+ * Keeping them separate for now, as error reporting and flow specifics are
+ * sufficiently different for separate implementation.
+ *
+ * Translates @addr through the first-stage (satp) table when @first_stage,
+ * nesting each step through the second-stage (gatp) table; with
+ * @first_stage false only the second stage is walked.  On success stores
+ * the result in @physical and returns RIO_ERR_NONE; on a second-stage
+ * fault *@physical holds the offending GPA.  Returns a RIO_ERR() encoded
+ * fault code on failure.
+ */
+static int rivos_iommu_fetch_pa(RivosIOMMUSpace *as,
+    hwaddr addr, hwaddr *physical, uint64_t gatp, uint64_t satp,
+    bool first_stage, IOMMUAccessFlags access)
+{
+    MemTxResult res;
+    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
+    hwaddr base;
+    int i, levels, ptidxbits, ptshift, ptesize, mode, widened;
+    uint64_t atp = first_stage ? satp : gatp;
+
+    base = (hwaddr) get_field(atp, RIO_ATP_MASK_PPN) << PGSHIFT;
+    mode = get_field(atp, RIO_ATP_MASK_MODE);
+
+    switch (mode) {
+    case RIO_ATP_MODE_SV32:
+        levels = 2;
+        ptidxbits = 10;
+        ptesize = 4;
+        break;
+    case RIO_ATP_MODE_SV39:
+        levels = 3;
+        ptidxbits = 9;
+        ptesize = 8;
+        break;
+    case RIO_ATP_MODE_SV48:
+        levels = 4;
+        ptidxbits = 9;
+        ptesize = 8;
+        break;
+    case RIO_ATP_MODE_SV57:
+        levels = 5;
+        ptidxbits = 9;
+        ptesize = 8;
+        break;
+    case RIO_ATP_MODE_BARE:
+        if (first_stage) {
+            /* Stage one disabled: fall through to the second stage only. */
+            return rivos_iommu_fetch_pa(as, addr, physical,
+                                        gatp, satp, false, access);
+        }
+        *physical = addr;
+        return RIO_ERR_NONE;
+    default:
+        return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED);
+    }
+
+    /* G-stage roots use a 2-bit wider top-level index (widened VA). */
+    widened = first_stage ? 0 : 2;
+    ptshift = (levels - 1) * ptidxbits;
+
+    /* zero extended address range check */
+    int va_bits = PGSHIFT + levels * ptidxbits + widened;
+    uint64_t va_mask = (1ULL << va_bits) - 1;
+    if ((addr & va_mask) != addr) {
+        return RIO_ERR(RIO_ERRC_DMA_DISABLED);
+    }
+
+    for (i = 0; i < levels; i++, ptshift -= ptidxbits) {
+        target_ulong pte;
+        hwaddr pte_addr;
+        target_ulong idx;
+
+        idx = (addr >> (PGSHIFT + ptshift)) & ((1 << (ptidxbits + widened))-1);
+        pte_addr = base + idx * ptesize;
+        widened = 0;
+
+        if (ptesize == 4) {
+            pte = address_space_ldl(&address_space_memory, pte_addr, ma, &res);
+        } else {
+            pte = address_space_ldq(&address_space_memory, pte_addr, ma, &res);
+        }
+
+        if (res != MEMTX_OK) {
+            /* NOTE(review): PDT_* cause codes reused for page-table walk
+             * faults here and below -- confirm against the final spec. */
+            return RIO_ERR(RIO_ERRC_PDT_FAULT);
+        }
+
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            /* Invalid PTE */
+            return RIO_ERR(RIO_ERRC_PDT_INVALID);
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            /* Inner PTE, continue walking */
+            base = ppn << PGSHIFT;
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            /* Reserved leaf PTE flags: PTE_W */
+            return RIO_ERR(RIO_ERRC_PDT_INVALID);
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            /* Reserved leaf PTE flags: PTE_W + PTE_X */
+            return RIO_ERR(RIO_ERRC_PDT_INVALID);
+        } else if (ppn & ((1ULL << ptshift) - 1)) {
+            /* Misaligned PPN */
+            return RIO_ERR(RIO_ERRC_PDT_INVALID);
+        } else if ((access & IOMMU_RO) && !(pte & PTE_R)) {
+            /* Read access check failed.  Guest-page faults are
+             * second-stage faults; plain page faults are first-stage
+             * (the original ternary was inverted). */
+            return first_stage ? RIO_ERR(RIO_ERRC_PGFAULT_RD)
+                               : RIO_ERR(RIO_ERRC_GPGFAULT_RD);
+        } else if ((access & IOMMU_WO) && !(pte & PTE_W)) {
+            /* Write access check failed, same stage mapping as above. */
+            return first_stage ? RIO_ERR(RIO_ERRC_PGFAULT_WR)
+                               : RIO_ERR(RIO_ERRC_GPGFAULT_WR);
+        } else {
+            /* Leaf PTE, update base to translated address.  Use 1ULL:
+             * ptshift can exceed 31, overflowing a 32-bit long shift. */
+            target_ulong vpn = addr >> PGSHIFT;
+            base = ((ppn | (vpn & ((1ULL << ptshift) - 1))) << PGSHIFT) |
+                    (addr & ~TARGET_PAGE_MASK);
+        }
+
+        /* Do the second stage translation if enabled.  Stage-1 PTE and
+         * leaf addresses are GPAs and must themselves be translated. */
+        if (first_stage) {
+            hwaddr spa;
+
+            int ret = rivos_iommu_fetch_pa(as, base, &spa,
+                                           gatp, satp, false, access);
+
+            /* Report back GPA causing second stage translation fault. */
+            if (ret) {
+                *physical = base;
+                return ret;
+            }
+
+            base = spa;
+        }
+
+        if (pte & (PTE_R | PTE_W | PTE_X)) {
+            /* Leaf PTE, return translated address */
+            *physical = base;
+            return RIO_ERR_NONE;
+        }
+    }
+    /* Walked all levels without reaching a leaf. */
+    return RIO_ERR(RIO_ERRC_PDT_INVALID);
+}
+
+/* RISC-V IOMMU Device Directory Tree walker.
+ *
+ * Loads the device context for @devid into @dc, walking up to two levels
+ * of non-leaf directory pages depending on ddt_mode.
+ *
+ * Returns RIO_ERR_NONE on success, or RIO_ERR_ with fault code.
+ */
+static int rivos_iommu_fetch_dc(RivosIOMMUState *iommu, uint32_t devid,
+    RivosIOMMUDeviceContext *dc)
+{
+    MemTxResult res;
+    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
+    hwaddr addr;
+    /* Base format (MSI translation disabled) halves the context size:
+     * sizeof(*dc) >> 1, i.e. only the first four 64-bit words are read. */
+    const bool dcbase = !iommu->enable_msi;
+    const size_t dcsize = sizeof(*dc) >> dcbase;
+    /* Mode values are 1LVL=4, 2LVL=3, 3LVL=2, so this yields 0..2 levels
+     * of non-leaf pages; OFF/BARE underflow and are rejected below. */
+    unsigned int depth = RIO_DDTP_MODE_1LVL - iommu->ddt_mode;
+
+    if (depth > 2) {
+        return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED);
+    }
+
+    /* Check supported device id range. */
+    if (devid >= (1 << (depth * 9 + 6 + (dcbase && depth != 2)))) {
+        return RIO_ERR(RIO_ERRC_DDT_INVALID);
+    }
+
+    /* Walk non-leaf levels: each entry holds the PPN of the next page. */
+    for (addr = iommu->ddt_base; depth-- > 0; ) {
+        const int split = depth * 9 + 6 + dcbase;
+        addr |= ((devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        uint64_t dde = address_space_ldq(&address_space_memory, addr, ma, &res);
+        if (res != MEMTX_OK) {
+            return RIO_ERR(RIO_ERRC_DDT_FAULT);
+        }
+        if (!(dde & RIO_DCTC_VALID)) {
+            return RIO_ERR(RIO_ERRC_DDT_INVALID);
+        }
+        addr = dde & RIO_DDTE_MASK_PPN;
+    }
+
+    /* index into device context entry page */
+    addr |= (devid * dcsize) & ~TARGET_PAGE_MASK;
+
+    /* Zero first so the tail stays clear when only dcsize bytes are read. */
+    memset(dc, 0, sizeof(*dc));
+    res = dma_memory_read(&address_space_memory, addr, dc, dcsize, ma);
+
+    if (res != MEMTX_OK) {
+        return RIO_ERR(RIO_ERRC_DDT_FAULT);
+    }
+
+    if (!(dc->tc & RIO_DCTC_VALID)) {
+        return RIO_ERR(RIO_ERRC_DDT_INVALID);
+    }
+
+    return RIO_ERR_NONE;
+}
+
+/* Translate @tlb->iova for device @as using its device context: route
+ * MSI-window writes, then walk the page tables.  On failure @tlb->perm
+ * stays IOMMU_NONE and a fault record is queued (unless suppressed). */
+static void rivos_iommu_translate_tlb(RivosIOMMUSpace *as,
+    IOMMUAccessFlags flag, IOMMUTLBEntry *tlb)
+{
+    RivosIOMMUState *iommu = as->iommu;
+
+    if (!as->dc_valid) {
+        /* Fetch device context if not cached. */
+        int ret = rivos_iommu_fetch_dc(iommu, as->devid, &as->dc);
+        if (ret != RIO_ERR_NONE) {
+            rivos_iommu_fault(as, ret);
+            return;
+        } else {
+            as->dc_valid = true;
+        }
+    }
+
+    /* MSI window: page frame matches msi_addr_pattern on every bit not
+     * covered by msi_addr_mask. */
+    if (!(((tlb->iova >> PGSHIFT) ^ as->dc.msi_addr_pattern) &
+        ~as->dc.msi_addr_mask)) {
+        if (flag != IOMMU_WO) {
+            /* only writes are allowed. */
+            rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT),
+                                   tlb->iova, 0);
+            return;
+        }
+        if (tlb->iova & ~TARGET_PAGE_MASK) {
+            /* unaligned access. */
+            rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT),
+                                   tlb->iova, 0);
+            return;
+        }
+        if (!(as->dc.msiptp & RIO_DCMSI_VALID)) {
+            /* MSI remapping not enabled */
+            rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_INVALID));
+            return;
+        }
+        int mode = get_field(as->dc.msiptp, RIO_DCMSI_MASK_MODE);
+        switch (mode) {
+            case RIO_DCMSI_MODE_BARE:
+                /* Identity-map the MSI page. */
+                tlb->translated_addr = tlb->iova;
+                tlb->addr_mask = ((1ULL << PGSHIFT) - 1);
+                tlb->perm = flag;
+                break;
+
+            case RIO_DCMSI_MODE_FLAT:
+                /* TODO: not implemented, follow AIA section 9.5 */
+                rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
+                return;
+
+            default:
+                rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
+                return;
+        }
+
+        return;
+    }
+
+    /* Lookup SATP */
+    if (as->dc.tc & RIO_DCTC_PDTV) {
+        /* Process directory tree is not supported yet. */
+        rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_PDT_UNSUPPORTED));
+        return;
+    }
+
+    /* Lookup IOATC */
+    /* TODO: merge in IOATC PoC */
+
+    /* Memory access.  Initialize 'physical': fetch_pa leaves it untouched
+     * on early faults, and it is reported in the fault record below. */
+    hwaddr physical = 0;
+    int err = rivos_iommu_fetch_pa(as, tlb->iova, &physical,
+                                   as->dc.gatp, as->dc.satp,
+                                   iommu->enable_stage_one, flag);
+    if (err == RIO_ERR_NONE) {
+        tlb->translated_addr = physical;
+        tlb->addr_mask = ((1ULL << PGSHIFT) - 1);
+        tlb->perm = flag;
+    } else if (!(as->dc.tc & RIO_DCTC_DIS_TRANS_FAULT)) {
+        const int fault = RIO_ERR_IO(RIO_ERR_CAUSE(err),
+            flag == IOMMU_WO ? RIO_TTYP_UWR : RIO_TTYP_URD);
+        rivos_iommu_fault_iova(as, fault, tlb->iova, physical);
+    }
+
+    return;
+}
+
+/* Access-flag names indexed by (perm & IOMMU_RW); trace output only. */
+static const char *IOMMU_FLAG_STR[] = {
+    "NA",
+    "RO",
+    "WR",
+    "RW",
+};
+
+/* IOMMUMemoryRegion translate callback: map an inbound DMA address for
+ * the device behind @iommu_mr according to the current directory mode.
+ * Called from RCU critical section. */
+static IOMMUTLBEntry rivos_iommu_translate(IOMMUMemoryRegion *iommu_mr,
+    hwaddr addr, IOMMUAccessFlags flag, int iommu_idx)
+{
+    RivosIOMMUSpace *as = container_of(iommu_mr, RivosIOMMUSpace, mr);
+    const uint32_t mode = as->iommu->ddt_mode;
+    IOMMUTLBEntry tlb = {
+        .iova = addr,
+        .target_as = &address_space_memory,
+        .perm = IOMMU_NONE,
+    };
+
+    if (mode == RIO_DDTP_MODE_OFF) {
+        /* Power-on state: all inbound transactions are disallowed. */
+        rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_DMA_DISABLED),
+                               tlb.iova, 0);
+    } else if (mode == RIO_DDTP_MODE_BARE) {
+        /* Global pass-through: identity mapping for every device. */
+        tlb.translated_addr = tlb.iova;
+        tlb.addr_mask = ~0ULL;
+        tlb.perm = flag;
+    } else if (mode == RIO_DDTP_MODE_1LVL || mode == RIO_DDTP_MODE_2LVL ||
+               mode == RIO_DDTP_MODE_3LVL) {
+        /* Translate using device directory information. */
+        rivos_iommu_translate_tlb(as, flag, &tlb);
+    } else {
+        /* Invalid device directory tree mode, should never happen. */
+        rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED));
+    }
+
+    trace_rivos_iommu_dma(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
+        PCI_FUNC(as->devid), IOMMU_FLAG_STR[tlb.perm & IOMMU_RW],
+        tlb.iova, tlb.translated_addr);
+
+    return tlb;
+}
+
+/* Invalidate cached device contexts, either for every attached device
+ * (@all) or only for the device matching @devid. */
+static void rivos_iommu_iodir_inval_ddt(RivosIOMMUState *s, bool all,
+    uint32_t devid)
+{
+    RivosIOMMUSpace *space;
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(space, &s->spaces, list) {
+        if (all || space->devid == devid) {
+            space->dc_valid = false;
+        }
+    }
+    qemu_mutex_unlock(&s->core_lock);
+}
+
+/* Complete an IOFENCE.C command: when the address-valid flag @av is set,
+ * write the 32-bit completion @data to @addr.  A failed write latches
+ * RIO_CQ_FAULT in the command queue control register. */
+static void rivos_iommu_iofence(RivosIOMMUState *s, bool av, uint64_t addr,
+    uint32_t data)
+{
+    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
+
+    if (!av) {
+        return;
+    }
+
+    if (dma_memory_write(&address_space_memory, addr, &data,
+                         sizeof(data), ma) != MEMTX_OK) {
+        rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, RIO_CQ_FAULT, 0);
+    }
+}
+
+/* IOMMUMemoryRegion notifier hook: accept any notifier flags except
+ * device-IOTLB invalidations, which this model does not implement. */
+static int rivos_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
+    IOMMUNotifierFlag old, IOMMUNotifierFlag new, Error **errp)
+{
+    if (!(new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP)) {
+        return 0;
+    }
+
+    error_setg(errp, "rivos-iommu does not support dev-iotlb");
+    return -EINVAL;
+}
+
+/* Drain the in-memory command queue: fetch commands between the internal
+ * head and the software-advanced tail, dispatch each, and publish the new
+ * head.  Processing stops at the first failing command, leaving cq_head
+ * pointing at it and latching the error in CQ control. */
+static void rivos_iommu_process_cq_tail(RivosIOMMUState *s)
+{
+    RivosIOMMUCommand cmd;
+    MemTxResult res;
+    dma_addr_t addr;
+    MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED;
+    uint32_t tail;
+    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]);
+    uint32_t bdf = pci_get_bdf(&s->pci);
+    uint32_t err = 0;
+
+    /* Fetch latest tail position and clear busy marker */
+    s->cq_tail_db = false;
+    tail = s->cq_mask & ldl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL]);
+
+    /* Check for pending error or queue processing disabled */
+    if (!(ctrl & RIO_CQ_ACTIVE) || !!(ctrl & (RIO_CQ_ERROR | RIO_CQ_FAULT)))
+    {
+        return;
+    }
+
+    while (tail != s->cq_head) {
+        addr = s->cq_base  + s->cq_head * sizeof(cmd);
+        res = dma_memory_read(&address_space_memory, addr, &cmd, sizeof(cmd),
+                              ma);
+
+        if (res != MEMTX_OK) {
+            /* Command fetch failed: latch RIO_CQ_FAULT below. */
+            err = RIO_CQ_FAULT;
+            break;
+        }
+
+        trace_rivos_iommu_cmd(PCI_BUS_NUM(bdf), PCI_SLOT(bdf),
+                              PCI_FUNC(bdf), cmd.request, cmd.address);
+
+        int fun_op = get_field(cmd.request, RIO_CMD_MASK_FUN_OP);
+
+        switch(fun_op) {
+            case RIO_CMD_IOFENCE_C:
+                rivos_iommu_iofence(s, !!(cmd.request & RIO_IOFENCE_AV),
+                    cmd.address,
+                    get_field(cmd.request, RIO_IOFENCE_MASK_DATA));
+                break;
+
+            case RIO_CMD_IOTINVAL_GVMA:
+                /* IOTLB not implemented */
+                break;
+
+            case RIO_CMD_IOTINVAL_MSI:
+                /* IOTLB not implemented */
+                break;
+
+            case RIO_CMD_IOTINVAL_VMA:
+                /* IOTLB not implemented */
+                break;
+
+            case RIO_CMD_IODIR_INV_DDT:
+                rivos_iommu_iodir_inval_ddt(s,
+                        !(cmd.request & RIO_IODIR_DID_VALID),
+                        get_field(cmd.request, RIO_IODIR_MASK_DID));
+                break;
+
+            case RIO_CMD_IODIR_INV_PDT:
+                /* PDT invalidate not implemented. */
+                break;
+
+            case RIO_CMD_IODIR_PRE_DDT:
+                /* DDT pre-fetching not implemented. */
+                break;
+
+            case RIO_CMD_IODIR_PRE_PDT:
+                /* PDT pre-fetching not implemented. */
+                break;
+
+            default:
+                /* Unrecognized opcode/function. */
+                err = RIO_CQ_ERROR;
+                break;
+        }
+
+        /* Invalid instruction, keep cq_head at failed instruction index. */
+        if (err) {
+            break;
+        }
+
+        s->cq_head = (s->cq_head + 1) & s->cq_mask;
+    }
+
+    stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head);
+
+    if (err) {
+        rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, err, 0);
+    }
+
+    if (ctrl & RIO_CQ_IRQ_ENABLE) {
+        rivos_iommu_irq_assert(s, RIO_INT_CQ);
+    }
+}
+
+/* Handle a write to DDTP: validate the requested directory-mode
+ * transition; on success latch the new base/mode, otherwise write back
+ * the last valid values.  The BUSY bit is cleared by the final store. */
+static void rivos_iommu_process_ddtp(RivosIOMMUState *s)
+{
+    uint64_t base = ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & ~RIO_DDTP_BUSY;
+    uint32_t mode = get_field(base, RIO_DDTP_MASK_MODE);
+    bool ok;
+
+    /* Allowed DDTP.MODE transitions:
+     * {OFF, BARE} -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
+     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
+     */
+
+    if (s->ddt_mode == mode) {
+        /* No change requested. */
+        ok = true;
+    } else if (s->ddt_mode == RIO_DDTP_MODE_OFF ||
+               s->ddt_mode == RIO_DDTP_MODE_BARE) {
+        ok = mode == RIO_DDTP_MODE_1LVL ||
+             mode == RIO_DDTP_MODE_2LVL ||
+             mode == RIO_DDTP_MODE_3LVL;
+    } else {
+        ok = mode == RIO_DDTP_MODE_OFF ||
+             mode == RIO_DDTP_MODE_BARE;
+    }
+
+    if (ok) {
+        s->ddt_base = get_field(base, RIO_DDTP_MASK_PPN) << PGSHIFT;
+        s->ddt_mode = mode;
+    } else {
+        /* report back last valid mode and device directory table pointer. */
+        base = s->ddt_base >> PGSHIFT;
+        base = set_field(base, RIO_DDTP_MASK_MODE, s->ddt_mode);
+    }
+
+    stq_le_p(&s->regs_rw[RIO_REG_DDTP], base);
+}
+
+/* Handle a write to the command queue control register: move the queue
+ * between enabled/disabled states, latching base/size and resetting
+ * head/tail on enable.  Always clears the BUSY bit. */
+static void rivos_iommu_process_cq_control(RivosIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]);
+    uint32_t ctrl_clr;
+    /* Use the CQ bit definitions; the original tested the numerically
+     * identical RIO_FQ_* macros, which would break silently if the
+     * register layouts ever diverge. */
+    bool enable = !!(ctrl_set & RIO_CQ_ENABLE);
+    bool active = !!(ctrl_set & RIO_CQ_ACTIVE);
+
+    if (enable && !active) {
+        /* Enable: latch base/size, reset indices, unlock tail writes. */
+        base = ldq_le_p(&s->regs_rw[RIO_REG_CQ_BASE]);
+        s->cq_mask = (2ULL << get_field(base, RIO_CQ_MASK_LOG2SZ)) - 1;
+        s->cq_base = get_field(base, RIO_CQ_MASK_PPN) << PGSHIFT;
+        s->cq_head = 0;
+        rivos_iommu_irq_use(s, RIO_INT_CQ);
+        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head);
+        stl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL], s->cq_head);
+        ctrl_set = RIO_CQ_ACTIVE;
+        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_FAULT | RIO_CQ_ERROR | RIO_CQ_TIMEOUT;
+    } else if (!enable && active) {
+        /* Disable: make the tail register read-only again. */
+        rivos_iommu_irq_unuse(s, RIO_INT_CQ);
+        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_ACTIVE;
+    } else {
+        /* No state change requested; just acknowledge the write. */
+        ctrl_set = 0;
+        ctrl_clr = RIO_CQ_BUSY;
+    }
+
+    rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, ctrl_set, ctrl_clr);
+}
+
+/*
+ * Handle a fault queue (FQ) control register update on the core thread.
+ * Transitions the queue between inactive and active states, then clears
+ * the BUSY flag via rivos_iommu_reg_mod().
+ */
+static void rivos_iommu_process_fq_control(RivosIOMMUState *s)
+{
+    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]);
+    uint32_t set_bits;
+    uint32_t clr_bits;
+    bool want_on = !!(ctrl & RIO_FQ_ENABLE);
+    bool is_on = !!(ctrl & RIO_FQ_ACTIVE);
+
+    if (want_on && !is_on) {
+        /* Turn-on request: latch queue base/size, reset producer index. */
+        uint64_t base = ldq_le_p(&s->regs_rw[RIO_REG_FQ_BASE]);
+
+        s->fq_mask = (2ULL << get_field(base, RIO_FQ_MASK_LOG2SZ)) - 1;
+        s->fq_base = get_field(base, RIO_FQ_MASK_PPN) << PGSHIFT;
+        s->fq_tail = 0;
+        rivos_iommu_irq_use(s, RIO_INT_FQ);
+        stl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD], s->fq_tail);
+        stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail);
+        stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~s->fq_mask);
+        set_bits = RIO_FQ_ACTIVE;
+        clr_bits = RIO_FQ_BUSY | RIO_FQ_FAULT | RIO_FQ_FULL;
+    } else if (!want_on && is_on) {
+        /* Turn-off request: release interrupt, lock the head register. */
+        rivos_iommu_irq_unuse(s, RIO_INT_FQ);
+        stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~0);
+        set_bits = 0;
+        clr_bits = RIO_FQ_BUSY | RIO_FQ_ACTIVE;
+    } else {
+        /* No transition requested; only acknowledge BUSY. */
+        set_bits = 0;
+        clr_bits = RIO_FQ_BUSY;
+    }
+
+    rivos_iommu_reg_mod(s, RIO_REG_FQ_CONTROL, set_bits, clr_bits);
+}
+
+/*
+ * Handle a page request queue (PQ) control register update on the core
+ * thread.  Transitions the queue between inactive and active states,
+ * then clears the BUSY flag via rivos_iommu_reg_mod().
+ */
+static void rivos_iommu_process_pq_control(RivosIOMMUState *s)
+{
+    uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]);
+    uint32_t set_bits;
+    uint32_t clr_bits;
+    bool want_on = !!(ctrl & RIO_PQ_ENABLE);
+    bool is_on = !!(ctrl & RIO_PQ_ACTIVE);
+
+    if (want_on && !is_on) {
+        /* Turn-on request: latch queue base/size, reset producer index. */
+        uint64_t base = ldq_le_p(&s->regs_rw[RIO_REG_PQ_BASE]);
+
+        s->pq_mask = (2ULL << get_field(base, RIO_PQ_MASK_LOG2SZ)) - 1;
+        s->pq_base = get_field(base, RIO_PQ_MASK_PPN) << PGSHIFT;
+        s->pq_tail = 0;
+        rivos_iommu_irq_use(s, RIO_INT_PQ);
+        stl_le_p(&s->regs_rw[RIO_REG_PQ_HEAD], s->pq_tail);
+        stl_le_p(&s->regs_rw[RIO_REG_PQ_TAIL], s->pq_tail);
+        stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~s->pq_mask);
+        set_bits = RIO_PQ_ACTIVE;
+        clr_bits = RIO_PQ_BUSY | RIO_PQ_FAULT | RIO_PQ_FULL;
+    } else if (!want_on && is_on) {
+        /* Turn-off request: release interrupt, lock the head register. */
+        rivos_iommu_irq_unuse(s, RIO_INT_PQ);
+        stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~0);
+        set_bits = 0;
+        clr_bits = RIO_PQ_BUSY | RIO_PQ_ACTIVE;
+    } else {
+        /* No transition requested; only acknowledge BUSY. */
+        set_bits = 0;
+        clr_bits = RIO_PQ_BUSY;
+    }
+
+    rivos_iommu_reg_mod(s, RIO_REG_PQ_CONTROL, set_bits, clr_bits);
+}
+
+/*
+ * Main loop of the IOMMU core processing thread.
+ *
+ * Sleeps on core_cond until an MMIO write signals work, then dispatches
+ * to the queue/DDTP handlers based on the pending BUSY flags and the
+ * command-queue tail doorbell.  core_lock is dropped while a handler
+ * runs so guest MMIO accesses are not blocked during processing, and
+ * re-acquired before re-evaluating the dispatch conditions.
+ */
+static void *rivos_iommu_core_proc(void* arg)
+{
+    RivosIOMMUState *s = arg;
+
+    qemu_mutex_lock(&s->core_lock);
+    while (!s->core_stop) {
+        if (s->cq_tail_db) {
+            /* Command queue tail doorbell was written: fetch commands. */
+            qemu_mutex_unlock(&s->core_lock);
+            rivos_iommu_process_cq_tail(s);
+        } else if (ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]) & RIO_CQ_BUSY) {
+            qemu_mutex_unlock(&s->core_lock);
+            rivos_iommu_process_cq_control(s);
+        } else if (ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]) & RIO_FQ_BUSY) {
+            qemu_mutex_unlock(&s->core_lock);
+            rivos_iommu_process_fq_control(s);
+        } else if (ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]) & RIO_PQ_BUSY) {
+            qemu_mutex_unlock(&s->core_lock);
+            rivos_iommu_process_pq_control(s);
+        } else if (ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & RIO_DDTP_BUSY) {
+            qemu_mutex_unlock(&s->core_lock);
+            rivos_iommu_process_ddtp(s);
+        } else {
+            /* Nothing pending: sleep until an MMIO write signals us. */
+            qemu_cond_wait(&s->core_cond, &s->core_lock);
+            continue;
+        }
+        qemu_mutex_lock(&s->core_lock);
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    return NULL;
+}
+
+/*
+ * Guest write to the control register window.
+ *
+ * The written value is filtered through the regs_ro (read-only bits)
+ * and regs_wc (write-1-to-clear bits) masks before being stored into
+ * regs_rw.  Writes to registers that require deferred processing also
+ * raise the corresponding BUSY flag and wake the core thread.
+ *
+ * NOTE(review): s->cq_tail_db is set outside core_lock and before the
+ * new tail value is stored below; the core thread could observe the
+ * doorbell before the register update — confirm intended ordering.
+ */
+static void rivos_iommu_mmio_write(void *opaque, hwaddr addr, uint64_t val,
+                             unsigned size)
+{
+    RivosIOMMUState *s = opaque;
+    uint64_t busy = 0;
+    bool wakeup = true;
+
+    if (addr + size > sizeof(s->regs_rw)) {
+        /* unsupported MMIO access location */
+        return;
+    }
+
+    /* actionable MMIO write. */
+    switch (addr) {
+        case RIO_REG_DDTP:
+            busy = RIO_DDTP_BUSY;
+            break;
+
+        /* upper half DDTP update */
+        case RIO_REG_DDTP + 4:
+            busy = RIO_DDTP_BUSY >> 32;
+            break;
+
+        case RIO_REG_CQ_TAIL:
+            s->cq_tail_db = true;
+            break;
+
+        case RIO_REG_CQ_CONTROL:
+            busy = RIO_CQ_BUSY;
+            break;
+
+        case RIO_REG_FQ_CONTROL:
+            busy = RIO_FQ_BUSY;
+            break;
+
+        case RIO_REG_PQ_CONTROL:
+            busy = RIO_PQ_BUSY;
+            break;
+
+        default:
+            wakeup = false;
+            break;
+    }
+
+    /*
+     * Apply the RO/W1C-masked update.  Note the BUSY flag is OR-ed into
+     * the current value only for 4- and 8-byte accesses; sub-word writes
+     * to an actionable register do not set BUSY.
+     */
+    qemu_mutex_lock(&s->core_lock);
+    if (size == 1) {
+        uint8_t ro = s->regs_ro[addr];
+        uint8_t wc = s->regs_wc[addr];
+        uint8_t rw = s->regs_rw[addr];
+        s->regs_rw[addr] = ((rw & ro) | (val & ~ro)) & ~(val & wc);
+    } else if (size == 2) {
+        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
+        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
+        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
+        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
+    } else if (size == 4) {
+        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
+        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
+        uint32_t rw = ldl_le_p(&s->regs_rw[addr]) | busy;
+        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
+    } else if (size == 8) {
+        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
+        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
+        uint64_t rw = ldq_le_p(&s->regs_rw[addr]) | busy;
+        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc));
+    }
+
+    /* wakeup core processing thread */
+    if (wakeup) {
+        qemu_cond_signal(&s->core_cond);
+    }
+    qemu_mutex_unlock(&s->core_lock);
+}
+
+/*
+ * Guest read from the control register window.  Returns all-ones for
+ * out-of-range accesses, otherwise the little-endian register content.
+ */
+static uint64_t rivos_iommu_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    RivosIOMMUState *s = opaque;
+
+    if (addr + size > sizeof(s->regs_rw)) {
+        return (uint64_t)-1;
+    }
+
+    switch (size) {
+    case 1:
+        return (uint64_t) s->regs_rw[addr];
+    case 2:
+        return lduw_le_p(&s->regs_rw[addr]);
+    case 4:
+        return ldl_le_p(&s->regs_rw[addr]);
+    case 8:
+        return ldq_le_p(&s->regs_rw[addr]);
+    default:
+        return (uint64_t)-1;
+    }
+}
+
+/*
+ * MMIO register window access ops: little-endian, 1..8 byte accesses,
+ * unaligned accesses rejected by the memory core.
+ */
+static const MemoryRegionOps rivos_iommu_mmio_ops = {
+    .read = rivos_iommu_mmio_read,
+    .write = rivos_iommu_mmio_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    }
+};
+
+/*
+ * Return the DMA address space for a PCI device behind the IOMMU,
+ * creating the per-device RivosIOMMUSpace on first use.
+ *
+ * Fixes: the lookup and the insertion are now performed under a single
+ * core_lock critical section, so two concurrent calls for the same
+ * device can no longer both miss the list and insert duplicates; also
+ * adds the braces required by QEMU coding style and uses g_new0().
+ */
+static AddressSpace *rivos_iommu_dma_as(PCIBus *bus, void *opaque, int devfn)
+{
+    RivosIOMMUState *s = opaque;
+    RivosIOMMUSpace *as;
+    char name[64];
+    uint32_t devid = PCI_BUILD_BDF(pci_bus_num(bus), devfn);
+    uint32_t iommu_devid = pci_get_bdf(&s->pci);
+
+    if (iommu_devid == devid) {
+        /* No translation for IOMMU device itself. */
+        return &address_space_memory;
+    }
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+
+    if (as == NULL) {
+        as = g_new0(RivosIOMMUSpace, 1);
+
+        as->iommu = s;
+        as->devid = devid;
+
+        snprintf(name, sizeof(name), "rivos-iommu-%04x:%02x.%d-iova",
+            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+        memory_region_init_iommu(&as->mr, sizeof(as->mr),
+            TYPE_RIVOS_IOMMU_MEMORY_REGION,
+            OBJECT(as), name, UINT64_MAX);
+
+        address_space_init(&as->as, MEMORY_REGION(&as->mr),
+                           TYPE_RIVOS_IOMMU_PCI);
+
+        QLIST_INSERT_HEAD(&s->spaces, as, list);
+
+        trace_rivos_iommu_new(PCI_BUS_NUM(iommu_devid), PCI_SLOT(iommu_devid),
+            PCI_FUNC(iommu_devid), PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
+            PCI_FUNC(as->devid));
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    return &as->as;
+}
+
+/*
+ * Bring the MMIO registers to their power-on state and install the
+ * read-only / write-1-to-clear masks that govern guest writes.
+ */
+static void rivos_iommu_reg_reset(RivosIOMMUState *s)
+{
+    const uint64_t caps = (s->version & RIO_CAP_REVISION_MASK) |
+                          (s->enable_stage_one * RIO_CAP_STAGE_ONE) |
+                          (s->enable_stage_two * RIO_CAP_STAGE_TWO) |
+                          (s->enable_msi * RIO_CAP_MSI);
+
+    /* Default: every register bit is read-only until unmasked below. */
+    memset(s->regs_ro, 0xff, sizeof(s->regs_ro));
+
+    /* Capabilities register reflects the configured device properties. */
+    stq_le_p(&s->regs_rw[RIO_REG_CAP], caps);
+
+    /* Writable fields of the directory/queue base registers. */
+    stq_le_p(&s->regs_ro[RIO_REG_DDTP],
+        ~(RIO_DDTP_MASK_PPN | RIO_DDTP_MASK_MODE));
+    stq_le_p(&s->regs_ro[RIO_REG_CQ_BASE],
+        ~(RIO_CQ_MASK_LOG2SZ | RIO_CQ_MASK_PPN));
+    stq_le_p(&s->regs_ro[RIO_REG_FQ_BASE],
+        ~(RIO_FQ_MASK_LOG2SZ | RIO_FQ_MASK_PPN));
+    stq_le_p(&s->regs_ro[RIO_REG_PQ_BASE],
+        ~(RIO_PQ_MASK_LOG2SZ | RIO_PQ_MASK_PPN));
+
+    /* Queue controls: status bits are W1C, ACTIVE/BUSY are read-only. */
+    stl_le_p(&s->regs_wc[RIO_REG_CQ_CONTROL],
+        RIO_CQ_FAULT | RIO_CQ_TIMEOUT | RIO_CQ_ERROR);
+    stl_le_p(&s->regs_ro[RIO_REG_CQ_CONTROL], RIO_CQ_ACTIVE | RIO_CQ_BUSY);
+    stl_le_p(&s->regs_wc[RIO_REG_FQ_CONTROL], RIO_FQ_FAULT | RIO_FQ_FULL);
+    stl_le_p(&s->regs_ro[RIO_REG_FQ_CONTROL], RIO_FQ_ACTIVE | RIO_FQ_BUSY);
+    stl_le_p(&s->regs_wc[RIO_REG_PQ_CONTROL], RIO_PQ_FAULT | RIO_PQ_FULL);
+    stl_le_p(&s->regs_ro[RIO_REG_PQ_CONTROL], RIO_PQ_ACTIVE | RIO_PQ_BUSY);
+
+    /* Interrupt pending status: every bit is write-1-to-clear. */
+    stl_le_p(&s->regs_wc[RIO_REG_IPSR], ~0);
+}
+
+/*
+ * PCI realize: initialize register state, start the core processing
+ * thread, set up BAR0 (registers + MSI-X tables), and register this
+ * device as the IOMMU translator for its root bus.
+ *
+ * NOTE(review): on the error-return paths below the core thread created
+ * above keeps running and core_cond/core_lock are never destroyed —
+ * consider unwinding thread/cond/mutex when realize fails.
+ */
+static void rivos_iommu_realize(PCIDevice *dev, Error **errp)
+{
+    DeviceState *d = DEVICE(dev);
+    RivosIOMMUState *s = RIVOS_IOMMU_PCI(d);
+    /* BAR size: register window rounded up to a power-of-two of pages. */
+    const uint64_t bar_size =
+        pow2ceil(QEMU_ALIGN_UP(sizeof(s->regs_rw), TARGET_PAGE_SIZE));
+    Error *err = NULL;
+
+    QLIST_INIT(&s->spaces);
+    qemu_cond_init(&s->core_cond);
+    qemu_mutex_init(&s->core_lock);
+    rivos_iommu_reg_reset(s);
+
+    qemu_thread_create(&s->core_proc, "rivos-iommu-core",
+        rivos_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
+
+    /* Register MMIO window as a sub-region at the start of BAR0. */
+    memory_region_init(&s->bar0, OBJECT(s),
+            "rivos-iommu-bar0", bar_size);
+    memory_region_init_io(&s->mmio, OBJECT(s), &rivos_iommu_mmio_ops, s,
+            "rivos-iommu", sizeof(s->regs_rw));
+    memory_region_add_subregion(&s->bar0, 0, &s->mmio);
+
+    pcie_endpoint_cap_init(dev, 0x80);
+
+    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+            PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0);
+
+    /* MSI-X vector and PBA tables live inside BAR0 as well. */
+    int ret = msix_init(dev, RIO_INT_COUNT,
+                    &s->bar0, 0, RIO_REG_MSI_ADDR_BASE,
+                    &s->bar0, 0, RIO_REG_MSI_PBA_BASE, 0, &err);
+
+    if (ret == -ENOTSUP) {
+        /* MSI-x is not supported by the platform.
+         * Driver should use timer/polling based notification handlers.
+         */
+        warn_report_err(err);
+    } else if (ret < 0) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    /* TODO: find root port bus ranges and use for FDT/ACPI generation. */
+    PCIBus *bus = pci_device_root_bus(dev);
+    if (!bus) {
+        error_setg(errp, "can't find PCIe root port for %02x:%02x.%x",
+            pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn),
+            PCI_FUNC(dev->devfn));
+        return;
+    }
+
+    pci_setup_iommu(bus, rivos_iommu_dma_as, s);
+}
+
+/*
+ * Device teardown: stop the core processing thread and release the
+ * synchronization primitives created in realize().
+ */
+static void rivos_iommu_exit(PCIDevice *dev)
+{
+    RivosIOMMUState *s = RIVOS_IOMMU_PCI(dev);
+
+    /* Request termination under the lock and wait for the worker. */
+    qemu_mutex_lock(&s->core_lock);
+    s->core_stop = true;
+    qemu_cond_signal(&s->core_cond);
+    qemu_mutex_unlock(&s->core_lock);
+    qemu_thread_join(&s->core_proc);
+
+    qemu_cond_destroy(&s->core_cond);
+    qemu_mutex_destroy(&s->core_lock);
+}
+
+/* Migration is not supported yet; mark the device unmigratable. */
+static const VMStateDescription rivos_iommu_vmstate = {
+    .name = "rivos-iommu",
+    .unmigratable = 1
+};
+
+/* User-configurable device properties (capability revision, features). */
+static Property rivos_iommu_properties[] = {
+    DEFINE_PROP_UINT32("version", RivosIOMMUState, version, RIO_CAP_REVISION),
+    /* Use C99 'true' rather than glib's gboolean TRUE for bool fields. */
+    DEFINE_PROP_BOOL("msi", RivosIOMMUState, enable_msi, true),
+    DEFINE_PROP_BOOL("stage-one", RivosIOMMUState, enable_stage_one, true),
+    DEFINE_PROP_BOOL("stage-two", RivosIOMMUState, enable_stage_two, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/* QOM class setup: wire realize/exit hooks and the PCI identity. */
+static void rivos_iommu_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *pci = PCI_DEVICE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    pci->realize = rivos_iommu_realize;
+    pci->exit = rivos_iommu_exit;
+    pci->vendor_id = PCI_VENDOR_ID_RIVOS;
+    pci->device_id = PCI_DEVICE_ID_RIVOS_IOMMU;
+    pci->revision = 0;
+    pci->class_id = PCI_CLASS_SYSTEM_IOMMU;
+
+    device_class_set_props(dc, rivos_iommu_properties);
+    dc->desc = "RIVOS-IOMMU (RIO) DMA Remapping device";
+    dc->vmsd = &rivos_iommu_vmstate;
+    dc->hotpluggable = false;
+    dc->user_creatable = true;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+}
+
+/* QOM type: the IOMMU modeled as a PCIe endpoint device. */
+static const TypeInfo rivos_iommu_pci = {
+    .name = TYPE_RIVOS_IOMMU_PCI,
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(RivosIOMMUState),
+    .class_init = rivos_iommu_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { },
+    },
+};
+
+/* Hook up translation and notifier callbacks for the IOMMU region. */
+static void rivos_iommu_memory_region_class_init(ObjectClass *klass, void *data)
+{
+    IOMMUMemoryRegionClass *mrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    mrc->translate = rivos_iommu_translate;
+    mrc->notify_flag_changed = rivos_iommu_notify_flag_changed;
+}
+
+/* QOM type: per-device IOMMU memory region used for DMA translation. */
+static const TypeInfo rivos_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_RIVOS_IOMMU_MEMORY_REGION,
+    .class_init = rivos_iommu_memory_region_class_init,
+};
+
+/* Register the device and IOMMU memory region QOM types. */
+static void rivos_iommu_register_types(void)
+{
+    type_register_static(&rivos_iommu_pci);
+    type_register_static(&rivos_iommu_memory_region_info);
+}
+
+type_init(rivos_iommu_register_types);
diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
new file mode 100644
index 0000000000..c3618764ed
--- /dev/null
+++ b/hw/riscv/trace-events
@@ -0,0 +1,7 @@ 
+# See documentation at docs/devel/tracing.rst
+
+# rivos-iommu.c
+rivos_iommu_new(int bus, int slot, int func, int dbus, int dslot, int dfunc) "NEW %04x:%02x.%d attached %04x:%02x.%d"
+rivos_iommu_flt(int bus, int slot, int func, int cause, uint64_t iova) "FLT %04x:%02x.%d cause: %d iova: 0x%"PRIx64
+rivos_iommu_dma(int bus, int slot, int func, const char *dir, uint64_t iova, uint64_t phys) "TLB q%04x:%02x.%d %s 0x%"PRIx64" -> 0x%"PRIx64
+rivos_iommu_cmd(int bus, int slot, int func, uint64_t l, uint64_t u) "CMD %04x:%02x.%d 0x%"PRIx64" 0x%"PRIx64
diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
new file mode 100644
index 0000000000..b88504b750
--- /dev/null
+++ b/hw/riscv/trace.h
@@ -0,0 +1,2 @@ 
+#include "trace/trace-hw_riscv.h"
+
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 11abe22d46..73dad2aced 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -88,6 +88,7 @@ 
 #define PCI_CLASS_SYSTEM_RTC             0x0803
 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG     0x0804
 #define PCI_CLASS_SYSTEM_SDHCI           0x0805
+#define PCI_CLASS_SYSTEM_IOMMU           0x0806
 #define PCI_CLASS_SYSTEM_OTHER           0x0880
 
 #define PCI_BASE_CLASS_INPUT             0x09
diff --git a/include/hw/riscv/rivos_iommu.h b/include/hw/riscv/rivos_iommu.h
new file mode 100644
index 0000000000..097086d83e
--- /dev/null
+++ b/include/hw/riscv/rivos_iommu.h
@@ -0,0 +1,80 @@ 
+/*
+ * QEMU emulation of a RISC-V RIVOS-IOMMU
+ *
+ * Copyright (C) 2022 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RIVOS_IOMMU_H
+#define HW_RIVOS_IOMMU_H
+
+#include "hw/sysbus.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
+#include "qom/object.h"
+
+#define TYPE_RIVOS_IOMMU_PCI "rivos-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RivosIOMMUState, RIVOS_IOMMU_PCI)
+
+#define TYPE_RIVOS_IOMMU_MEMORY_REGION "rivos-iommu-memory-region"
+
+typedef struct RivosIOMMUState RivosIOMMUState;
+typedef struct RivosIOMMUSpace RivosIOMMUSpace;
+
+#define RIVOS_IOMMU_REGS_SIZE 0x300  /* control registers space        */
+
+/*
+ * IO virtual address space remapping device state.
+ */
+struct RivosIOMMUState {
+    PCIDevice pci;                   /* Parent PCI device              */
+
+    MemoryRegion bar0;               /* BAR0: registers + MSI-X tables */
+    MemoryRegion mmio;               /* Control register sub-region    */
+    uint8_t regs_rw[RIVOS_IOMMU_REGS_SIZE]; /* MMIO register state     */
+    uint8_t regs_wc[RIVOS_IOMMU_REGS_SIZE]; /* MMIO write-1-to-clear   */
+    uint8_t regs_ro[RIVOS_IOMMU_REGS_SIZE]; /* MMIO read/only mask     */
+
+    /* IOMMU Properties */
+    uint32_t version;                /* Reported capability revision   */
+    bool enable_msi;                 /* Enable MSI translation         */
+    bool enable_stage_one;           /* Enable IOVA->GPA translation   */
+    bool enable_stage_two;           /* Enable GPA->SPA translation    */
+
+    /* Core processing thread synchronization. */
+    QemuCond core_cond;              /* Signalled on MMIO doorbells    */
+    QemuMutex core_lock;             /* Guards thread-shared state     */
+    QemuThread core_proc;            /* Command processing thread      */
+    bool core_stop;                  /* Set at exit to stop the thread */
+
+    /* Device directory table (DDT) shadow state. */
+    hwaddr ddt_base;                 /* DDT root physical address      */
+    uint32_t ddt_mode;               /* Active DDTP.MODE value         */
+    int ddt_depth;
+
+    /* Command queue (CQ) shadow state. */
+    hwaddr cq_base;
+    uint32_t cq_mask;
+    uint32_t cq_head;
+    bool cq_tail_db;                 /* Tail doorbell write pending    */
+
+    /* Fault queue (FQ) shadow state. */
+    hwaddr fq_base;
+    uint32_t fq_mask;
+    uint32_t fq_tail;
+
+    /* Page request queue (PQ) shadow state. */
+    hwaddr pq_base;
+    uint32_t pq_mask;
+    uint32_t pq_tail;
+
+    QLIST_HEAD(, RivosIOMMUSpace) spaces; /* Per-device DMA spaces     */
+};
+
+#endif
diff --git a/meson.build b/meson.build
index bae62efc9c..62d2a56326 100644
--- a/meson.build
+++ b/meson.build
@@ -2688,6 +2688,7 @@  if have_system
     'hw/ppc',
     'hw/rdma',
     'hw/rdma/vmw',
+    'hw/riscv',
     'hw/rtc',
     'hw/s390x',
     'hw/scsi',