diff mbox series

[v9,03/12] hw/riscv: add RISC-V IOMMU base emulation

Message ID 20241004155721.2154626-4-dbarboza@ventanamicro.com (mailing list archive)
State New, archived
Headers show
Series riscv: QEMU RISC-V IOMMU Support | expand

Commit Message

Daniel Henrique Barboza Oct. 4, 2024, 3:57 p.m. UTC
From: Tomasz Jeznach <tjeznach@rivosinc.com>

The RISC-V IOMMU specification is now ratified as-per the RISC-V
international process. The latest frozen specifcation can be found at:

https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

Add the foundation of the device emulation for RISC-V IOMMU. It includes
support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
sv39x4, sv48x4, sv57x4 caps).

Other capabilities like ATS and DBG support will be added incrementally
in the next patches.

Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
---
 hw/riscv/Kconfig            |    4 +
 hw/riscv/meson.build        |    1 +
 hw/riscv/riscv-iommu-bits.h |   18 +
 hw/riscv/riscv-iommu.c      | 2021 +++++++++++++++++++++++++++++++++++
 hw/riscv/riscv-iommu.h      |  126 +++
 hw/riscv/trace-events       |   14 +
 hw/riscv/trace.h            |    1 +
 include/hw/riscv/iommu.h    |   36 +
 meson.build                 |    1 +
 9 files changed, 2222 insertions(+)
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

Comments

Jason Chien Oct. 16, 2024, 3:37 a.m. UTC | #1
Hi Daniel,

On 2024/10/4 下午 11:57, Daniel Henrique Barboza wrote:
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> international process. The latest frozen specifcation can be found at:
>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Add the foundation of the device emulation for RISC-V IOMMU. It includes
> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
> sv39x4, sv48x4, sv57x4 caps).
>
> Other capabilities like ATS and DBG support will be added incrementally
> in the next patches.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> Acked-by: Alistair Francis <alistair.francis@wdc.com>
> ---
>   hw/riscv/Kconfig            |    4 +
>   hw/riscv/meson.build        |    1 +
>   hw/riscv/riscv-iommu-bits.h |   18 +
>   hw/riscv/riscv-iommu.c      | 2021 +++++++++++++++++++++++++++++++++++
>   hw/riscv/riscv-iommu.h      |  126 +++
>   hw/riscv/trace-events       |   14 +
>   hw/riscv/trace.h            |    1 +
>   include/hw/riscv/iommu.h    |   36 +
>   meson.build                 |    1 +
>   9 files changed, 2222 insertions(+)
>   create mode 100644 hw/riscv/riscv-iommu.c
>   create mode 100644 hw/riscv/riscv-iommu.h
>   create mode 100644 hw/riscv/trace-events
>   create mode 100644 hw/riscv/trace.h
>   create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index a2030e3a6f..f69d6e3c8e 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>   config RISCV_NUMA
>       bool
>   
> @@ -47,6 +50,7 @@ config RISCV_VIRT
>       select SERIAL
>       select RISCV_ACLINT
>       select RISCV_APLIC
> +    select RISCV_IOMMU
>       select RISCV_IMSIC
>       select SIFIVE_PLIC
>       select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index f872674093..cbc99c6e8e 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>   riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>   riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>   riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>   
>   hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index c46d7d18ab..b1c477f5c3 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>   /* 5.3 IOMMU Capabilities (64bits) */
>   #define RISCV_IOMMU_REG_CAP             0x0000
>   #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>   #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>   #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>   #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>   
>   /* 5.4 Features control register (32bits) */
>   #define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>   #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>   
>   /* 5.5 Device-directory-table pointer (64bits) */
>   #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -175,6 +185,10 @@ enum {
>   
>   /* 5.27 Interrupt cause to vector (64bits) */
>   #define RISCV_IOMMU_REG_ICVEC           0x02F8
> +#define RISCV_IOMMU_ICVEC_CIV           GENMASK_ULL(3, 0)
> +#define RISCV_IOMMU_ICVEC_FIV           GENMASK_ULL(7, 4)
> +#define RISCV_IOMMU_ICVEC_PMIV          GENMASK_ULL(11, 8)
> +#define RISCV_IOMMU_ICVEC_PIV           GENMASK_ULL(15, 12)
>   
>   /* 5.28 MSI Configuration table (32 * 64bits) */
>   #define RISCV_IOMMU_REG_MSI_CONFIG      0x0300
> @@ -203,6 +217,8 @@ struct riscv_iommu_dc {
>   #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>   #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>   #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>   #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>   #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>   #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> @@ -309,9 +325,11 @@ enum riscv_iommu_fq_causes {
>   
>   /* Translation attributes fields */
>   #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>   
>   /* First stage context fields */
>   #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>   
>   enum riscv_iommu_fq_ttypes {
>       RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..f2679f3740
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,2021 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number coversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without process_id specified. */
> +#define RISCV_IOMMU_NOPROCID 0
> +
> +static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
> +{
> +    switch (vec_type) {
> +    case RISCV_IOMMU_INTR_CQ:
> +        return icvec & RISCV_IOMMU_ICVEC_CIV;
> +    case RISCV_IOMMU_INTR_FQ:
> +        return (icvec & RISCV_IOMMU_ICVEC_FIV) >> 4;
> +    case RISCV_IOMMU_INTR_PM:
> +        return (icvec & RISCV_IOMMU_ICVEC_PMIV) >> 8;
> +    case RISCV_IOMMU_INTR_PQ:
> +        return (icvec & RISCV_IOMMU_ICVEC_PIV) >> 12;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec_type)
> +{
> +    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
> +    uint32_t ipsr, icvec, vector;
> +
> +    if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
> +        return;
> +    }
> +
> +    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +    ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec_type), 0);
> +
> +    if (!(ipsr & (1 << vec_type))) {
> +        vector = riscv_iommu_get_icvec_vector(icvec, vec_type);
> +        s->notify(s, vector);
> +        trace_riscv_iommu_notify_int_vector(vec_type, vector);
> +    }
> +}
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (!s->enable_msi) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @return   : success or fault cause code.
> + */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +    MemTxResult ret;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
> +    /*
> +     * Early check for MSI address match when IOVA == GPA.
> +     * Note that the (!en_s) condition means that the MSI
> +     * page table may only be used when guest pages are
> +     * mapped using the g-stage page table, whether single-
> +     * or two-stage paging is enabled. It's unavoidable though,
> +     * because the spec mandates that we do a first-stage
> +     * translation before we check the MSI page table, which
> +     * means we can't do an early MSI check unless we have
> +     * strictly !en_s.
> +     */
> +    if (!en_s && (iotlb->perm & IOMMU_WO) &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (sc[pass].ptesize == 4) {
> +            uint32_t pte32 = 0;
> +            ret = ldl_le_dma(s->target_as, pte_addr, &pte32,
> +                             MEMTXATTRS_UNSPECIFIED);
> +            pte = pte32;
> +        } else {
> +            ret = ldq_le_dma(s->target_as, pte_addr, &pte,
> +                             MEMTXATTRS_UNSPECIFIED);
> +        }
> +        if (ret != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> +}
> +
> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> +                                     RISCVIOMMUContext *ctx,
> +                                     uint32_t fault_type, uint32_t cause,
> +                                     bool pv,
> +                                     uint64_t iotval, uint64_t iotval2)
> +{
> +    struct riscv_iommu_fq_record ev = { 0 };
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
> +        switch (cause) {
> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
> +            break;
> +        default:
> +            /* DTF prevents reporting a fault for this given cause */
> +            return;
> +        }
> +    }
> +
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
> +
> +    if (pv) {
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
> +    }
> +
> +    ev.iotval = iotval;
> +    ev.iotval2 = iotval2;
> +
> +    riscv_iommu_fault(s, &ev);
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +    int cause;
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        if (res == MEMTX_DECODE_ERROR) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
> +        } else {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        }
> +        goto err;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        /*
> +         * The spec mentions that: "If msipte.C == 1, then further
> +         * processing to interpret the PTE is implementation
> +         * defined.". We'll abort with cause = 262 for this
> +         * case too.
> +         */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
> +        goto err;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
> +        if (res != MEMTX_OK) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +            goto err;
> +        }
> +
> +        return MEMTX_OK;
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    /* Get MRIF enable bits */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
> +
> +    return MEMTX_OK;
> +
> +err:
> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
> +                             !!ctx->process_id, 0, 0);
> +    return res;
> +}
> +
> +/*
> + * Check device context configuration as described by the
> + * riscv-iommu spec section "Device-context configuration
> + * checks".
> + */
> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
> +                                            RISCVIOMMUContext *ctx)
> +{
> +    uint32_t fsc_mode, msi_mode;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
> +        return false;
> +    }
> +
> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
> +        return false;
> +    }
> +
> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
> +
> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +            return false;
> +        }
> +    }
> +
> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
> +        switch (fsc_mode) {
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    } else {
> +        /* DC.tc.PDTV is 0 */
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
> +            return false;
> +        }
> +
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +            }
> +        } else {
> +            switch (fsc_mode) {
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                    return false;
> +                }
> +                break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                    return false;
> +                }
> +            break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                    return false;
> +                }
> +                break;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * CAP_END is always zero (only one endianess). FCTL_BE is
> +     * always zero (little-endian accesses). Thus TC_SBE must
> +     * always be LE, i.e. zero.
> +     */
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * Validate process context (PC) according to section
> + * "Process-context configuration checks".
> + */
> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> +                                             RISCVIOMMUContext *ctx)
> +{
> +    uint32_t mode;
> +
> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
> +        return false;
> +    }
> +
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    switch (mode) {
> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
> +    /* sv39 and sv32 modes have the same value (8) */
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +        break;
> +    default:
> +        return false;
> +    }
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +        if (mode == RISCV_IOMMU_CAP_SV32 &&
> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +        }
> +    } else {
> +        switch (mode) {
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and process_id set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> +    const int dc_fmt = !s->enable_msi;
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            /* invalid directory entry */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            /* reserved bits set */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
> +            /* PID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        return 0;
> +    }
> +
> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default process_id enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default process_id #0 */
> +            ctx->process_id = 0;
> +        }
> +    }
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +
> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean riscv_iommu_ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> +    return c1->devid == c2->devid &&
> +           c1->process_id == c2->process_id;
> +}
> +
> +static guint riscv_iommu_ctx_hash(gconstpointer v)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> +    /*
> +     * Generate simple hash of (process_id, devid)
> +     * assuming 24-bit wide devid.
> +     */
> +    return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
> +}
> +
> +static void riscv_iommu_ctx_inval_devid_procid(gpointer key, gpointer value,
> +                                               gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid &&
> +        ctx->process_id == arg->process_id) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval_devid(gpointer key, gpointer value,
> +                                        gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval_all(gpointer key, gpointer value,
> +                                      gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +                                  uint32_t devid, uint32_t process_id)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .process_id = process_id,
> +    };
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    g_hash_table_foreach(ctx_cache, func, &key);
> +    g_hash_table_unref(ctx_cache);
> +}
> +
> +/* Find or allocate translation context for a given {device_id, process_id} */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +                                          unsigned devid, unsigned process_id,
> +                                          void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .process_id = process_id,
> +    };
> +
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->process_id = process_id;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
> +            g_hash_table_unref(ctx_cache);
> +            ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
> +                                              riscv_iommu_ctx_equal,
> +                                              g_free, NULL);
> +            g_hash_table_ref(ctx_cache);
> +            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +        }
> +        g_hash_table_add(ctx_cache, ctx);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
> +                             fault, !!process_id, 0, 0);
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref) {
> +        g_hash_table_unref((GHashTable *)ref);
> +    }
> +}
> +
> +/* Find or allocate address space for a given device */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +
> +    if (as == NULL) {
> +        char name[64];
> +        as = g_new0(RISCVIOMMUSpace, 1);
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        /* IOVA address space, untranslated addresses */
> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> +            OBJECT(as), "riscv_iommu", UINT64_MAX);
> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
> +
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +
> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +    }
> +    return &as->iova_as;
> +}
> +
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    bool enable_pid;
> +    bool enable_pri;
> +    int fault;
> +
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +
> +    if (enable_pri && fault) {
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +        return fault;
> +    }
> +
> +    if (fault) {
> +        unsigned ttype;
> +
> +        if (iotlb->perm & IOMMU_RW) {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +        } else {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> +        }
> +
> +        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
> +                                 iotlb->iova, iotlb->translated_addr);
> +        return fault;
> +    }
> +
> +    return 0;
> +}
> +
> +/* IOMMU Command Interface */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (!notify) {
> +        return MEMTX_OK;
> +    }
> +
> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +        MEMTXATTRS_UNSPECIFIED);
> +}
> +
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    uint64_t old_ddtp = s->ddtp;
> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    bool ok = false;
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    if (new_mode == old_mode ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> +        ok = true;
> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> +    } else {
> +        new_ddtp = old_ddtp;
> +    }
> +    s->ddtp = new_ddtp;
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> +}
> +
> +/* Command function and opcode field. */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    uint64_t cmd_opcode;
> +    GHFunc func;
> +
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        cmd_opcode = get_field(cmd.dword0,
> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> +
> +        switch (cmd_opcode) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            res = riscv_iommu_iofence(s,
> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
The correct address is cmd.dword1 << 2.
Reviewed-by: Jason Chien <jason.chien@sifive.com>
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            }
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = riscv_iommu_ctx_inval_all;
> +            } else {
> +                /* invalidate all device context matching DID */
> +                func = riscv_iommu_ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = riscv_iommu_ctx_inval_devid_procid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
> +                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> +            RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> +            RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
> +
> +static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
> +{
> +    uint64_t icvec = 0;
> +
> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
> +
> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
> +
> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
> +
> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
> +
> +    trace_riscv_iommu_icvec_write(data, icvec);
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
> +}
> +
> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
> +{
> +    uint32_t cqcsr, fqcsr, pqcsr;
> +    uint32_t ipsr_set = 0;
> +    uint32_t ipsr_clr = 0;
> +
> +    if (data & RISCV_IOMMU_IPSR_CIP) {
> +        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +
> +        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
> +            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
> +    }
> +
> +    if (data & RISCV_IOMMU_IPSR_FIP) {
> +        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +
> +        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
> +            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
> +             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
> +    }
> +
> +    if (data & RISCV_IOMMU_IPSR_PIP) {
> +        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +
> +        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
> +            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
> +             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
> +}
> +
> +/*
> + * Write the resulting value of 'data' for the reg specified
> + * by 'reg_addr', after considering read-only/read-write/write-clear
> + * bits, in the pointer 'dest'.
> + *
> + * The result is written in little-endian.
> + */
> +static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
> +                                      void *dest, hwaddr reg_addr,
> +                                      int size, uint64_t data)
> +{
> +    uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
> +    uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
> +    uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
> +
> +    stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +}
> +
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +                                          uint64_t data, unsigned size,
> +                                          MemTxAttrs attrs)
> +{
> +    riscv_iommu_process_fn *process_fn = NULL;
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    uint32_t busy = 0;
> +    uint64_t val = 0;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        process_fn = riscv_iommu_process_ddtp;
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        process_fn = riscv_iommu_process_cq_tail;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        process_fn = riscv_iommu_process_cq_control;
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        process_fn = riscv_iommu_process_fq_control;
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        process_fn = riscv_iommu_process_pq_control;
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_ICVEC:
> +    case RISCV_IOMMU_REG_IPSR:
> +        /*
> +         * ICVEC and IPSR have special read/write procedures. We'll
> +         * call their respective helpers and exit.
> +         */
> +        riscv_iommu_write_reg_val(s, &val, addr, size, data);
> +
> +        /*
> +         * 'val' is stored as LE. Switch to host endianess
> +         * before using it.
> +         */
> +        val = le64_to_cpu(val);
> +
> +        if (regb == RISCV_IOMMU_REG_ICVEC) {
> +            riscv_iommu_update_icvec(s, val);
> +        } else {
> +            riscv_iommu_update_ipsr(s, val);
> +        }
> +
> +        return MEMTX_OK;
> +
> +    default:
> +        break;
> +    }
> +
> +    /*
> +     * Registers update might be not synchronized with core logic.
> +     * If system software updates register when relevant BUSY bit
> +     * is set IOMMU behavior of additional writes to the register
> +     * is UNSPECIFIED.
> +     */
> +    riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +
> +    if (process_fn) {
> +        process_fn(s);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +    uint8_t *ptr;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    ptr = &s->regs_rw[addr];
> +    val = ldn_le_p(ptr, size);
> +
> +    *data = val;
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide a valid
> + * memory transaction attributes with requested_id set.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res;
> +    void *ref;
> +    uint32_t devid = attrs.requester_id;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx == NULL) {
> +        res = MEMTX_ACCESS_ERROR;
> +    } else {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    riscv_iommu_ctx_put(s, ref);
> +    return res;
> +}
> +
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> +    if (s->enable_msi) {
> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> +    }
> +    if (s->enable_s_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> +    }
> +    if (s->enable_g_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> +    }
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* TODO: method to report supported PID bits */
> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> +
> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +     /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache */
> +    s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
> +                                         riscv_iommu_ctx_equal,
> +                                         g_free, NULL);
> +
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +static const PCIIOMMUOps riscv_iommu_ops = {
> +    .get_address_space = riscv_iommu_find_as,
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +        Error **errp)
> +{
> +    if (bus->iommu_ops &&
> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    return 1 << as->iommu->pid_bits;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..af3fcafc19
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,126 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qom/object.h"
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pid_bits;    /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +    uint64_t icvec_avail_vectors;  /* Available interrupt vectors in ICVEC */
> +
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +    bool enable_msi;      /* Enable MSI remapping */
> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> +    bool enable_g_stage;  /* Enable G-Stage translation */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s, unsigned idx,
> +                                         uint32_t set)
> +{
> +    stl_le_p(s->regs_rw + idx, set);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s, unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s, unsigned idx,
> +                                             uint64_t set, uint64_t clr)
> +{
> +    uint64_t val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s, unsigned idx,
> +                                         uint64_t set)
> +{
> +    stq_le_p(s->regs_rw + idx, set);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..3d5c33102d
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,14 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> +riscv_iommu_notify_int_vector(uint32_t cause, uint32_t vector) "Interrupt cause 0x%x sent via vector 0x%x"
> +riscv_iommu_icvec_write(uint32_t orig, uint32_t actual) "ICVEC write: incoming 0x%x actual 0x%x"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..8c0e3ca1f3
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1 @@
> +#include "trace/trace-hw_riscv.h"
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..80769a1400
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index 10464466ff..71de8a5cd1 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3439,6 +3439,7 @@ if have_system
>       'hw/pci-host',
>       'hw/ppc',
>       'hw/rtc',
> +    'hw/riscv',
>       'hw/s390x',
>       'hw/scsi',
>       'hw/sd',
Daniel Henrique Barboza Oct. 16, 2024, 6:19 p.m. UTC | #2
On 10/16/24 12:37 AM, Jason Chien wrote:
> Hi Daniel,
> 
> On 2024/10/4 下午 11:57, Daniel Henrique Barboza wrote:
>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>
>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>> international process. The latest frozen specifcation can be found at:
>>
>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>
>> Add the foundation of the device emulation for RISC-V IOMMU. It includes
>> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
>> sv39x4, sv48x4, sv57x4 caps).
>>
>> Other capabilities like ATS and DBG support will be added incrementally
>> in the next patches.
>>
>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>> Acked-by: Alistair Francis <alistair.francis@wdc.com>
>> ---

(...)

>> +
>> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
>> +{
>> +    struct riscv_iommu_command cmd;
>> +    MemTxResult res;
>> +    dma_addr_t addr;
>> +    uint32_t tail, head, ctrl;
>> +    uint64_t cmd_opcode;
>> +    GHFunc func;
>> +
>> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
>> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
>> +
>> +    /* Check for pending error or queue processing disabled */
>> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
>> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
>> +        return;
>> +    }
>> +
>> +    while (tail != head) {
>> +        addr = s->cq_addr  + head * sizeof(cmd);
>> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
>> +                              MEMTXATTRS_UNSPECIFIED);
>> +
>> +        if (res != MEMTX_OK) {
>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
>> +            goto fault;
>> +        }
>> +
>> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
>> +
>> +        cmd_opcode = get_field(cmd.dword0,
>> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
>> +
>> +        switch (cmd_opcode) {
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
>> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
>> +            res = riscv_iommu_iofence(s,
>> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> The correct address is cmd.dword1 << 2.

You're right. In riscv-iommu spec, section 3.1.2 "IOMMU Command-queue Fence commands":

The AV command operand indicates if ADDR[63:2] operand and DATA operands are valid.
If AV=1, the IOMMU writes DATA to memory at a 4-byte aligned address ADDR[63:2] * 4 as
a 4-byte store when the command completes. When AV is 0, the ADDR[63:2] and DATA
operands are ignored.


I'll fix this instance and the other 2 instances you pointed out in patch 8.

> Reviewed-by: Jason Chien <jason.chien@sifive.com>

Thanks for the tag!


Daniel

>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
>> +
>> +            if (res != MEMTX_OK) {
>> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
>> +                goto fault;
>> +            }
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
>> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
>> +                goto cmd_ill;
>> +            }
>> +            /* translation cache not implemented yet */
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>> +            /* translation cache not implemented yet */
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>> +                /* invalidate all device context cache mappings */
>> +                func = riscv_iommu_ctx_inval_all;
>> +            } else {
>> +                /* invalidate all device context matching DID */
>> +                func = riscv_iommu_ctx_inval_devid;
>> +            }
>> +            riscv_iommu_ctx_inval(s, func,
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>> +                /* illegal command arguments IODIR_PDT & DV == 0 */
>> +                goto cmd_ill;
>> +            } else {
>> +                func = riscv_iommu_ctx_inval_devid_procid;
>> +            }
>> +            riscv_iommu_ctx_inval(s, func,
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
>> +            break;
>> +
>> +        default:
>> +        cmd_ill:
>> +            /* Invalid instruction, do not advance instruction index. */
>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
>> +            goto fault;
>> +        }
>> +
>> +        /* Advance and update head pointer after command completes. */
>> +        head = (head + 1) & s->cq_mask;
>> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
>> +    }
>> +    return;
>> +
>> +fault:
>> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
>> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
>> +    }
>> +}
>> +
>> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
>> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
>> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
>> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
>> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
>> +                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
>> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
>> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
>> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
>> +            RISCV_IOMMU_FQCSR_FQOF;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
>> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
>> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
>> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
>> +            RISCV_IOMMU_PQCSR_PQOF;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
>> +
>> +static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
>> +{
>> +    uint64_t icvec = 0;
>> +
>> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
>> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
>> +
>> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
>> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
>> +
>> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
>> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
>> +
>> +    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
>> +                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
>> +
>> +    trace_riscv_iommu_icvec_write(data, icvec);
>> +
>> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
>> +}
>> +
>> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
>> +{
>> +    uint32_t cqcsr, fqcsr, pqcsr;
>> +    uint32_t ipsr_set = 0;
>> +    uint32_t ipsr_clr = 0;
>> +
>> +    if (data & RISCV_IOMMU_IPSR_CIP) {
>> +        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +
>> +        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
>> +            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>> +    }
>> +
>> +    if (data & RISCV_IOMMU_IPSR_FIP) {
>> +        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>> +
>> +        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
>> +            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
>> +             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>> +    }
>> +
>> +    if (data & RISCV_IOMMU_IPSR_PIP) {
>> +        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>> +
>> +        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
>> +            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
>> +             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
>> +}
>> +
>> +/*
>> + * Write the resulting value of 'data' for the reg specified
>> + * by 'reg_addr', after considering read-only/read-write/write-clear
>> + * bits, in the pointer 'dest'.
>> + *
>> + * The result is written in little-endian.
>> + */
>> +static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
>> +                                      void *dest, hwaddr reg_addr,
>> +                                      int size, uint64_t data)
>> +{
>> +    uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
>> +    uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
>> +    uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
>> +
>> +    stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +}
>> +
>> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
>> +                                          uint64_t data, unsigned size,
>> +                                          MemTxAttrs attrs)
>> +{
>> +    riscv_iommu_process_fn *process_fn = NULL;
>> +    RISCVIOMMUState *s = opaque;
>> +    uint32_t regb = addr & ~3;
>> +    uint32_t busy = 0;
>> +    uint64_t val = 0;
>> +
>> +    if ((addr & (size - 1)) != 0) {
>> +        /* Unsupported MMIO alignment or access size */
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>> +        /* Unsupported MMIO access location. */
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    /* Track actionable MMIO write. */
>> +    switch (regb) {
>> +    case RISCV_IOMMU_REG_DDTP:
>> +    case RISCV_IOMMU_REG_DDTP + 4:
>> +        process_fn = riscv_iommu_process_ddtp;
>> +        regb = RISCV_IOMMU_REG_DDTP;
>> +        busy = RISCV_IOMMU_DDTP_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_CQT:
>> +        process_fn = riscv_iommu_process_cq_tail;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_CQCSR:
>> +        process_fn = riscv_iommu_process_cq_control;
>> +        busy = RISCV_IOMMU_CQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_FQCSR:
>> +        process_fn = riscv_iommu_process_fq_control;
>> +        busy = RISCV_IOMMU_FQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_PQCSR:
>> +        process_fn = riscv_iommu_process_pq_control;
>> +        busy = RISCV_IOMMU_PQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_ICVEC:
>> +    case RISCV_IOMMU_REG_IPSR:
>> +        /*
>> +         * ICVEC and IPSR have special read/write procedures. We'll
>> +         * call their respective helpers and exit.
>> +         */
>> +        riscv_iommu_write_reg_val(s, &val, addr, size, data);
>> +
>> +        /*
>> +         * 'val' is stored as LE. Switch to host endianess
>> +         * before using it.
>> +         */
>> +        val = le64_to_cpu(val);
>> +
>> +        if (regb == RISCV_IOMMU_REG_ICVEC) {
>> +            riscv_iommu_update_icvec(s, val);
>> +        } else {
>> +            riscv_iommu_update_ipsr(s, val);
>> +        }
>> +
>> +        return MEMTX_OK;
>> +
>> +    default:
>> +        break;
>> +    }
>> +
>> +    /*
>> +     * Registers update might be not synchronized with core logic.
>> +     * If system software updates register when relevant BUSY bit
>> +     * is set IOMMU behavior of additional writes to the register
>> +     * is UNSPECIFIED.
>> +     */
>> +    riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
>> +
>> +    /* Busy flag update, MSB 4-byte register. */
>> +    if (busy) {
>> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
>> +        stl_le_p(&s->regs_rw[regb], rw | busy);
>> +    }
>> +
>> +    if (process_fn) {
>> +        process_fn(s);
>> +    }
>> +
>> +    return MEMTX_OK;
>> +}
>> +
>> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState *s = opaque;
>> +    uint64_t val = -1;
>> +    uint8_t *ptr;
>> +
>> +    if ((addr & (size - 1)) != 0) {
>> +        /* Unsupported MMIO alignment. */
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    ptr = &s->regs_rw[addr];
>> +    val = ldn_le_p(ptr, size);
>> +
>> +    *data = val;
>> +
>> +    return MEMTX_OK;
>> +}
>> +
>> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
>> +    .read_with_attrs = riscv_iommu_mmio_read,
>> +    .write_with_attrs = riscv_iommu_mmio_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +        .unaligned = false,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    }
>> +};
>> +
>> +/*
>> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
>> + * memory region as untranslated address, for additional MSI/MRIF interception
>> + * by IOMMU interrupt remapping implementation.
>> + * Note: Device emulation code generating an MSI is expected to provide a valid
>> + * memory transaction attributes with requested_id set.
>> + */
>> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
>> +    RISCVIOMMUContext *ctx;
>> +    MemTxResult res;
>> +    void *ref;
>> +    uint32_t devid = attrs.requester_id;
>> +
>> +    if (attrs.unspecified) {
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>> +    devid |= s->bus << 8;
>> +
>> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
>> +    if (ctx == NULL) {
>> +        res = MEMTX_ACCESS_ERROR;
>> +    } else {
>> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
>> +    }
>> +    riscv_iommu_ctx_put(s, ref);
>> +    return res;
>> +}
>> +
>> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    return MEMTX_ACCESS_ERROR;
>> +}
>> +
>> +static const MemoryRegionOps riscv_iommu_trap_ops = {
>> +    .read_with_attrs = riscv_iommu_trap_read,
>> +    .write_with_attrs = riscv_iommu_trap_write,
>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +        .unaligned = true,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    }
>> +};
>> +
>> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>> +{
>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>> +
>> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
>> +    if (s->enable_msi) {
>> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>> +    }
>> +    if (s->enable_s_stage) {
>> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
>> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
>> +    }
>> +    if (s->enable_g_stage) {
>> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
>> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
>> +    }
>> +    /* Report QEMU target physical address space limits */
>> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>> +                       TARGET_PHYS_ADDR_SPACE_BITS);
>> +
>> +    /* TODO: method to report supported PID bits */
>> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
>> +    s->cap |= RISCV_IOMMU_CAP_PD8;
>> +
>> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
>> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
>> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
>> +
>> +    /* register storage */
>> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +
>> +     /* Mark all registers read-only */
>> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
>> +
>> +    /*
>> +     * Register complete MMIO space, including MSI/PBA registers.
>> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
>> +     * managed directly by the PCIDevice implementation.
>> +     */
>> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
>> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
>> +
>> +    /* Set power-on register state */
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
>> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
>> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
>> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
>> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
>> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
>> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
>> +        RISCV_IOMMU_CQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
>> +        RISCV_IOMMU_FQCSR_FQOF);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
>> +        RISCV_IOMMU_FQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
>> +        RISCV_IOMMU_PQCSR_PQOF);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
>> +        RISCV_IOMMU_PQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
>> +
>> +    /* Memory region for downstream access, if specified. */
>> +    if (s->target_mr) {
>> +        s->target_as = g_new0(AddressSpace, 1);
>> +        address_space_init(s->target_as, s->target_mr,
>> +            "riscv-iommu-downstream");
>> +    } else {
>> +        /* Fallback to global system memory. */
>> +        s->target_as = &address_space_memory;
>> +    }
>> +
>> +    /* Memory region for untranslated MRIF/MSI writes */
>> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
>> +            "riscv-iommu-trap", ~0ULL);
>> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
>> +
>> +    /* Device translation context cache */
>> +    s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
>> +                                         riscv_iommu_ctx_equal,
>> +                                         g_free, NULL);
>> +
>> +    s->iommus.le_next = NULL;
>> +    s->iommus.le_prev = NULL;
>> +    QLIST_INIT(&s->spaces);
>> +}
>> +
>> +static void riscv_iommu_unrealize(DeviceState *dev)
>> +{
>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>> +
>> +    g_hash_table_unref(s->ctx_cache);
>> +}
>> +
>> +static Property riscv_iommu_properties[] = {
>> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
>> +        RISCV_IOMMU_SPEC_DOT_VER),
>> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
>> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
>> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
>> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
>> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
>> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
>> +        TYPE_MEMORY_REGION, MemoryRegion *),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
>> +    dc->user_creatable = false;
>> +    dc->realize = riscv_iommu_realize;
>> +    dc->unrealize = riscv_iommu_unrealize;
>> +    device_class_set_props(dc, riscv_iommu_properties);
>> +}
>> +
>> +static const TypeInfo riscv_iommu_info = {
>> +    .name = TYPE_RISCV_IOMMU,
>> +    .parent = TYPE_DEVICE,
>> +    .instance_size = sizeof(RISCVIOMMUState),
>> +    .class_init = riscv_iommu_class_init,
>> +};
>> +
>> +static const char *IOMMU_FLAG_STR[] = {
>> +    "NA",
>> +    "RO",
>> +    "WR",
>> +    "RW",
>> +};
>> +
>> +/* RISC-V IOMMU Memory Region - Address Translation Space */
>> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
>> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
>> +    IOMMUAccessFlags flag, int iommu_idx)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +    RISCVIOMMUContext *ctx;
>> +    void *ref;
>> +    IOMMUTLBEntry iotlb = {
>> +        .iova = addr,
>> +        .target_as = as->iommu->target_as,
>> +        .addr_mask = ~0ULL,
>> +        .perm = flag,
>> +    };
>> +
>> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
>> +    if (ctx == NULL) {
>> +        /* Translation disabled or invalid. */
>> +        iotlb.addr_mask = 0;
>> +        iotlb.perm = IOMMU_NONE;
>> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
>> +        /* Translation disabled or fault reported. */
>> +        iotlb.addr_mask = 0;
>> +        iotlb.perm = IOMMU_NONE;
>> +    }
>> +
>> +    /* Trace all dma translations with original access flags. */
>> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
>> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
>> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
>> +                          iotlb.translated_addr);
>> +
>> +    riscv_iommu_ctx_put(as->iommu, ref);
>> +
>> +    return iotlb;
>> +}
>> +
>> +static int riscv_iommu_memory_region_notify(
>> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
>> +    IOMMUNotifierFlag new, Error **errp)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +
>> +    if (old == IOMMU_NOTIFIER_NONE) {
>> +        as->notifier = true;
>> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
>> +    } else if (new == IOMMU_NOTIFIER_NONE) {
>> +        as->notifier = false;
>> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static inline bool pci_is_iommu(PCIDevice *pdev)
>> +{
>> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
>> +}
>> +
>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>> +{
>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>> +    AddressSpace *as = NULL;
>> +
>> +    if (pdev && pci_is_iommu(pdev)) {
>> +        return s->target_as;
>> +    }
>> +
>> +    /* Find first registered IOMMU device */
>> +    while (s->iommus.le_prev) {
>> +        s = *(s->iommus.le_prev);
>> +    }
>> +
>> +    /* Find first matching IOMMU */
>> +    while (s != NULL && as == NULL) {
>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>> +        s = s->iommus.le_next;
>> +    }
>> +
>> +    return as ? as : &address_space_memory;
>> +}
>> +
>> +static const PCIIOMMUOps riscv_iommu_ops = {
>> +    .get_address_space = riscv_iommu_find_as,
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +        Error **errp)
>> +{
>> +    if (bus->iommu_ops &&
>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>> +    } else {
>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>> +            pci_bus_num(bus));
>> +    }
>> +}
>> +
>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>> +    MemTxAttrs attrs)
>> +{
>> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
>> +}
>> +
>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +    return 1 << as->iommu->pid_bits;
>> +}
>> +
>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>> +{
>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>> +
>> +    imrc->translate = riscv_iommu_memory_region_translate;
>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>> +}
>> +
>> +static const TypeInfo riscv_iommu_memory_region_info = {
>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>> +    .class_init = riscv_iommu_memory_region_init,
>> +};
>> +
>> +static void riscv_iommu_register_mr_types(void)
>> +{
>> +    type_register_static(&riscv_iommu_memory_region_info);
>> +    type_register_static(&riscv_iommu_info);
>> +}
>> +
>> +type_init(riscv_iommu_register_mr_types);
>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>> new file mode 100644
>> index 0000000000..af3fcafc19
>> --- /dev/null
>> +++ b/hw/riscv/riscv-iommu.h
>> @@ -0,0 +1,126 @@
>> +/*
>> + * QEMU emulation of an RISC-V IOMMU
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2 or later, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_STATE_H
>> +#define HW_RISCV_IOMMU_STATE_H
>> +
>> +#include "qom/object.h"
>> +#include "hw/riscv/iommu.h"
>> +
>> +struct RISCVIOMMUState {
>> +    /*< private >*/
>> +    DeviceState parent_obj;
>> +
>> +    /*< public >*/
>> +    uint32_t version;     /* Reported interface version number */
>> +    uint32_t pid_bits;    /* process identifier width */
>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>> +
>> +    uint64_t cap;         /* IOMMU supported capabilities */
>> +    uint64_t fctl;        /* IOMMU enabled features */
>> +    uint64_t icvec_avail_vectors;  /* Available interrupt vectors in ICVEC */
>> +
>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>> +    bool enable_msi;      /* Enable MSI remapping */
>> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
>> +    bool enable_g_stage;  /* Enable G-Stage translation */
>> +
>> +    /* IOMMU Internal State */
>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>> +
>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>> +
>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>> +
>> +    /* interrupt notifier */
>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>> +
>> +    /* IOMMU State Machine */
>> +    QemuThread core_proc; /* Background processing thread */
>> +    QemuCond core_cond;   /* Background processing wake up signal */
>> +    unsigned core_exec;   /* Processing thread execution actions */
>> +
>> +    /* IOMMU target address space */
>> +    AddressSpace *target_as;
>> +    MemoryRegion *target_mr;
>> +
>> +    /* MSI / MRIF access trap */
>> +    AddressSpace trap_as;
>> +    MemoryRegion trap_mr;
>> +
>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>> +
>> +    /* MMIO Hardware Interface */
>> +    MemoryRegion regs_mr;
>> +    uint8_t *regs_rw;  /* register state (user write) */
>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>> +    uint8_t *regs_ro;  /* read-only mask */
>> +
>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +         Error **errp);
>> +
>> +/* private helpers */
>> +
>> +/* Register helper functions */
>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>> +    unsigned idx, uint32_t set, uint32_t clr)
>> +{
>> +    uint32_t val = ldl_le_p(s->regs_rw + idx);
>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s, unsigned idx,
>> +                                         uint32_t set)
>> +{
>> +    stl_le_p(s->regs_rw + idx, set);
>> +}
>> +
>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s, unsigned idx)
>> +{
>> +    return ldl_le_p(s->regs_rw + idx);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s, unsigned idx,
>> +                                             uint64_t set, uint64_t clr)
>> +{
>> +    uint64_t val = ldq_le_p(s->regs_rw + idx);
>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s, unsigned idx,
>> +                                         uint64_t set)
>> +{
>> +    stq_le_p(s->regs_rw + idx, set);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>> +    unsigned idx)
>> +{
>> +    return ldq_le_p(s->regs_rw + idx);
>> +}
>> +#endif
>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>> new file mode 100644
>> index 0000000000..3d5c33102d
>> --- /dev/null
>> +++ b/hw/riscv/trace-events
>> @@ -0,0 +1,14 @@
>> +# See documentation at docs/devel/tracing.rst
>> +
>> +# riscv-iommu.c
>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>> +riscv_iommu_notify_int_vector(uint32_t cause, uint32_t vector) "Interrupt cause 0x%x sent via vector 0x%x"
>> +riscv_iommu_icvec_write(uint32_t orig, uint32_t actual) "ICVEC write: incoming 0x%x actual 0x%x"
>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>> new file mode 100644
>> index 0000000000..8c0e3ca1f3
>> --- /dev/null
>> +++ b/hw/riscv/trace.h
>> @@ -0,0 +1 @@
>> +#include "trace/trace-hw_riscv.h"
>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>> new file mode 100644
>> index 0000000000..80769a1400
>> --- /dev/null
>> +++ b/include/hw/riscv/iommu.h
>> @@ -0,0 +1,36 @@
>> +/*
>> + * QEMU emulation of an RISC-V IOMMU
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2 or later, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_H
>> +#define HW_RISCV_IOMMU_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "qom/object.h"
>> +
>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>> +
>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>> +
>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>> +
>> +#endif
>> diff --git a/meson.build b/meson.build
>> index 10464466ff..71de8a5cd1 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -3439,6 +3439,7 @@ if have_system
>>       'hw/pci-host',
>>       'hw/ppc',
>>       'hw/rtc',
>> +    'hw/riscv',
>>       'hw/s390x',
>>       'hw/scsi',
>>       'hw/sd',
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index a2030e3a6f..f69d6e3c8e 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@ 
+config RISCV_IOMMU
+    bool
+
 config RISCV_NUMA
     bool
 
@@ -47,6 +50,7 @@  config RISCV_VIRT
     select SERIAL
     select RISCV_ACLINT
     select RISCV_APLIC
+    select RISCV_IOMMU
     select RISCV_IMSIC
     select SIFIVE_PLIC
     select SIFIVE_TEST
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index f872674093..cbc99c6e8e 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -10,5 +10,6 @@  riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
 riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
+riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
index c46d7d18ab..b1c477f5c3 100644
--- a/hw/riscv/riscv-iommu-bits.h
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -69,6 +69,14 @@  struct riscv_iommu_pq_record {
 /* 5.3 IOMMU Capabilities (64bits) */
 #define RISCV_IOMMU_REG_CAP             0x0000
 #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
+#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
+#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
+#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
+#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
+#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
+#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
 #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
 #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
 #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
@@ -80,7 +88,9 @@  struct riscv_iommu_pq_record {
 
 /* 5.4 Features control register (32bits) */
 #define RISCV_IOMMU_REG_FCTL            0x0008
+#define RISCV_IOMMU_FCTL_BE             BIT(0)
 #define RISCV_IOMMU_FCTL_WSI            BIT(1)
+#define RISCV_IOMMU_FCTL_GXL            BIT(2)
 
 /* 5.5 Device-directory-table pointer (64bits) */
 #define RISCV_IOMMU_REG_DDTP            0x0010
@@ -175,6 +185,10 @@  enum {
 
 /* 5.27 Interrupt cause to vector (64bits) */
 #define RISCV_IOMMU_REG_ICVEC           0x02F8
+#define RISCV_IOMMU_ICVEC_CIV           GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_ICVEC_FIV           GENMASK_ULL(7, 4)
+#define RISCV_IOMMU_ICVEC_PMIV          GENMASK_ULL(11, 8)
+#define RISCV_IOMMU_ICVEC_PIV           GENMASK_ULL(15, 12)
 
 /* 5.28 MSI Configuration table (32 * 64bits) */
 #define RISCV_IOMMU_REG_MSI_CONFIG      0x0300
@@ -203,6 +217,8 @@  struct riscv_iommu_dc {
 #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
 #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
 #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
 #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
 #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
 #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
@@ -309,9 +325,11 @@  enum riscv_iommu_fq_causes {
 
 /* Translation attributes fields */
 #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
+#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
 
 /* First stage context fields */
 #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
 
 enum riscv_iommu_fq_ttypes {
     RISCV_IOMMU_FQ_TTYPE_NONE = 0,
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
new file mode 100644
index 0000000000..f2679f3740
--- /dev/null
+++ b/hw/riscv/riscv-iommu.c
@@ -0,0 +1,2021 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2021-2023, Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_device.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/timer.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+#include "trace.h"
+
+#define LIMIT_CACHE_CTX               (1U << 7)
+#define LIMIT_CACHE_IOT               (1U << 20)
+
+/* Physical page number coversions */
+#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
+#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
+
+typedef struct RISCVIOMMUContext RISCVIOMMUContext;
+typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
+
+/* Device assigned I/O address space */
+struct RISCVIOMMUSpace {
+    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
+    AddressSpace iova_as;       /* IOVA address space for attached device */
+    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
+    uint32_t devid;             /* Requester identifier, AKA device_id */
+    bool notifier;              /* IOMMU unmap notifier enabled */
+    QLIST_ENTRY(RISCVIOMMUSpace) list;
+};
+
+/* Device translation context state. */
+struct RISCVIOMMUContext {
+    uint64_t devid:24;          /* Requester Id, AKA device_id */
+    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
+    uint64_t tc;                /* Translation Control */
+    uint64_t ta;                /* Translation Attributes */
+    uint64_t satp;              /* S-Stage address translation and protection */
+    uint64_t gatp;              /* G-Stage address translation and protection */
+    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
+    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
+    uint64_t msiptp;            /* MSI redirection page table pointer */
+};
+
+/* IOMMU index for transactions without process_id specified. */
+#define RISCV_IOMMU_NOPROCID 0
+
+static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
+{
+    switch (vec_type) {
+    case RISCV_IOMMU_INTR_CQ:
+        return icvec & RISCV_IOMMU_ICVEC_CIV;
+    case RISCV_IOMMU_INTR_FQ:
+        return (icvec & RISCV_IOMMU_ICVEC_FIV) >> 4;
+    case RISCV_IOMMU_INTR_PM:
+        return (icvec & RISCV_IOMMU_ICVEC_PMIV) >> 8;
+    case RISCV_IOMMU_INTR_PQ:
+        return (icvec & RISCV_IOMMU_ICVEC_PIV) >> 12;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec_type)
+{
+    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
+    uint32_t ipsr, icvec, vector;
+
+    if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
+        return;
+    }
+
+    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
+    ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec_type), 0);
+
+    if (!(ipsr & (1 << vec_type))) {
+        vector = riscv_iommu_get_icvec_vector(icvec, vec_type);
+        s->notify(s, vector);
+        trace_riscv_iommu_notify_int_vector(vec_type, vector);
+    }
+}
+
+static void riscv_iommu_fault(RISCVIOMMUState *s,
+                              struct riscv_iommu_fq_record *ev)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
+    uint32_t next = (tail + 1) & s->fq_mask;
+    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
+
+    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), ev->hdr, ev->iotval);
+
+    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
+        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                              RISCV_IOMMU_FQCSR_FQOF, 0);
+    } else {
+        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
+        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                                  RISCV_IOMMU_FQCSR_FQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
+    }
+}
+
+static void riscv_iommu_pri(RISCVIOMMUState *s,
+    struct riscv_iommu_pq_record *pr)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
+    uint32_t next = (tail + 1) & s->pq_mask;
+    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
+
+    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), pr->payload);
+
+    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
+        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                              RISCV_IOMMU_PQCSR_PQOF, 0);
+    } else {
+        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
+        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                                  RISCV_IOMMU_PQCSR_PQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
+    }
+}
+
+/* Portable implementation of pext_u64, bit-mask extraction. */
+static uint64_t _pext_u64(uint64_t val, uint64_t ext)
+{
+    uint64_t ret = 0;
+    uint64_t rot = 1;
+
+    while (ext) {
+        if (ext & 1) {
+            if (val & 1) {
+                ret |= rot;
+            }
+            rot <<= 1;
+        }
+        val >>= 1;
+        ext >>= 1;
+    }
+
+    return ret;
+}
+
+/* Check if GPA matches MSI/MRIF pattern. */
+static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    dma_addr_t gpa)
+{
+    if (!s->enable_msi) {
+        return false;
+    }
+
+    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
+        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+        return false; /* Invalid MSI/MRIF mode */
+    }
+
+    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
+        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
+    }
+
+    return true;
+}
+
+/*
+ * RISCV IOMMU Address Translation Lookup - Page Table Walk
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementation can be merged into single helper function in future.
+ * Keeping them separate for now, as error reporting and flow specifics are
+ * sufficiently different for separate implementation.
+ *
+ * @s        : IOMMU Device State
+ * @ctx      : Translation context for device id and process address space id.
+ * @iotlb    : translation data: physical address and access mode.
+ * @return   : success or fault cause code.
+ */
+static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    dma_addr_t addr, base;
+    uint64_t satp, gatp, pte;
+    bool en_s, en_g;
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc[2];
+    /* Translation stage phase */
+    enum {
+        S_STAGE = 0,
+        G_STAGE = 1,
+    } pass;
+    MemTxResult ret;
+
+    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
+    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
+    /*
+     * Early check for MSI address match when IOVA == GPA.
+     * Note that the (!en_s) condition means that the MSI
+     * page table may only be used when guest pages are
+     * mapped using the g-stage page table, whether single-
+     * or two-stage paging is enabled. It's unavoidable though,
+     * because the spec mandates that we do a first-stage
+     * translation before we check the MSI page table, which
+     * means we can't do an early MSI check unless we have
+     * strictly !en_s.
+     */
+    if (!en_s && (iotlb->perm & IOMMU_WO) &&
+        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
+        iotlb->target_as = &s->trap_as;
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        return 0;
+    }
+
+    /* Exit early for pass-through mode. */
+    if (!(en_s || en_g)) {
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        /* Allow R/W in pass-through mode */
+        iotlb->perm = IOMMU_RW;
+        return 0;
+    }
+
+    /* S/G translation parameters. */
+    for (pass = 0; pass < 2; pass++) {
+        uint32_t sv_mode;
+
+        sc[pass].step = 0;
+        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
+            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
+            /* 32bit mode for GXL/SXL == 1 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 2;
+                sc[pass].ptidxbits = 10;
+                sc[pass].ptesize   = 4;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        } else {
+            /* 64bit mode for GXL/SXL == 0 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 3;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 4;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 5;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        }
+    };
+
+    /* S/G stages translation tables root pointers */
+    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
+    addr = (en_s && en_g) ? satp : iotlb->iova;
+    base = en_g ? gatp : satp;
+    pass = en_g ? G_STAGE : S_STAGE;
+
+    do {
+        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
+        const unsigned va_bits = widened + sc[pass].ptidxbits;
+        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
+                                 (sc[pass].levels - 1 - sc[pass].step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
+        const bool ade =
+            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
+
+        /* Address range check before first level lookup */
+        if (!sc[pass].step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+            }
+        }
+
+        /* Read page table entry */
+        if (sc[pass].ptesize == 4) {
+            uint32_t pte32 = 0;
+            ret = ldl_le_dma(s->target_as, pte_addr, &pte32,
+                             MEMTXATTRS_UNSPECIFIED);
+            pte = pte32;
+        } else {
+            ret = ldq_le_dma(s->target_as, pte_addr, &pte,
+                             MEMTXATTRS_UNSPECIFIED);
+        }
+        if (ret != MEMTX_OK) {
+            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
+                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
+        }
+
+        sc[pass].step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            break;                /* Invalid PTE */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            break;                /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            break;                /* Misaligned PPN */
+        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
+            break;                /* Read access check failed */
+        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
+            break;                /* Write access check failed */
+        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
+            break;                /* Access bit not set */
+        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
+            break;                /* Dirty bit not set */
+        } else {
+            /* Leaf PTE, translation completed. */
+            sc[pass].step = sc[pass].levels;
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            /* Update address mask based on smallest translation granularity */
+            iotlb->addr_mask &= (1ULL << va_skip) - 1;
+            /* Continue with S-Stage translation? */
+            if (pass && sc[0].step != sc[0].levels) {
+                pass = S_STAGE;
+                addr = iotlb->iova;
+                continue;
+            }
+            /* Translation phase completed (GPA or SPA) */
+            iotlb->translated_addr = base;
+            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
+                                                         : IOMMU_RO;
+
+            /* Check MSI GPA address match */
+            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
+                riscv_iommu_msi_check(s, ctx, base)) {
+                /* Trap MSI writes and return GPA address. */
+                iotlb->target_as = &s->trap_as;
+                iotlb->addr_mask = ~TARGET_PAGE_MASK;
+                return 0;
+            }
+
+            /* Continue with G-Stage translation? */
+            if (!pass && en_g) {
+                pass = G_STAGE;
+                addr = base;
+                base = gatp;
+                sc[pass].step = 0;
+                continue;
+            }
+
+            return 0;
+        }
+
+        if (sc[pass].step == sc[pass].levels) {
+            break; /* Can't find leaf PTE */
+        }
+
+        /* Continue with G-Stage translation? */
+        if (!pass && en_g) {
+            pass = G_STAGE;
+            addr = base;
+            base = gatp;
+            sc[pass].step = 0;
+        }
+    } while (1);
+
+    return (iotlb->perm & IOMMU_WO) ?
+                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
+                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
+}
+
+static void riscv_iommu_report_fault(RISCVIOMMUState *s,
+                                     RISCVIOMMUContext *ctx,
+                                     uint32_t fault_type, uint32_t cause,
+                                     bool pv,
+                                     uint64_t iotval, uint64_t iotval2)
+{
+    struct riscv_iommu_fq_record ev = { 0 };
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
+        switch (cause) {
+        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
+        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
+        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
+            break;
+        default:
+            /* DTF prevents reporting a fault for this given cause */
+            return;
+        }
+    }
+
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
+
+    if (pv) {
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
+    }
+
+    ev.iotval = iotval;
+    ev.iotval2 = iotval2;
+
+    riscv_iommu_fault(s, &ev);
+}
+
+/* Redirect MSI write for given GPA. */
+static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
+    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
+    unsigned size, MemTxAttrs attrs)
+{
+    MemTxResult res;
+    dma_addr_t addr;
+    uint64_t intn;
+    uint32_t n190;
+    uint64_t pte[2];
+    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
+    int cause;
+
+    /* Interrupt File Number */
+    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
+    if (intn >= 256) {
+        /* Interrupt file number out of range */
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    /* fetch MSI PTE */
+    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
+    addr = addr | (intn * sizeof(pte));
+    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
+            MEMTXATTRS_UNSPECIFIED);
+    if (res != MEMTX_OK) {
+        if (res == MEMTX_DECODE_ERROR) {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
+        } else {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        }
+        goto err;
+    }
+
+    le64_to_cpus(&pte[0]);
+    le64_to_cpus(&pte[1]);
+
+    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
+        /*
+         * The spec mentions that: "If msipte.C == 1, then further
+         * processing to interpret the PTE is implementation
+         * defined.". We'll abort with cause = 262 for this
+         * case too.
+         */
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
+        goto err;
+    }
+
+    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
+    case RISCV_IOMMU_MSI_PTE_M_BASIC:
+        /* MSI Pass-through mode */
+        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
+
+        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                              gpa, addr);
+
+        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
+        if (res != MEMTX_OK) {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+            goto err;
+        }
+
+        return MEMTX_OK;
+    case RISCV_IOMMU_MSI_PTE_M_MRIF:
+        /* MRIF mode, continue. */
+        break;
+    default:
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
+        goto err;
+    }
+
+    /*
+     * Report an error for interrupt identities exceeding the maximum allowed
+     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
+     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
+     */
+    if ((data > 2047) || (gpa & 3)) {
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
+        goto err;
+    }
+
+    /* MSI MRIF mode, non atomic pending bit update */
+
+    /* MRIF pending bit address */
+    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
+    addr = addr | ((data & 0x7c0) >> 3);
+
+    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                          gpa, addr);
+
+    /* MRIF pending bit mask */
+    data = 1ULL << (data & 0x03f);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    intn = intn | data;
+    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+        goto err;
+    }
+
+    /* Get MRIF enable bits */
+    addr = addr + sizeof(intn);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    if (!(intn & data)) {
+        /* notification disabled, MRIF update completed. */
+        return MEMTX_OK;
+    }
+
+    /* Send notification message */
+    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
+    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
+          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
+
+    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+        goto err;
+    }
+
+    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
+
+    return MEMTX_OK;
+
+err:
+    riscv_iommu_report_fault(s, ctx, fault_type, cause,
+                             !!ctx->process_id, 0, 0);
+    return res;
+}
+
+/*
+ * Check device context configuration as described by the
+ * riscv-iommu spec section "Device-context configuration
+ * checks".
+ */
+static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
+                                            RISCVIOMMUContext *ctx)
+{
+    uint32_t fsc_mode, msi_mode;
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
+        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
+        return false;
+    }
+
+    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
+        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
+        return false;
+    }
+
+    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
+        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
+
+        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
+            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+            return false;
+        }
+    }
+
+    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
+        switch (fsc_mode) {
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
+                return false;
+            }
+            break;
+        }
+    } else {
+        /* DC.tc.PDTV is 0 */
+        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
+            return false;
+        }
+
+        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
+            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
+                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
+                return false;
+            }
+        } else {
+            switch (fsc_mode) {
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
+                    return false;
+                }
+                break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
+                    return false;
+                }
+            break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
+                    return false;
+                }
+                break;
+            }
+        }
+    }
+
+    /*
+     * CAP_END is always zero (only one endianess). FCTL_BE is
+     * always zero (little-endian accesses). Thus TC_SBE must
+     * always be LE, i.e. zero.
+     */
+    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Validate process context (PC) according to section
+ * "Process-context configuration checks".
+ */
+static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
+                                             RISCVIOMMUContext *ctx)
+{
+    uint32_t mode;
+
+    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
+        return false;
+    }
+
+    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
+        return false;
+    }
+
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    switch (mode) {
+    case RISCV_IOMMU_DC_FSC_MODE_BARE:
+    /* sv39 and sv32 modes have the same value (8) */
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+        break;
+    default:
+        return false;
+    }
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
+        if (mode == RISCV_IOMMU_CAP_SV32 &&
+            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
+                return false;
+        }
+    } else {
+        switch (mode) {
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
+                return false;
+            }
+            break;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
+ *
+ * @s         : IOMMU Device State
+ * @ctx       : Device Translation Context with devid and process_id set.
+ * @return    : success or fault code.
+ */
+static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
+{
+    const uint64_t ddtp = s->ddtp;
+    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
+    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
+    struct riscv_iommu_dc dc;
+    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
+    const int dc_fmt = !s->enable_msi;
+    const size_t dc_len = sizeof(dc) >> dc_fmt;
+    unsigned depth;
+    uint64_t de;
+
+    switch (mode) {
+    case RISCV_IOMMU_DDTP_MODE_OFF:
+        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+
+    case RISCV_IOMMU_DDTP_MODE_BARE:
+        /* mock up pass-through translation context */
+        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
+        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_FSC_MODE_BARE);
+        ctx->tc = RISCV_IOMMU_DC_TC_V;
+        ctx->ta = 0;
+        ctx->msiptp = 0;
+        return 0;
+
+    case RISCV_IOMMU_DDTP_MODE_1LVL:
+        depth = 0;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_2LVL:
+        depth = 1;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_3LVL:
+        depth = 2;
+        break;
+
+    default:
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /*
+     * Check supported device id width (in bits).
+     * See IOMMU Specification, Chapter 6. Software guidelines.
+     * - if extended device-context format is used:
+     *   1LVL: 6, 2LVL: 15, 3LVL: 24
+     * - if base device-context format is used:
+     *   1LVL: 7, 2LVL: 16, 3LVL: 24
+     */
+    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+    }
+
+    /* Device directory tree walk */
+    for (; depth-- > 0; ) {
+        /*
+         * Select device id index bits based on device directory tree level
+         * and device context format.
+         * See IOMMU Specification, Chapter 2. Data Structures.
+         * - if extended device-context format is used:
+         *   device index: [23:15][14:6][5:0]
+         * - if base device-context format is used:
+         *   device index: [23:16][15:7][6:0]
+         */
+        const int split = depth * 9 + 6 + dc_fmt;
+        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+            /* invalid directory entry */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+            /* reserved bits set */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+    }
+
+    /* index into device context entry page */
+    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+    memset(&dc, 0, sizeof(dc));
+    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+    }
+
+    /* Set translation context. */
+    ctx->tc = le64_to_cpu(dc.tc);
+    ctx->gatp = le64_to_cpu(dc.iohgatp);
+    ctx->satp = le64_to_cpu(dc.fsc);
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->msiptp = le64_to_cpu(dc.msiptp);
+    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /* FSC field checks */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
+            /* PID is disabled */
+            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+        }
+        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
+            /* Invalid translation mode */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        return 0;
+    }
+
+    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
+            /* No default process_id enabled, set BARE mode */
+            ctx->satp = 0ULL;
+            return 0;
+        } else {
+            /* Use default process_id #0 */
+            ctx->process_id = 0;
+        }
+    }
+
+    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
+        /* No S-Stage translation, done. */
+        return 0;
+    }
+
+    /* FSC.TC.PDTV enabled */
+    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
+        /* Invalid PDTP.MODE */
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
+        /*
+         * Select process id index bits based on process directory tree
+         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
+         */
+        const int split = depth * 9 + 8;
+        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_PC_TA_V)) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
+    }
+
+    /* Leaf entry in PDT */
+    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
+    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+    }
+
+    /* Use FSC and TA from process directory entry. */
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->satp = le64_to_cpu(dc.fsc);
+
+    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+    }
+
+    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    return 0;
+}
+
+/* Translation Context cache support */
+static gboolean riscv_iommu_ctx_equal(gconstpointer v1, gconstpointer v2)
+{
+    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
+    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
+    return c1->devid == c2->devid &&
+           c1->process_id == c2->process_id;
+}
+
+static guint riscv_iommu_ctx_hash(gconstpointer v)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
+    /*
+     * Generate simple hash of (process_id, devid)
+     * assuming 24-bit wide devid.
+     */
+    return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
+}
+
+static void riscv_iommu_ctx_inval_devid_procid(gpointer key, gpointer value,
+                                               gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid &&
+        ctx->process_id == arg->process_id) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void riscv_iommu_ctx_inval_devid(gpointer key, gpointer value,
+                                        gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void riscv_iommu_ctx_inval_all(gpointer key, gpointer value,
+                                      gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
+                                  uint32_t devid, uint32_t process_id)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .process_id = process_id,
+    };
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    g_hash_table_foreach(ctx_cache, func, &key);
+    g_hash_table_unref(ctx_cache);
+}
+
+/* Find or allocate translation context for a given {device_id, process_id} */
+static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
+                                          unsigned devid, unsigned process_id,
+                                          void **ref)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext *ctx;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .process_id = process_id,
+    };
+
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    ctx = g_hash_table_lookup(ctx_cache, &key);
+
+    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    ctx = g_new0(RISCVIOMMUContext, 1);
+    ctx->devid = devid;
+    ctx->process_id = process_id;
+
+    int fault = riscv_iommu_ctx_fetch(s, ctx);
+    if (!fault) {
+        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
+            g_hash_table_unref(ctx_cache);
+            ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
+                                              riscv_iommu_ctx_equal,
+                                              g_free, NULL);
+            g_hash_table_ref(ctx_cache);
+            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
+        }
+        g_hash_table_add(ctx_cache, ctx);
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    g_hash_table_unref(ctx_cache);
+    *ref = NULL;
+
+    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
+                             fault, !!process_id, 0, 0);
+
+    g_free(ctx);
+    return NULL;
+}
+
+static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
+{
+    if (ref) {
+        g_hash_table_unref((GHashTable *)ref);
+    }
+}
+
+/* Find or allocate address space for a given device */
+static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
+{
+    RISCVIOMMUSpace *as;
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+
+    if (as == NULL) {
+        char name[64];
+        as = g_new0(RISCVIOMMUSpace, 1);
+
+        as->iommu = s;
+        as->devid = devid;
+
+        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
+            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+        /* IOVA address space, untranslated addresses */
+        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
+            TYPE_RISCV_IOMMU_MEMORY_REGION,
+            OBJECT(as), "riscv_iommu", UINT64_MAX);
+        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
+
+        QLIST_INSERT_HEAD(&s->spaces, as, list);
+
+        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
+                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+    }
+    return &as->iova_as;
+}
+
+static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    bool enable_pid;
+    bool enable_pri;
+    int fault;
+
+    /*
+     * TC[32] is reserved for custom extensions, used here to temporarily
+     * enable automatic page-request generation for ATS queries.
+     */
+    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
+    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
+
+    /* Translate using device directory / page table information. */
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
+
+    if (enable_pri && fault) {
+        struct riscv_iommu_pq_record pr = {0};
+        if (enable_pid) {
+            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
+                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
+        }
+        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
+        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
+                     RISCV_IOMMU_PREQ_PAYLOAD_M;
+        riscv_iommu_pri(s, &pr);
+        return fault;
+    }
+
+    if (fault) {
+        unsigned ttype;
+
+        if (iotlb->perm & IOMMU_RW) {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
+        } else {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
+        }
+
+        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
+                                 iotlb->iova, iotlb->translated_addr);
+        return fault;
+    }
+
+    return 0;
+}
+
+/* IOMMU Command Interface */
+static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
+    uint64_t addr, uint32_t data)
+{
+    /*
+     * ATS processing in this implementation of the IOMMU is synchronous,
+     * no need to wait for completions here.
+     */
+    if (!notify) {
+        return MEMTX_OK;
+    }
+
+    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
+        MEMTXATTRS_UNSPECIFIED);
+}
+
+static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
+{
+    uint64_t old_ddtp = s->ddtp;
+    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
+    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
+    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
+    bool ok = false;
+
+    /*
+     * Check for allowed DDTP.MODE transitions:
+     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
+     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
+     */
+    if (new_mode == old_mode ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
+        ok = true;
+    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
+        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
+    }
+
+    if (ok) {
+        /* clear reserved and busy bits, report back sanitized version */
+        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
+                             RISCV_IOMMU_DDTP_MODE, new_mode);
+    } else {
+        new_ddtp = old_ddtp;
+    }
+    s->ddtp = new_ddtp;
+
+    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
+}
+
+/* Command function and opcode field. */
+#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
+
+static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
+{
+    struct riscv_iommu_command cmd;
+    MemTxResult res;
+    dma_addr_t addr;
+    uint32_t tail, head, ctrl;
+    uint64_t cmd_opcode;
+    GHFunc func;
+
+    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
+    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
+
+    /* Check for pending error or queue processing disabled */
+    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
+        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
+        return;
+    }
+
+    while (tail != head) {
+        addr = s->cq_addr  + head * sizeof(cmd);
+        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
+                              MEMTXATTRS_UNSPECIFIED);
+
+        if (res != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                  RISCV_IOMMU_CQCSR_CQMF, 0);
+            goto fault;
+        }
+
+        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
+
+        cmd_opcode = get_field(cmd.dword0,
+                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
+
+        switch (cmd_opcode) {
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
+                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
+            res = riscv_iommu_iofence(s,
+                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
+
+            if (res != MEMTX_OK) {
+                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                      RISCV_IOMMU_CQCSR_CQMF, 0);
+                goto fault;
+            }
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
+                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
+                goto cmd_ill;
+            }
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* invalidate all device context cache mappings */
+                func = riscv_iommu_ctx_inval_all;
+            } else {
+                /* invalidate all device context matching DID */
+                func = riscv_iommu_ctx_inval_devid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* illegal command arguments IODIR_PDT & DV == 0 */
+                goto cmd_ill;
+            } else {
+                func = riscv_iommu_ctx_inval_devid_procid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
+            break;
+
+        default:
+        cmd_ill:
+            /* Invalid instruction, do not advance instruction index. */
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
+            goto fault;
+        }
+
+        /* Advance and update head pointer after command completes. */
+        head = (head + 1) & s->cq_mask;
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
+    }
+    return;
+
+fault:
+    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
+    }
+}
+
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
+                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
+        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
+        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
+        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
+            RISCV_IOMMU_FQCSR_FQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
+        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
+        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
+        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
+            RISCV_IOMMU_PQCSR_PQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
+}
+
+typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
+
+static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
+{
+    uint64_t icvec = 0;
+
+    icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
+                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
+
+    icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
+                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
+
+    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
+                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
+
+    icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
+                 s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
+
+    trace_riscv_iommu_icvec_write(data, icvec);
+
+    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
+}
+
+static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
+{
+    uint32_t cqcsr, fqcsr, pqcsr;
+    uint32_t ipsr_set = 0;
+    uint32_t ipsr_clr = 0;
+
+    if (data & RISCV_IOMMU_IPSR_CIP) {
+        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+
+        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
+            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
+             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
+             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
+             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
+            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
+        } else {
+            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
+        }
+    } else {
+        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
+    }
+
+    if (data & RISCV_IOMMU_IPSR_FIP) {
+        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+
+        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
+            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
+             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
+            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
+        } else {
+            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
+        }
+    } else {
+        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
+    }
+
+    if (data & RISCV_IOMMU_IPSR_PIP) {
+        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+
+        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
+            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
+             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
+            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
+        } else {
+            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
+        }
+    } else {
+        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
+}
+
+/*
+ * Write the resulting value of 'data' for the reg specified
+ * by 'reg_addr', after considering read-only/read-write/write-clear
+ * bits, in the pointer 'dest'.
+ *
+ * The result is written in little-endian.
+ */
+static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
+                                      void *dest, hwaddr reg_addr,
+                                      int size, uint64_t data)
+{
+    uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
+    uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
+    uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
+
+    stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
+}
+
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+                                          uint64_t data, unsigned size,
+                                          MemTxAttrs attrs)
+{
+    riscv_iommu_process_fn *process_fn = NULL;
+    RISCVIOMMUState *s = opaque;
+    uint32_t regb = addr & ~3;
+    uint32_t busy = 0;
+    uint64_t val = 0;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment or access size */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        /* Unsupported MMIO access location. */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Track actionable MMIO write. */
+    switch (regb) {
+    case RISCV_IOMMU_REG_DDTP:
+    case RISCV_IOMMU_REG_DDTP + 4:
+        process_fn = riscv_iommu_process_ddtp;
+        regb = RISCV_IOMMU_REG_DDTP;
+        busy = RISCV_IOMMU_DDTP_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_CQT:
+        process_fn = riscv_iommu_process_cq_tail;
+        break;
+
+    case RISCV_IOMMU_REG_CQCSR:
+        process_fn = riscv_iommu_process_cq_control;
+        busy = RISCV_IOMMU_CQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_FQCSR:
+        process_fn = riscv_iommu_process_fq_control;
+        busy = RISCV_IOMMU_FQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_PQCSR:
+        process_fn = riscv_iommu_process_pq_control;
+        busy = RISCV_IOMMU_PQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_ICVEC:
+    case RISCV_IOMMU_REG_IPSR:
+        /*
+         * ICVEC and IPSR have special read/write procedures. We'll
+         * call their respective helpers and exit.
+         */
+        riscv_iommu_write_reg_val(s, &val, addr, size, data);
+
+        /*
+         * 'val' is stored as LE. Switch to host endianess
+         * before using it.
+         */
+        val = le64_to_cpu(val);
+
+        if (regb == RISCV_IOMMU_REG_ICVEC) {
+            riscv_iommu_update_icvec(s, val);
+        } else {
+            riscv_iommu_update_ipsr(s, val);
+        }
+
+        return MEMTX_OK;
+
+    default:
+        break;
+    }
+
+    /*
+     * Registers update might be not synchronized with core logic.
+     * If system software updates register when relevant BUSY bit
+     * is set IOMMU behavior of additional writes to the register
+     * is UNSPECIFIED.
+     */
+    riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
+
+    /* Busy flag update, MSB 4-byte register. */
+    if (busy) {
+        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
+        stl_le_p(&s->regs_rw[regb], rw | busy);
+    }
+
+    if (process_fn) {
+        process_fn(s);
+    }
+
+    return MEMTX_OK;
+}
+
+static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint64_t val = -1;
+    uint8_t *ptr;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment. */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    ptr = &s->regs_rw[addr];
+    val = ldn_le_p(ptr, size);
+
+    *data = val;
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps riscv_iommu_mmio_ops = {
+    .read_with_attrs = riscv_iommu_mmio_read,
+    .write_with_attrs = riscv_iommu_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    }
+};
+
+/*
+ * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
+ * memory region as untranslated address, for additional MSI/MRIF interception
+ * by IOMMU interrupt remapping implementation.
+ * Note: Device emulation code generating an MSI is expected to provide a valid
+ * memory transaction attributes with requested_id set.
+ */
+static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
+    RISCVIOMMUContext *ctx;
+    MemTxResult res;
+    void *ref;
+    uint32_t devid = attrs.requester_id;
+
+    if (attrs.unspecified) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
+    if (ctx == NULL) {
+        res = MEMTX_ACCESS_ERROR;
+    } else {
+        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
+    }
+    riscv_iommu_ctx_put(s, ref);
+    return res;
+}
+
+static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    return MEMTX_ACCESS_ERROR;
+}
+
+static const MemoryRegionOps riscv_iommu_trap_ops = {
+    .read_with_attrs = riscv_iommu_trap_read,
+    .write_with_attrs = riscv_iommu_trap_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    }
+};
+
+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
+    if (s->enable_msi) {
+        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
+    }
+    if (s->enable_s_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
+                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
+    }
+    if (s->enable_g_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
+                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
+    }
+    /* Report QEMU target physical address space limits */
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
+                       TARGET_PHYS_ADDR_SPACE_BITS);
+
+    /* TODO: method to report supported PID bits */
+    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
+    s->cap |= RISCV_IOMMU_CAP_PD8;
+
+    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
+    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
+
+    /* register storage */
+    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+     /* Mark all registers read-only */
+    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+    /*
+     * Register complete MMIO space, including MSI/PBA registers.
+     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
+     * managed directly by the PCIDevice implementation.
+     */
+    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+    /* Set power-on register state */
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
+             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
+        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
+        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
+        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
+        RISCV_IOMMU_CQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
+        RISCV_IOMMU_FQCSR_FQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
+        RISCV_IOMMU_FQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
+        RISCV_IOMMU_PQCSR_PQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
+        RISCV_IOMMU_PQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
+
+    /* Memory region for downstream access, if specified. */
+    if (s->target_mr) {
+        s->target_as = g_new0(AddressSpace, 1);
+        address_space_init(s->target_as, s->target_mr,
+            "riscv-iommu-downstream");
+    } else {
+        /* Fallback to global system memory. */
+        s->target_as = &address_space_memory;
+    }
+
+    /* Memory region for untranslated MRIF/MSI writes */
+    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
+            "riscv-iommu-trap", ~0ULL);
+    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
+
+    /* Device translation context cache */
+    s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
+                                         riscv_iommu_ctx_equal,
+                                         g_free, NULL);
+
+    s->iommus.le_next = NULL;
+    s->iommus.le_prev = NULL;
+    QLIST_INIT(&s->spaces);
+}
+
+static void riscv_iommu_unrealize(DeviceState *dev)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    g_hash_table_unref(s->ctx_cache);
+}
+
+static Property riscv_iommu_properties[] = {
+    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
+        RISCV_IOMMU_SPEC_DOT_VER),
+    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
+    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
+    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
+    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
+    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
+    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
+        TYPE_MEMORY_REGION, MemoryRegion *),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void riscv_iommu_class_init(ObjectClass *klass, void* data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
+    dc->user_creatable = false;
+    dc->realize = riscv_iommu_realize;
+    dc->unrealize = riscv_iommu_unrealize;
+    device_class_set_props(dc, riscv_iommu_properties);
+}
+
+static const TypeInfo riscv_iommu_info = {
+    .name = TYPE_RISCV_IOMMU,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(RISCVIOMMUState),
+    .class_init = riscv_iommu_class_init,
+};
+
+static const char *IOMMU_FLAG_STR[] = {
+    "NA",
+    "RO",
+    "WR",
+    "RW",
+};
+
+/* RISC-V IOMMU Memory Region - Address Translation Space */
+static IOMMUTLBEntry riscv_iommu_memory_region_translate(
+    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
+    IOMMUAccessFlags flag, int iommu_idx)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    RISCVIOMMUContext *ctx;
+    void *ref;
+    IOMMUTLBEntry iotlb = {
+        .iova = addr,
+        .target_as = as->iommu->target_as,
+        .addr_mask = ~0ULL,
+        .perm = flag,
+    };
+
+    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
+    if (ctx == NULL) {
+        /* Translation disabled or invalid. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
+        /* Translation disabled or fault reported. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    }
+
+    /* Trace all dma translations with original access flags. */
+    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
+                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
+                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
+                          iotlb.translated_addr);
+
+    riscv_iommu_ctx_put(as->iommu, ref);
+
+    return iotlb;
+}
+
+static int riscv_iommu_memory_region_notify(
+    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
+    IOMMUNotifierFlag new, Error **errp)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+
+    if (old == IOMMU_NOTIFIER_NONE) {
+        as->notifier = true;
+        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
+    } else if (new == IOMMU_NOTIFIER_NONE) {
+        as->notifier = false;
+        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
+    }
+
+    return 0;
+}
+
+static inline bool pci_is_iommu(PCIDevice *pdev)
+{
+    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
+}
+
+static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
+{
+    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    AddressSpace *as = NULL;
+
+    if (pdev && pci_is_iommu(pdev)) {
+        return s->target_as;
+    }
+
+    /* Find first registered IOMMU device */
+    while (s->iommus.le_prev) {
+        s = *(s->iommus.le_prev);
+    }
+
+    /* Find first matching IOMMU */
+    while (s != NULL && as == NULL) {
+        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
+        s = s->iommus.le_next;
+    }
+
+    return as ? as : &address_space_memory;
+}
+
+static const PCIIOMMUOps riscv_iommu_ops = {
+    .get_address_space = riscv_iommu_find_as,
+};
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+        Error **errp)
+{
+    if (bus->iommu_ops &&
+        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
+        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
+        QLIST_INSERT_AFTER(last, iommu, iommus);
+    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
+        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
+    } else {
+        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
+            pci_bus_num(bus));
+    }
+}
+
+static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
+    MemTxAttrs attrs)
+{
+    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
+}
+
+static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    return 1 << as->iommu->pid_bits;
+}
+
+static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    imrc->translate = riscv_iommu_memory_region_translate;
+    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
+    imrc->attrs_to_index = riscv_iommu_memory_region_index;
+    imrc->num_indexes = riscv_iommu_memory_region_index_len;
+}
+
+static const TypeInfo riscv_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
+    .class_init = riscv_iommu_memory_region_init,
+};
+
+static void riscv_iommu_register_mr_types(void)
+{
+    type_register_static(&riscv_iommu_memory_region_info);
+    type_register_static(&riscv_iommu_info);
+}
+
+type_init(riscv_iommu_register_mr_types);
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
new file mode 100644
index 0000000000..af3fcafc19
--- /dev/null
+++ b/hw/riscv/riscv-iommu.h
@@ -0,0 +1,126 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_STATE_H
+#define HW_RISCV_IOMMU_STATE_H
+
+#include "qom/object.h"
+#include "hw/riscv/iommu.h"
+
+struct RISCVIOMMUState {
+    /*< private >*/
+    DeviceState parent_obj;
+
+    /*< public >*/
+    uint32_t version;     /* Reported interface version number */
+    uint32_t pid_bits;    /* process identifier width */
+    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
+
+    uint64_t cap;         /* IOMMU supported capabilities */
+    uint64_t fctl;        /* IOMMU enabled features */
+    uint64_t icvec_avail_vectors;  /* Available interrupt vectors in ICVEC */
+
+    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
+    bool enable_msi;      /* Enable MSI remapping */
+    bool enable_s_stage;  /* Enable S/VS-Stage translation */
+    bool enable_g_stage;  /* Enable G-Stage translation */
+
+    /* IOMMU Internal State */
+    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
+
+    dma_addr_t cq_addr;   /* Command queue base physical address */
+    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
+    dma_addr_t pq_addr;   /* Page request queue base physical address */
+
+    uint32_t cq_mask;     /* Command queue index bit mask */
+    uint32_t fq_mask;     /* Fault/event queue index bit mask */
+    uint32_t pq_mask;     /* Page request queue index bit mask */
+
+    /* interrupt notifier */
+    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
+
+    /* IOMMU State Machine */
+    QemuThread core_proc; /* Background processing thread */
+    QemuCond core_cond;   /* Background processing wake up signal */
+    unsigned core_exec;   /* Processing thread execution actions */
+
+    /* IOMMU target address space */
+    AddressSpace *target_as;
+    MemoryRegion *target_mr;
+
+    /* MSI / MRIF access trap */
+    AddressSpace trap_as;
+    MemoryRegion trap_mr;
+
+    GHashTable *ctx_cache;          /* Device translation Context Cache */
+
+    /* MMIO Hardware Interface */
+    MemoryRegion regs_mr;
+    uint8_t *regs_rw;  /* register state (user write) */
+    uint8_t *regs_wc;  /* write-1-to-clear mask */
+    uint8_t *regs_ro;  /* read-only mask */
+
+    QLIST_ENTRY(RISCVIOMMUState) iommus;
+    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
+};
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+         Error **errp);
+
+/* private helpers */
+
+/* Register helper functions */
+static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set, uint32_t clr)
+{
+    uint32_t val = ldl_le_p(s->regs_rw + idx);
+    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s, unsigned idx,
+                                         uint32_t set)
+{
+    stl_le_p(s->regs_rw + idx, set);
+}
+
+static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s, unsigned idx)
+{
+    return ldl_le_p(s->regs_rw + idx);
+}
+
+static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s, unsigned idx,
+                                             uint64_t set, uint64_t clr)
+{
+    uint64_t val = ldq_le_p(s->regs_rw + idx);
+    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s, unsigned idx,
+                                         uint64_t set)
+{
+    stq_le_p(s->regs_rw + idx, set);
+}
+
+static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldq_le_p(s->regs_rw + idx);
+}
+#endif
diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
new file mode 100644
index 0000000000..3d5c33102d
--- /dev/null
+++ b/hw/riscv/trace-events
@@ -0,0 +1,14 @@ 
+# See documentation at docs/devel/tracing.rst
+
+# riscv-iommu.c
+riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
+riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
+riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
+riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
+riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
+riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
+riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
+riscv_iommu_notify_int_vector(uint32_t cause, uint32_t vector) "Interrupt cause 0x%x sent via vector 0x%x"
+riscv_iommu_icvec_write(uint32_t orig, uint32_t actual) "ICVEC write: incoming 0x%x actual 0x%x"
diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
new file mode 100644
index 0000000000..8c0e3ca1f3
--- /dev/null
+++ b/hw/riscv/trace.h
@@ -0,0 +1 @@ 
+#include "trace/trace-hw_riscv.h"
diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
new file mode 100644
index 0000000000..80769a1400
--- /dev/null
+++ b/include/hw/riscv/iommu.h
@@ -0,0 +1,36 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_H
+#define HW_RISCV_IOMMU_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#define TYPE_RISCV_IOMMU "riscv-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
+typedef struct RISCVIOMMUState RISCVIOMMUState;
+
+#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
+typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
+
+#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
+typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
+
+#endif
diff --git a/meson.build b/meson.build
index 10464466ff..71de8a5cd1 100644
--- a/meson.build
+++ b/meson.build
@@ -3439,6 +3439,7 @@  if have_system
     'hw/pci-host',
     'hw/ppc',
     'hw/rtc',
+    'hw/riscv',
     'hw/s390x',
     'hw/scsi',
     'hw/sd',