[v2,03/15] hw/riscv: add RISC-V IOMMU base emulation

Message ID 20240307160319.675044-4-dbarboza@ventanamicro.com
State New, archived
Series riscv: QEMU RISC-V IOMMU Support

Commit Message

Daniel Henrique Barboza March 7, 2024, 4:03 p.m. UTC
From: Tomasz Jeznach <tjeznach@rivosinc.com>

The RISC-V IOMMU specification is now ratified as per the RISC-V
international process. The latest frozen specification can be found
at:

https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

Add the foundation of the device emulation for RISC-V IOMMU, which
includes an IOMMU that has no capabilities beyond MSI interrupt support
and fault queue interfaces. We'll add more features incrementally in the
next patches.

Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 hw/riscv/Kconfig         |    4 +
 hw/riscv/meson.build     |    1 +
 hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
 hw/riscv/riscv-iommu.h   |  141 ++++
 hw/riscv/trace-events    |   11 +
 hw/riscv/trace.h         |    2 +
 include/hw/riscv/iommu.h |   36 +
 meson.build              |    1 +
 8 files changed, 1688 insertions(+)
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

Comments

Jason Chien May 1, 2024, 11:57 a.m. UTC | #1
On 2024/3/8 12:03 AM, Daniel Henrique Barboza wrote:
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> The RISC-V IOMMU specification is now ratified as per the RISC-V
> international process. The latest frozen specification can be found
> at:
>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Add the foundation of the device emulation for RISC-V IOMMU, which
> includes an IOMMU that has no capabilities beyond MSI interrupt support
> and fault queue interfaces. We'll add more features incrementally in the
> next patches.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>   hw/riscv/Kconfig         |    4 +
>   hw/riscv/meson.build     |    1 +
>   hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
>   hw/riscv/riscv-iommu.h   |  141 ++++
>   hw/riscv/trace-events    |   11 +
>   hw/riscv/trace.h         |    2 +
>   include/hw/riscv/iommu.h |   36 +
>   meson.build              |    1 +
>   8 files changed, 1688 insertions(+)
>   create mode 100644 hw/riscv/riscv-iommu.c
>   create mode 100644 hw/riscv/riscv-iommu.h
>   create mode 100644 hw/riscv/trace-events
>   create mode 100644 hw/riscv/trace.h
>   create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index 5d644eb7b1..faf6a10029 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>   config RISCV_NUMA
>       bool
>   
> @@ -38,6 +41,7 @@ config RISCV_VIRT
>       select SERIAL
>       select RISCV_ACLINT
>       select RISCV_APLIC
> +    select RISCV_IOMMU
>       select RISCV_IMSIC
>       select SIFIVE_PLIC
>       select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index 2f7ee81be3..ba9eebd605 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>   riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>   riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>   riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>   
>   hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..df534b99b0
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,1492 @@
> +/*
> + * QEMU emulation of a RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number conversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t pasid:20;          /* Process Address Space ID */
> +    uint64_t __rfu:20;          /* reserved */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without PASID specified. */
> +#define RISCV_IOMMU_NOPASID 0
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t ipsr =
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
> +    if (s->notify && !(ipsr & (1 << vec))) {
> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
> +    }
> +}
The RISC-V IOMMU also supports WSI.
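
For reference, a minimal sketch of how the notify path could honor fctl.WSI,
assuming riscv-iommu-bits.h defines RISCV_IOMMU_REG_FCTL/RISCV_IOMMU_FCTL_WSI
and that a wired-interrupt callback is wired up by the sysbus frontend
(wsi_notify below is hypothetical):

static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
{
    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
    const uint32_t ipsr =
        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);

    if (ipsr & (1 << vec)) {
        return; /* this interrupt source is already pending */
    }

    if (fctl & RISCV_IOMMU_FCTL_WSI) {
        /* hypothetical hook for wired-signaled interrupts (WSI) */
        if (s->wsi_notify) {
            s->wsi_notify(s, (ivec >> (vec * 4)) & 0x0F);
        }
    } else if (s->notify) {
        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
    }
}
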
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if (iotlb->perm & IOMMU_WO &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    iotlb->translated_addr = iotlb->iova;
> +    iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +    /* Allow R/W in pass-through mode */
> +    iotlb->perm = IOMMU_RW;
> +    return 0;
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +
> +    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        return dma_memory_write(s->target_as, addr, &data, size, attrs);
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    /* Get MRIF enable bits */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and pasid set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> +    const int dc_fmt = !s->enable_msi;
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;

The cause should be 260 not 258.

 From the RISC-V IOMMU Architecture Spec v1.0.0 section 2.3:
If the device_id is wider than that supported by the IOMMU mode, as 
determined by the following checks then stop and report "Transaction 
type disallowed" (cause = 260).
a. ddtp.iommu_mode is 2LVL and DDI[2] is not 0
b. ddtp.iommu_mode is 1LVL and either DDI[2] is not 0 or DDI[1] is not 0
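
In code terms that would be a one-line change, assuming
RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED (already used below for the PDTV check)
is the cause-260 "Transaction type disallowed" encoding in
riscv-iommu-bits.h:

    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
        /* device_id wider than the DDT mode supports: cause 260 */
        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
    }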

> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            /* invalid directory entry */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            /* reserved bits set */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;

The cause should be 259 not 258.

 From RISC-V IOMMU Architecture Spec v1.0.0 section 2.3.1:
If any bits or encoding that are reserved for future standard use are 
set within ddte, stop and report "DDT entry misconfigured" (cause = 259).
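
A sketch of the corresponding fix, assuming
RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED (already used above for an
unsupported ddtp mode) is the cause-259 encoding:

        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
            /* reserved bits set in ddte: "DDT entry misconfigured" */
            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
        }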

> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
According to RISC-V IOMMU Architecture spec v1.0.0 section 2.1.4, we 
should do some checks for the found device context.
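
For illustration only, such a check could take roughly the following shape
once the context fields are loaded, where RISCV_IOMMU_DC_TC_RESERVED is a
hypothetical mask for the tc bits this emulation does not implement:

    /* Sketch: validate the loaded device context (spec section 2.1.4) */
    if (ctx->tc & RISCV_IOMMU_DC_TC_RESERVED) {
        /* reserved or unsupported tc bits set */
        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
    }
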
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
> +            /* PASID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        return 0;
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
According to RISC-V IOMMU Architecture spec v1.0.0 section 2.2.4, we 
should do some checks for the found process context.
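
A rough sketch of what that could look like after the leaf entry is read,
with RISCV_IOMMU_PC_TA_RESERVED as a hypothetical mask for reserved ta bits:

    /* Sketch: validate the loaded process context (spec section 2.2.4) */
    if (!(le64_to_cpu(dc.ta) & RISCV_IOMMU_PC_TA_V)) {
        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
    }
    if (le64_to_cpu(dc.ta) & RISCV_IOMMU_PC_TA_RESERVED) {
        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
    }
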
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> +    return c1->devid == c2->devid && c1->pasid == c2->pasid;
> +}
> +
> +static guint __ctx_hash(gconstpointer v)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> +    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
> +    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
> +}
> +
> +static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid &&
> +        ctx->pasid == arg->pasid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +    uint32_t devid, uint32_t pasid)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    g_hash_table_foreach(ctx_cache, func, &key);
> +    g_hash_table_unref(ctx_cache);
> +}
> +
> +/* Find or allocate translation context for a given {device_id, process_id} */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +    unsigned devid, unsigned pasid, void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
> +        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                          g_free, NULL);
> +        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +    }
> +
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->pasid = pasid;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        g_hash_table_add(ctx_cache, ctx);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
> +        struct riscv_iommu_fq_record ev = { 0 };
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
> +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
> +        riscv_iommu_fault(s, &ev);
> +    }
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref) {
> +        g_hash_table_unref((GHashTable *)ref);
> +    }
> +}
> +
> +/* Find or allocate address space for a given device */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as == NULL) {
> +        char name[64];
> +        as = g_new0(RISCVIOMMUSpace, 1);
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        /* IOVA address space, untranslated addresses */
> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> +            OBJECT(as), name, UINT64_MAX);
> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
> +            TYPE_RISCV_IOMMU_PCI);
> +
> +        qemu_mutex_lock(&s->core_lock);
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +        qemu_mutex_unlock(&s->core_lock);
> +
> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +    }
> +    return &as->iova_as;
> +}
> +
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    bool enable_faults;
> +    bool enable_pasid;
> +    bool enable_pri;
> +    int fault;
> +
> +    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +
> +    if (enable_pri && fault) {
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pasid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +        return fault;
> +    }
> +
> +    if (enable_faults && fault) {
> +        struct riscv_iommu_fq_record ev;
> +        unsigned ttype;
> +
> +        if (iotlb->perm & IOMMU_RW) {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +        } else {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> +        }
> +        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +        ev.iotval    = iotlb->iova;
> +        ev.iotval2   = iotlb->translated_addr;
> +        ev._reserved = 0;
> +        riscv_iommu_fault(s, &ev);
> +        return fault;
> +    }
> +
> +    return 0;
> +}
> +
> +/* IOMMU Command Interface */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (!notify) {
> +        return MEMTX_OK;
> +    }
> +
> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +        MEMTXATTRS_UNSPECIFIED);
> +}
> +
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    uint64_t old_ddtp = s->ddtp;
> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    bool ok = false;
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    if (new_mode == old_mode ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> +        ok = true;
> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> +    } else {
> +        new_ddtp = old_ddtp;
> +    }
> +    s->ddtp = new_ddtp;
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> +}
> +
> +/* Command function and opcode field. */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    uint64_t cmd_opcode;
> +    GHFunc func;
> +
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        cmd_opcode = get_field(cmd.dword0,
> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> +
> +        switch (cmd_opcode) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            res = riscv_iommu_iofence(s,
> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            }
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = __ctx_inval_all;
> +            } else {
> +                /* invalidate all device context matching DID */
> +                func = __ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = __ctx_inval_devid_pasid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> +            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
cqcsr.fence_w_ip should be set to 0 as well.
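
That is, assuming riscv-iommu-bits.h defines RISCV_IOMMU_CQCSR_FENCE_W_IP,
the clear mask would become:

        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
            RISCV_IOMMU_CQCSR_FENCE_W_IP;
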
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> +            RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> +            RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +/* Core IOMMU execution activation */
> +enum {
> +    RISCV_IOMMU_EXEC_DDTP,
> +    RISCV_IOMMU_EXEC_CQCSR,
> +    RISCV_IOMMU_EXEC_CQT,
> +    RISCV_IOMMU_EXEC_FQCSR,
> +    RISCV_IOMMU_EXEC_FQH,
> +    RISCV_IOMMU_EXEC_PQCSR,
> +    RISCV_IOMMU_EXEC_PQH,
> +    RISCV_IOMMU_EXEC_TR_REQUEST,
> +    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
> +    RISCV_IOMMU_EXEC_EXIT,
> +};
> +
> +static void *riscv_iommu_core_proc(void* arg)
> +{
> +    RISCVIOMMUState *s = arg;
> +    unsigned exec = 0;
> +    unsigned mask = 0;
> +
> +    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
> +        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
> +        switch (exec & mask) {
> +        case BIT(RISCV_IOMMU_EXEC_DDTP):
> +            riscv_iommu_process_ddtp(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQCSR):
> +            riscv_iommu_process_cq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQT):
> +            riscv_iommu_process_cq_tail(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQCSR):
> +            riscv_iommu_process_fq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQCSR):
> +            riscv_iommu_process_pq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
> +            /* DBG support not implemented yet */
> +            break;
> +        }
> +        exec &= ~mask;
> +        if (!exec) {
> +            qemu_mutex_lock(&s->core_lock);
> +            exec = s->core_exec;
> +            while (!exec) {
> +                qemu_cond_wait(&s->core_cond, &s->core_lock);
> +                exec = s->core_exec;
> +            }
> +            s->core_exec = 0;
> +            qemu_mutex_unlock(&s->core_lock);
> +        }
> +    };
> +
> +    return NULL;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    uint32_t busy = 0;
> +    uint32_t exec = 0;
> +
> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {

There should be a case for the IPSR register.

 From RISC-V IOMMU Architecture Spec v1.0.0 section 5.18:
If a bit in ipsr is 1 then a write of 1 to the bit transitions the bit 
from 1→0. If the conditions to set that bit are still present (See 
[IPSR_FIELDS]) or if they occur after the bit is cleared then that bit 
transitions again from 0→1.
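
A sketch of the shape that handling could take in the switch below, where
RISCV_IOMMU_EXEC_IPSR and a riscv_iommu_process_ipsr() helper dispatched
from riscv_iommu_core_proc() are hypothetical additions that re-check the
cip/fip/pip conditions after the write-1-to-clear:

    case RISCV_IOMMU_REG_IPSR:
        /*
         * W1C register: clear the requested bits, then re-evaluate and
         * re-raise any interrupt whose cause is still present.
         */
        exec = BIT(RISCV_IOMMU_EXEC_IPSR);
        break;
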
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQT);
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +    }
> +
> +    /*
> +     * Register updates might not be synchronized with core logic.
> +     * If system software updates a register while the relevant BUSY bit
> +     * is set, the IOMMU behavior for additional writes to the register
> +     * is UNSPECIFIED.
> +     */
> +
> +    qemu_spin_lock(&s->regs_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    }
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    /* Wake up core processing thread. */
> +    if (exec) {
> +        qemu_mutex_lock(&s->core_lock);
> +        s->core_exec |= exec;
> +        qemu_cond_signal(&s->core_cond);
> +        qemu_mutex_unlock(&s->core_lock);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +    uint8_t *ptr;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    ptr = &s->regs_rw[addr];
> +
> +    if (size == 1) {
> +        val = (uint64_t)*ptr;
> +    } else if (size == 2) {
> +        val = lduw_le_p(ptr);
> +    } else if (size == 4) {
> +        val = ldl_le_p(ptr);
> +    } else if (size == 8) {
> +        val = ldq_le_p(ptr);
> +    } else {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *data = val;
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide
> + * valid memory transaction attributes with requester_id set.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res;
> +    void *ref;
> +    uint32_t devid = attrs.requester_id;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx == NULL) {
> +        res = MEMTX_ACCESS_ERROR;
> +    } else {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    riscv_iommu_ctx_put(s, ref);
> +    return res;
> +}
> +
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> +    if (s->enable_msi) {
> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> +    }
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* TODO: method to report supported PASID bits */
> +    s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> +
> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +     /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
s->fctl is not initialized.
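
A minimal fix would be to give it a defined reset value before the register
file is populated, e.g. 0 for little-endian, MSI-based delivery:

    /* Out-of-reset fctl: BE=0, WSI=0, GXL=0 */
    s->fctl = 0;
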
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache */
> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                         g_free, NULL);
> +
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +    qemu_cond_init(&s->core_cond);
> +    qemu_mutex_init(&s->core_lock);
> +    qemu_spin_init(&s->regs_lock);
> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    /* cancel pending operations and stop */
> +    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
> +    qemu_cond_signal(&s->core_cond);
> +    qemu_mutex_unlock(&s->core_lock);
> +    qemu_thread_join(&s->core_proc);
> +    qemu_cond_destroy(&s->core_cond);
> +    qemu_mutex_destroy(&s->core_lock);
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +static const PCIIOMMUOps riscv_iommu_ops = {
> +    .get_address_space = riscv_iommu_find_as,
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +        Error **errp)
> +{
> +    if (bus->iommu_ops &&
> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (bus->iommu_ops == NULL) {
> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
The original bus->iommu_ops and bus->iommu_opaque will be lost.
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    return 1 << as->iommu->pasid_bits;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..6f740de690
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,141 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pasid_bits;  /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +    bool enable_msi;      /* Enable MSI remapping */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    QemuSpin regs_lock;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stl_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set, uint64_t clr)
> +{
> +    uint64_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stq_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +
> +
> +
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..42a97caffa
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,11 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..b88504b750
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1,2 @@
> +#include "trace/trace-hw_riscv.h"
> +
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..403b365893
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index c59ca496f2..75e56f3282 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3361,6 +3361,7 @@ if have_system
>       'hw/rdma',
>       'hw/rdma/vmw',
>       'hw/rtc',
> +    'hw/riscv',
>       'hw/s390x',
>       'hw/scsi',
>       'hw/sd',
Frank Chang May 2, 2024, 11:37 a.m. UTC | #2
Hi Daniel,

On Fri, Mar 8, 2024 at 12:04 AM Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote:
>
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> international process. The latest frozen specifcation can be found
> at:
>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Add the foundation of the device emulation for RISC-V IOMMU, which
> includes an IOMMU that has no capabilities but MSI interrupt support and
> fault queue interfaces. We'll add add more features incrementally in the
> next patches.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>  hw/riscv/Kconfig         |    4 +
>  hw/riscv/meson.build     |    1 +
>  hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
>  hw/riscv/riscv-iommu.h   |  141 ++++
>  hw/riscv/trace-events    |   11 +
>  hw/riscv/trace.h         |    2 +
>  include/hw/riscv/iommu.h |   36 +
>  meson.build              |    1 +
>  8 files changed, 1688 insertions(+)
>  create mode 100644 hw/riscv/riscv-iommu.c
>  create mode 100644 hw/riscv/riscv-iommu.h
>  create mode 100644 hw/riscv/trace-events
>  create mode 100644 hw/riscv/trace.h
>  create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index 5d644eb7b1..faf6a10029 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>  config RISCV_NUMA
>      bool
>
> @@ -38,6 +41,7 @@ config RISCV_VIRT
>      select SERIAL
>      select RISCV_ACLINT
>      select RISCV_APLIC
> +    select RISCV_IOMMU
>      select RISCV_IMSIC
>      select SIFIVE_PLIC
>      select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index 2f7ee81be3..ba9eebd605 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>  riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>
>  hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..df534b99b0
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,1492 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number coversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t pasid:20;          /* Process Address Space ID */
> +    uint64_t __rfu:20;          /* reserved */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without PASID specified. */
> +#define RISCV_IOMMU_NOPASID 0
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t ipsr =
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
> +    if (s->notify && !(ipsr & (1 << vec))) {
> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
> +    }

s->notify is assigned to riscv_iommu_pci_notify() only.
There's no way to assert the wire-signaled interrupt.

We should also check fctl.WSI before asserting the interrupt.
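
Something along these lines, as a rough sketch only (RISCV_IOMMU_REG_FCTL and
RISCV_IOMMU_FCTL_WSI are assumed to be provided by riscv-iommu-bits.h, and
riscv_iommu_wire_notify() is a hypothetical hook for a qemu_irq owned by the
wrapper device):

static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
{
    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
    const uint32_t ipsr =
        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);

    if (ipsr & (1 << vec)) {
        return;                        /* interrupt already pending */
    }

    if (fctl & RISCV_IOMMU_FCTL_WSI) {
        /* wire-signaled interrupt path (hypothetical callback) */
        riscv_iommu_wire_notify(s, (ivec >> (vec * 4)) & 0x0F);
    } else if (s->notify) {
        /* MSI path, as used by riscv-iommu-pci */
        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
    }
}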

> +}
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if (iotlb->perm & IOMMU_WO &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    iotlb->translated_addr = iotlb->iova;
> +    iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +    /* Allow R/W in pass-through mode */
> +    iotlb->perm = IOMMU_RW;
> +    return 0;
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +
> +    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        return res;

The spec says that:
"If msipte access detects a data corruption (a.k.a. poisoned data),
then stop and report "MSI PT data corruption" (cause = 270)."

> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        return MEMTX_ACCESS_ERROR;

The spec says that:
"If msipte.V == 0, then stop and report "MSI PTE not valid" (cause = 262)."

> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        return dma_memory_write(s->target_as, addr, &data, size, attrs);
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        return MEMTX_ACCESS_ERROR;

The spec says that:
"If msipte.M == 0 or msipte.M == 2, then stop and report "MSI PTE
misconfigured" (cause = 263)."

> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    /* Get MRIF enable bits */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and pasid set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> +    const int dc_fmt = !s->enable_msi;
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            /* invalid directory entry */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            /* reserved bits set */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
> +            /* PASID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        return 0;
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> +    return c1->devid == c2->devid && c1->pasid == c2->pasid;
> +}
> +
> +static guint __ctx_hash(gconstpointer v)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> +    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
> +    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
> +}
> +
> +static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid &&
> +        ctx->pasid == arg->pasid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +    uint32_t devid, uint32_t pasid)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    g_hash_table_foreach(ctx_cache, func, &key);
> +    g_hash_table_unref(ctx_cache);
> +}
> +
> +/* Find or allocate translation context for a given {device_id, process_id} */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +    unsigned devid, unsigned pasid, void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
> +        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                          g_free, NULL);
> +        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +    }
> +
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->pasid = pasid;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        g_hash_table_add(ctx_cache, ctx);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {

riscv_iommu_ctx_fetch() may return:
RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED (256)
RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT (257)
RISCV_IOMMU_FQ_CAUSE_DDT_INVALID (258)
RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED (259)

Per the spec, these faults are reported even when DTF is set to 1, so we
should report them regardless of the DTF setting.
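
e.g. something like this in riscv_iommu_ctx(), reusing the cause values the
patch already defines (sketch only):

    /* DDT-level faults (causes 256-259) are never masked by DC.tc.DTF */
    bool always_report =
        fault == RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED ||
        fault == RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT ||
        fault == RISCV_IOMMU_FQ_CAUSE_DDT_INVALID ||
        fault == RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;

    if (always_report || !(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
        /* build and queue the riscv_iommu_fq_record as the patch does below */
    }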

> +        struct riscv_iommu_fq_record ev = { 0 };
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
> +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
> +        riscv_iommu_fault(s, &ev);
> +    }
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref) {
> +        g_hash_table_unref((GHashTable *)ref);
> +    }
> +}
> +
> +/* Find or allocate address space for a given device */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as == NULL) {
> +        char name[64];
> +        as = g_new0(RISCVIOMMUSpace, 1);
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        /* IOVA address space, untranslated addresses */
> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> +            OBJECT(as), name, UINT64_MAX);
> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
> +            TYPE_RISCV_IOMMU_PCI);

Why do we use TYPE_RISCV_IOMMU_PCI as the address space name here?
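
If a per-device name is preferred instead, the buffer built a few lines above
could simply be reused (minor suggestion, not a functional issue):

        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);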

> +
> +        qemu_mutex_lock(&s->core_lock);
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +        qemu_mutex_unlock(&s->core_lock);
> +
> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +    }
> +    return &as->iova_as;
> +}
> +
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    bool enable_faults;
> +    bool enable_pasid;
> +    bool enable_pri;
> +    int fault;
> +
> +    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +
> +    if (enable_pri && fault) {
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pasid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +        return fault;
> +    }
> +
> +    if (enable_faults && fault) {
> +        struct riscv_iommu_fq_record ev;
> +        unsigned ttype;
> +
> +        if (iotlb->perm & IOMMU_RW) {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +        } else {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> +        }
> +        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +        ev.iotval    = iotlb->iova;
> +        ev.iotval2   = iotlb->translated_addr;
> +        ev._reserved = 0;
> +        riscv_iommu_fault(s, &ev);
> +        return fault;
> +    }
> +
> +    return 0;
> +}
> +
> +/* IOMMU Command Interface */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (!notify) {
> +        return MEMTX_OK;
> +    }
> +
> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +        MEMTXATTRS_UNSPECIFIED);

We should also assert the interrupt when IOFENCE.WSI is true and the
IOMMU is configured for wire-signaled interrupts.
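
Roughly like this after the IOFENCE.C command completes in
riscv_iommu_process_cq_tail() (sketch; RISCV_IOMMU_CMD_IOFENCE_WSI is assumed
to be defined in riscv-iommu-bits.h for the command's WSI bit, and the
fctl.WSI gating itself would live in riscv_iommu_notify()):

            if (cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_WSI) {
                riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
            }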

> +}
> +
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    uint64_t old_ddtp = s->ddtp;
> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    bool ok = false;
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    if (new_mode == old_mode ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> +        ok = true;
> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> +    } else {
> +        new_ddtp = old_ddtp;
> +    }
> +    s->ddtp = new_ddtp;
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> +}
> +
> +/* Command function and opcode field. */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    uint64_t cmd_opcode;
> +    GHFunc func;
> +
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        cmd_opcode = get_field(cmd.dword0,
> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> +
> +        switch (cmd_opcode) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            res = riscv_iommu_iofence(s,
> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            }
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = __ctx_inval_all;
> +            } else {
> +                /* invalidate all device context matching DID */
> +                func = __ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = __ctx_inval_devid_pasid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> +            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> +            RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> +            RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +/* Core IOMMU execution activation */
> +enum {
> +    RISCV_IOMMU_EXEC_DDTP,
> +    RISCV_IOMMU_EXEC_CQCSR,
> +    RISCV_IOMMU_EXEC_CQT,
> +    RISCV_IOMMU_EXEC_FQCSR,
> +    RISCV_IOMMU_EXEC_FQH,
> +    RISCV_IOMMU_EXEC_PQCSR,
> +    RISCV_IOMMU_EXEC_PQH,
> +    RISCV_IOMMU_EXEC_TR_REQUEST,
> +    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
> +    RISCV_IOMMU_EXEC_EXIT,
> +};
> +
> +static void *riscv_iommu_core_proc(void* arg)
> +{
> +    RISCVIOMMUState *s = arg;
> +    unsigned exec = 0;
> +    unsigned mask = 0;
> +
> +    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
> +        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
> +        switch (exec & mask) {
> +        case BIT(RISCV_IOMMU_EXEC_DDTP):
> +            riscv_iommu_process_ddtp(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQCSR):
> +            riscv_iommu_process_cq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQT):
> +            riscv_iommu_process_cq_tail(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQCSR):
> +            riscv_iommu_process_fq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQCSR):
> +            riscv_iommu_process_pq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
> +            /* DBG support not implemented yet */
> +            break;
> +        }
> +        exec &= ~mask;
> +        if (!exec) {
> +            qemu_mutex_lock(&s->core_lock);
> +            exec = s->core_exec;
> +            while (!exec) {
> +                qemu_cond_wait(&s->core_cond, &s->core_lock);
> +                exec = s->core_exec;
> +            }
> +            s->core_exec = 0;
> +            qemu_mutex_unlock(&s->core_lock);
> +        }
> +    };
> +
> +    return NULL;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    uint32_t busy = 0;
> +    uint32_t exec = 0;
> +
> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {

Is it ever possible to have a size = 0 or size > 8 write access?
This should be guarded by .valid.min_access_size and .valid.max_access_size.

> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQT);
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +    }
> +
> +    /*
> +     * Registers update might be not synchronized with core logic.
> +     * If system software updates register when relevant BUSY bit is set
> +     * IOMMU behavior of additional writes to the register is UNSPECIFIED
> +     */
> +
> +    qemu_spin_lock(&s->regs_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    }
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    /* Wake up core processing thread. */
> +    if (exec) {
> +        qemu_mutex_lock(&s->core_lock);
> +        s->core_exec |= exec;
> +        qemu_cond_signal(&s->core_cond);
> +        qemu_mutex_unlock(&s->core_lock);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +    uint8_t *ptr;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    ptr = &s->regs_rw[addr];
> +
> +    if (size == 1) {
> +        val = (uint64_t)*ptr;
> +    } else if (size == 2) {
> +        val = lduw_le_p(ptr);
> +    } else if (size == 4) {
> +        val = ldl_le_p(ptr);
> +    } else if (size == 8) {
> +        val = ldq_le_p(ptr);
> +    } else {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *data = val;
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }

Spec says:
"The IOMMU behavior for register accesses where the address is not aligned
to the size of the access, or if the access spans multiple registers,
or if the size
of the access is not 4 bytes or 8 bytes, is UNSPECIFIED."

Section 6.1. Reading and writing IOMMU registers also says:
"Registers that are 64-bit wide may be accessed using either a 32-bit
or a 64-bit access.
Registers that are 32-bit wide must only be accessed using a 32-bit access."

Should we limit the access sizes to only 4 and 8 bytes?
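
Something like the following, maybe (untested sketch, same callbacks as the
existing riscv_iommu_mmio_ops, only the .impl/.valid access sizes changed):

    static const MemoryRegionOps riscv_iommu_mmio_ops = {
        .read_with_attrs = riscv_iommu_mmio_read,
        .write_with_attrs = riscv_iommu_mmio_write,
        .endianness = DEVICE_NATIVE_ENDIAN,
        .impl = {
            /* callbacks only ever see naturally aligned 4/8-byte accesses */
            .min_access_size = 4,
            .max_access_size = 8,
            .unaligned = false,
        },
        .valid = {
            .min_access_size = 4,
            .max_access_size = 8,
        }
    };

If I read the memory core correctly, accesses that fail the .valid checks are
rejected before the callbacks are invoked, so the handlers would no longer
need their own checks for 1- and 2-byte accesses.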

> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide a valid
> + * memory transaction attributes with requested_id set.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res;
> +    void *ref;
> +    uint32_t devid = attrs.requester_id;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx == NULL) {
> +        res = MEMTX_ACCESS_ERROR;
> +    } else {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    riscv_iommu_ctx_put(s, ref);
> +    return res;
> +}
> +
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> +    if (s->enable_msi) {
> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> +    }
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* TODO: method to report supported PASID bits */
> +    s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> +
> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +     /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache */
> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                         g_free, NULL);
> +
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +    qemu_cond_init(&s->core_cond);
> +    qemu_mutex_init(&s->core_lock);
> +    qemu_spin_init(&s->regs_lock);
> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);

In our experience, using a QEMU thread increases the latency of command
queue processing, which can lead to IOMMU fence timeouts in the Linux
driver when using the IOMMU with KVM, e.g. while booting the guest Linux.

Is it possible to remove the thread from the IOMMU, just like the ARM,
AMD, and Intel IOMMU models?
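
For comparison, those models consume the command queue synchronously from the
MMIO write that updates the queue registers, in the vCPU thread itself. A very
rough sketch of what that could look like here (riscv_iommu_process_cq_tail()
and riscv_iommu_process_cq_control() are made-up names for the work that
riscv_iommu_core_proc() does today; the register names are assumed from the
rest of this series):

    /* in riscv_iommu_mmio_write(), instead of signalling the worker thread */
    switch (regb) {
    case RISCV_IOMMU_REG_CQT:
        /* drain newly posted commands right away */
        riscv_iommu_process_cq_tail(s);
        break;
    case RISCV_IOMMU_REG_CQCSR:
        riscv_iommu_process_cq_control(s);
        break;
    /* likewise for FQCSR/PQCSR enable/disable handling */
    }

That removes the extra scheduling latency between the driver writing cqt and
the IOFENCE completion becoming visible.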

> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    /* cancel pending operations and stop */
> +    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
> +    qemu_cond_signal(&s->core_cond);
> +    qemu_mutex_unlock(&s->core_lock);
> +    qemu_thread_join(&s->core_proc);
> +    qemu_cond_destroy(&s->core_cond);
> +    qemu_mutex_destroy(&s->core_lock);
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));

Regarding pci_bus_num(): riscv_iommu_find_as() can be called at a very early
stage, before software has had a chance to enumerate the bus numbers, so the
bus number read here may not be the final one.
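
Other models avoid this by not capturing the bus number in get_address_space()
at all: as far as I can tell, they key the per-device address space on the
PCIBus pointer plus devfn and only resolve the bus number later, when the
device context has to be looked up. A hypothetical sketch of the idea
(riscv_iommu_space_for_bus() does not exist yet):

    static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
    {
        RISCVIOMMUState *s = opaque;

        /* look up or create the space keyed on (bus, devfn), no BDF yet */
        return riscv_iommu_space_for_bus(s, bus, devfn);
    }

    /* ... and build the devid with pci_bus_num() only at translation time,
       once the buses have been enumerated by software. */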




> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +static const PCIIOMMUOps riscv_iommu_ops = {
> +    .get_address_space = riscv_iommu_find_as,
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +        Error **errp)
> +{
> +    if (bus->iommu_ops &&
> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (bus->iommu_ops == NULL) {
> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    return 1 << as->iommu->pasid_bits;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..6f740de690
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,141 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pasid_bits;  /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +    bool enable_msi;      /* Enable MSI remapping */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    QemuSpin regs_lock;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stl_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set, uint64_t clr)
> +{
> +    uint64_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stq_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +
> +
> +
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..42a97caffa
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,11 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..b88504b750
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1,2 @@
> +#include "trace/trace-hw_riscv.h"
> +
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..403b365893
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index c59ca496f2..75e56f3282 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3361,6 +3361,7 @@ if have_system
>      'hw/rdma',
>      'hw/rdma/vmw',
>      'hw/rtc',
> +    'hw/riscv',
>      'hw/s390x',
>      'hw/scsi',
>      'hw/sd',
> --
> 2.43.2
>
>
Daniel Henrique Barboza May 8, 2024, 11:15 a.m. UTC | #3
Hi Frank,

I'll reply with what I've done so far. Some things are still missing:

On 5/2/24 08:37, Frank Chang wrote:
> Hi Daniel,
> 
> Daniel Henrique Barboza <dbarboza@ventanamicro.com> 於 2024年3月8日 週五 上午12:04寫道:
>>
>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>
>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>> international process. The latest frozen specifcation can be found
>> at:
>>
>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>
>> Add the foundation of the device emulation for RISC-V IOMMU, which
>> includes an IOMMU that has no capabilities but MSI interrupt support and
>> fault queue interfaces. We'll add add more features incrementally in the
>> next patches.
>>
>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>> ---
>>   hw/riscv/Kconfig         |    4 +
>>   hw/riscv/meson.build     |    1 +
>>   hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
>>   hw/riscv/riscv-iommu.h   |  141 ++++
>>   hw/riscv/trace-events    |   11 +
>>   hw/riscv/trace.h         |    2 +
>>   include/hw/riscv/iommu.h |   36 +
>>   meson.build              |    1 +
>>   8 files changed, 1688 insertions(+)
>>   create mode 100644 hw/riscv/riscv-iommu.c
>>   create mode 100644 hw/riscv/riscv-iommu.h
>>   create mode 100644 hw/riscv/trace-events
>>   create mode 100644 hw/riscv/trace.h
>>   create mode 100644 include/hw/riscv/iommu.h
>>

(...)

>> +{
>> +    const uint32_t ipsr =
>> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
>> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
>> +    if (s->notify && !(ipsr & (1 << vec))) {
>> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
>> +    }
> 
> s->notify is assigned to riscv_iommu_pci_notify() only.
> There's no way to assert the wire-signaled interrupt.
> 
> We should also check fctl.WSI before asserting the interrupt.
> 

This implementation does not support wire-signalled interrupts. It supports
only MSI, i.e. capabilities.IGS is always MSI (0). For this reason the code is
also not checking fctl.WSI.



>> +}
  (...)

>> +    g_hash_table_unref(ctx_cache);
>> +    *ref = NULL;
>> +
>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
> 
> riscv_iommu_ctx_fetch() may return:
> RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED (256)
> RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT (257)
> RISCV_IOMMU_FQ_CAUSE_DDT_INVALID (258)
> RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED (259)
> 
> These faults are reported even when DTF is set to 1.
> We should report these faults regardless of DTF setting.


I created a riscv_iommu_report_fault() helper to centralize all the fault
reporting logic. The helper checks DTF and, if it is set, inspects 'cause' to
decide whether the fault should still be reported. The helper is then used in
the two places where we build a fault record by hand, and also extensively in
riscv_iommu_msi_write() to handle all the cases you mentioned above where we
weren't issuing faults.
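
The gist of it is something like this (still local, the argument list and the
exact fields read from the context might change; it assumes the context caches
devid and pasid):

    static void riscv_iommu_report_fault(RISCVIOMMUState *s,
                                         RISCVIOMMUContext *ctx,
                                         uint32_t fault_type, uint32_t cause,
                                         bool pv,
                                         uint64_t iotval, uint64_t iotval2)
    {
        struct riscv_iommu_fq_record ev = { 0 };

        if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
            switch (cause) {
            case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
            case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
            case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
            case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
                /* DTF does not suppress DDT-related faults */
                break;
            default:
                /* DTF set: discard the fault */
                return;
            }
        }

        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, pv);
        if (pv) {
            ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
        }
        ev.iotval = iotval;
        ev.iotval2 = iotval2;

        riscv_iommu_fault(s, &ev);
    }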


> 
>> +        struct riscv_iommu_fq_record ev = { 0 };
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
>> +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
>> +        riscv_iommu_fault(s, &ev);
>> +    }
>> +
>> +    g_free(ctx);
>> +    return NULL;
>> +}
>> +
>> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
>> +{
>> +    if (ref) {
>> +        g_hash_table_unref((GHashTable *)ref);
>> +    }
>> +}
>> +
>> +/* Find or allocate address space for a given device */
>> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
>> +{
>> +    RISCVIOMMUSpace *as;
>> +
>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>> +    devid |= s->bus << 8;
>> +
>> +    qemu_mutex_lock(&s->core_lock);
>> +    QLIST_FOREACH(as, &s->spaces, list) {
>> +        if (as->devid == devid) {
>> +            break;
>> +        }
>> +    }
>> +    qemu_mutex_unlock(&s->core_lock);
>> +
>> +    if (as == NULL) {
>> +        char name[64];
>> +        as = g_new0(RISCVIOMMUSpace, 1);
>> +
>> +        as->iommu = s;
>> +        as->devid = devid;
>> +
>> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
>> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>> +
>> +        /* IOVA address space, untranslated addresses */
>> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
>> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
>> +            OBJECT(as), name, UINT64_MAX);
>> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
>> +            TYPE_RISCV_IOMMU_PCI);
> 
> Why do we use TYPE_RISCV_IOMMU_PCI as the address space name here?
> 

This is an error. TYPE_RISCV_IOMMU_PCI is the name of the PCI IOMMU device.

Looking at other IOMMUs in QEMU, the name of the memory region is a simple
string, e.g. "amd_iommu", while the name of the device's address space is
something that includes the device identification.

I'll change this to something like:

         snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
             PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));

         /* IOVA address space, untranslated addresses */
         memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
             TYPE_RISCV_IOMMU_MEMORY_REGION,
             OBJECT(as), "riscv_iommu", UINT64_MAX);
         address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
                            name);

>> +
>> +        qemu_mutex_lock(&s->core_lock);

(...)


>> +    }
>> +
>> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
>> +        MEMTXATTRS_UNSPECIFIED);
> 
> We should also assert the interrupt when IOFENCE.WSI is true
> and IOMMU is configured with wire-signaled interrupt.


I believe that, for the same reason I pointed out earlier ("this implementation
does not support wire-signalled interrupts"), we're not checking IOFENCE.WSI
here.

> 
>> +}
>> +

(...)

>> +
>> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState *s = opaque;
>> +    uint32_t regb = addr & ~3;
>> +    uint32_t busy = 0;
>> +    uint32_t exec = 0;
>> +
>> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
> 
> Is it ever possible to have size = 0 or size > 8 write access?
> This should be guarded by .valid.min_access_size and .valid.max_access_size.

Yes. And on this point:


> 

(...)

>> +
>> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
>> +    .read_with_attrs = riscv_iommu_mmio_read,
>> +    .write_with_attrs = riscv_iommu_mmio_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +        .unaligned = false,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    }
> 
> Spec says:
> "The IOMMU behavior for register accesses where the address is not aligned
> to the size of the access, or if the access spans multiple registers,
> or if the size
> of the access is not 4 bytes or 8 bytes, is UNSPECIFIED."
> 
> Section 6.1. Reading and writing IOMMU registers also says:
> "Registers that are 64-bit wide may be accessed using either a 32-bit
> or a 64-bit access.
> Registers that are 32-bit wide must only be accessed using a 32-bit access."
> 
> Should we limit the access sizes to only 4 and 8 bytes?

Yes. We should set min = 4, max = 8, and use min and max to validate the
access in riscv_iommu_mmio_write().
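
i.e. something like this at the top of riscv_iommu_mmio_write() (sketch; with
.valid bumped to 4/8 the memory core should already filter these accesses out,
so this is mostly a safety net):

    /* only naturally aligned 4- and 8-byte accesses are supported */
    if ((size != 4 && size != 8) || (addr & (size - 1))) {
        return MEMTX_ERROR;
    }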


> 
>> +};
>> +
>> +/*

(...)

>> +
>> +static const MemoryRegionOps riscv_iommu_trap_ops = {
>> +    .read_with_attrs = riscv_iommu_trap_read,
>> +    .write_with_attrs = riscv_iommu_trap_write,
>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +        .unaligned = true,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    }
>> +};

We'll also want to set min = 4 and max = 8 in these ops too.

>> +
>> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)

(...)

>> +
>> +    /* Memory region for untranslated MRIF/MSI writes */
>> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
>> +            "riscv-iommu-trap", ~0ULL);
>> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
>> +
>> +    /* Device translation context cache */
>> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
>> +                                         g_free, NULL);
>> +
>> +    s->iommus.le_next = NULL;
>> +    s->iommus.le_prev = NULL;
>> +    QLIST_INIT(&s->spaces);
>> +    qemu_cond_init(&s->core_cond);
>> +    qemu_mutex_init(&s->core_lock);
>> +    qemu_spin_init(&s->regs_lock);
>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> 
> In our experience, using QEMU thread increases the latency of command
> queue processing,
> which leads to the potential IOMMU fence timeout in the Linux driver
> when using IOMMU with KVM,
> e.g. booting the guest Linux.
> 
> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
> and Intel IOMMU models?

Interesting. We've been using this emulation internally in Ventana, with
KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
more about it.

That said, I don't mind this change, assuming it's feasible to make it for this
first version. I'll need to check how other IOMMUs are doing it.



> 
>> +}
>> +

(...)

>> +
>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>> +{
>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>> +    AddressSpace *as = NULL;
>> +
>> +    if (pdev && pci_is_iommu(pdev)) {
>> +        return s->target_as;
>> +    }
>> +
>> +    /* Find first registered IOMMU device */
>> +    while (s->iommus.le_prev) {
>> +        s = *(s->iommus.le_prev);
>> +    }
>> +
>> +    /* Find first matching IOMMU */
>> +    while (s != NULL && as == NULL) {
>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> 
> For pci_bus_num(),
> riscv_iommu_find_as() can be called at the very early stage
> where software has no chance to enumerate the bus numbers.

I'll see how other IOMMUs are handling their iommu_find_as().


Thanks,


Daniel


> 
> 
> 
> 
>> +        s = s->iommus.le_next;
>> +    }
>> +
>> +    return as ? as : &address_space_memory;
>> +}
>> +
>> +static const PCIIOMMUOps riscv_iommu_ops = {
>> +    .get_address_space = riscv_iommu_find_as,
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +        Error **errp)
>> +{
>> +    if (bus->iommu_ops &&
>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>> +    } else if (bus->iommu_ops == NULL) {
>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>> +    } else {
>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>> +            pci_bus_num(bus));
>> +    }
>> +}
>> +
>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>> +    MemTxAttrs attrs)
>> +{
>> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
>> +}
>> +
>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +    return 1 << as->iommu->pasid_bits;
>> +}
>> +
>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>> +{
>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>> +
>> +    imrc->translate = riscv_iommu_memory_region_translate;
>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>> +}
>> +
>> +static const TypeInfo riscv_iommu_memory_region_info = {
>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>> +    .class_init = riscv_iommu_memory_region_init,
>> +};
>> +
>> +static void riscv_iommu_register_mr_types(void)
>> +{
>> +    type_register_static(&riscv_iommu_memory_region_info);
>> +    type_register_static(&riscv_iommu_info);
>> +}
>> +
>> +type_init(riscv_iommu_register_mr_types);
>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>> new file mode 100644
>> index 0000000000..6f740de690
>> --- /dev/null
>> +++ b/hw/riscv/riscv-iommu.h
>> @@ -0,0 +1,141 @@
>> +/*
>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_STATE_H
>> +#define HW_RISCV_IOMMU_STATE_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "qom/object.h"
>> +
>> +#include "hw/riscv/iommu.h"
>> +
>> +struct RISCVIOMMUState {
>> +    /*< private >*/
>> +    DeviceState parent_obj;
>> +
>> +    /*< public >*/
>> +    uint32_t version;     /* Reported interface version number */
>> +    uint32_t pasid_bits;  /* process identifier width */
>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>> +
>> +    uint64_t cap;         /* IOMMU supported capabilities */
>> +    uint64_t fctl;        /* IOMMU enabled features */
>> +
>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>> +    bool enable_msi;      /* Enable MSI remapping */
>> +
>> +    /* IOMMU Internal State */
>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>> +
>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>> +
>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>> +
>> +    /* interrupt notifier */
>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>> +
>> +    /* IOMMU State Machine */
>> +    QemuThread core_proc; /* Background processing thread */
>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>> +    QemuCond core_cond;   /* Background processing wake up signal */
>> +    unsigned core_exec;   /* Processing thread execution actions */
>> +
>> +    /* IOMMU target address space */
>> +    AddressSpace *target_as;
>> +    MemoryRegion *target_mr;
>> +
>> +    /* MSI / MRIF access trap */
>> +    AddressSpace trap_as;
>> +    MemoryRegion trap_mr;
>> +
>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>> +
>> +    /* MMIO Hardware Interface */
>> +    MemoryRegion regs_mr;
>> +    QemuSpin regs_lock;
>> +    uint8_t *regs_rw;  /* register state (user write) */
>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>> +    uint8_t *regs_ro;  /* read-only mask */
>> +
>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +         Error **errp);
>> +
>> +/* private helpers */
>> +
>> +/* Register helper functions */
>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>> +    unsigned idx, uint32_t set, uint32_t clr)
>> +{
>> +    uint32_t val;
>> +    qemu_spin_lock(&s->regs_lock);
>> +    val = ldl_le_p(s->regs_rw + idx);
>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>> +    unsigned idx, uint32_t set)
>> +{
>> +    qemu_spin_lock(&s->regs_lock);
>> +    stl_le_p(s->regs_rw + idx, set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +}
>> +
>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>> +    unsigned idx)
>> +{
>> +    return ldl_le_p(s->regs_rw + idx);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>> +    unsigned idx, uint64_t set, uint64_t clr)
>> +{
>> +    uint64_t val;
>> +    qemu_spin_lock(&s->regs_lock);
>> +    val = ldq_le_p(s->regs_rw + idx);
>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>> +    unsigned idx, uint64_t set)
>> +{
>> +    qemu_spin_lock(&s->regs_lock);
>> +    stq_le_p(s->regs_rw + idx, set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>> +    unsigned idx)
>> +{
>> +    return ldq_le_p(s->regs_rw + idx);
>> +}
>> +
>> +
>> +
>> +#endif
>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>> new file mode 100644
>> index 0000000000..42a97caffa
>> --- /dev/null
>> +++ b/hw/riscv/trace-events
>> @@ -0,0 +1,11 @@
>> +# See documentation at docs/devel/tracing.rst
>> +
>> +# riscv-iommu.c
>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>> new file mode 100644
>> index 0000000000..b88504b750
>> --- /dev/null
>> +++ b/hw/riscv/trace.h
>> @@ -0,0 +1,2 @@
>> +#include "trace/trace-hw_riscv.h"
>> +
>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>> new file mode 100644
>> index 0000000000..403b365893
>> --- /dev/null
>> +++ b/include/hw/riscv/iommu.h
>> @@ -0,0 +1,36 @@
>> +/*
>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_H
>> +#define HW_RISCV_IOMMU_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "qom/object.h"
>> +
>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>> +
>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>> +
>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>> +
>> +#endif
>> diff --git a/meson.build b/meson.build
>> index c59ca496f2..75e56f3282 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -3361,6 +3361,7 @@ if have_system
>>       'hw/rdma',
>>       'hw/rdma/vmw',
>>       'hw/rtc',
>> +    'hw/riscv',
>>       'hw/s390x',
>>       'hw/scsi',
>>       'hw/sd',
>> --
>> 2.43.2
>>
>>
Frank Chang May 10, 2024, 10:58 a.m. UTC | #4
Hi Daniel,

Daniel Henrique Barboza <dbarboza@ventanamicro.com> 於 2024年5月8日 週三 下午7:16寫道:
>
> Hi Frank,
>
> I'll reply with that I've done so far. Still missing some stuff:
>
> On 5/2/24 08:37, Frank Chang wrote:
> > Hi Daniel,
> >
> > Daniel Henrique Barboza <dbarboza@ventanamicro.com> 於 2024年3月8日 週五 上午12:04寫道:
> >>
> >> From: Tomasz Jeznach <tjeznach@rivosinc.com>
> >>
> >> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> >> international process. The latest frozen specifcation can be found
> >> at:
> >>
> >> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
> >>
> >> Add the foundation of the device emulation for RISC-V IOMMU, which
> >> includes an IOMMU that has no capabilities but MSI interrupt support and
> >> fault queue interfaces. We'll add add more features incrementally in the
> >> next patches.
> >>
> >> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> >> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> >> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> >> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> >> ---
> >>   hw/riscv/Kconfig         |    4 +
> >>   hw/riscv/meson.build     |    1 +
> >>   hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
> >>   hw/riscv/riscv-iommu.h   |  141 ++++
> >>   hw/riscv/trace-events    |   11 +
> >>   hw/riscv/trace.h         |    2 +
> >>   include/hw/riscv/iommu.h |   36 +
> >>   meson.build              |    1 +
> >>   8 files changed, 1688 insertions(+)
> >>   create mode 100644 hw/riscv/riscv-iommu.c
> >>   create mode 100644 hw/riscv/riscv-iommu.h
> >>   create mode 100644 hw/riscv/trace-events
> >>   create mode 100644 hw/riscv/trace.h
> >>   create mode 100644 include/hw/riscv/iommu.h
> >>
>
> (...)
>
> +{
> >> +    const uint32_t ipsr =
> >> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> >> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
> >> +    if (s->notify && !(ipsr & (1 << vec))) {
> >> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
> >> +    }
> >
> > s->notify is assigned to riscv_iommu_pci_notify() only.
> > There's no way to assert the wire-signaled interrupt.
> >
> > We should also check fctl.WSI before asserting the interrupt.
> >
>
> This implementation does not support wire-signalled interrupts. It supports only
> MSI, i.e. capabililities.IGS is always MSI (0). For this reason the code is also
> not checking for fctl.WSI.
>
>
>
> >> +}
>   (...)
>
> >> +    g_hash_table_unref(ctx_cache);
> >> +    *ref = NULL;
> >> +
> >> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
> >
> > riscv_iommu_ctx_fetch() may return:
> > RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED (256)
> > RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT (257)
> > RISCV_IOMMU_FQ_CAUSE_DDT_INVALID (258)
> > RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED (259)
> >
> > These faults are reported even when DTF is set to 1.
> > We should report these faults regardless of DTF setting.
>
>
> I created a "riscv_iommu_report_fault()" helper to centralize all the report fault
> logic. This helper will check for DTF and, if set, we'll check the 'cause' to see if
> we still want the fault to be reported or not. This helper is then used in these 2
> instances where we're creating a fault by hand. It's also used extensively in
> riscv_iommu_msi_write() to handle all the cases you mentioned above where we
> weren't issuing faults.
>
>
> >
> >> +        struct riscv_iommu_fq_record ev = { 0 };
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
> >> +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
> >> +        riscv_iommu_fault(s, &ev);
> >> +    }
> >> +
> >> +    g_free(ctx);
> >> +    return NULL;
> >> +}
> >> +
> >> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> >> +{
> >> +    if (ref) {
> >> +        g_hash_table_unref((GHashTable *)ref);
> >> +    }
> >> +}
> >> +
> >> +/* Find or allocate address space for a given device */
> >> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> >> +{
> >> +    RISCVIOMMUSpace *as;
> >> +
> >> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> >> +    devid |= s->bus << 8;
> >> +
> >> +    qemu_mutex_lock(&s->core_lock);
> >> +    QLIST_FOREACH(as, &s->spaces, list) {
> >> +        if (as->devid == devid) {
> >> +            break;
> >> +        }
> >> +    }
> >> +    qemu_mutex_unlock(&s->core_lock);
> >> +
> >> +    if (as == NULL) {
> >> +        char name[64];
> >> +        as = g_new0(RISCVIOMMUSpace, 1);
> >> +
> >> +        as->iommu = s;
> >> +        as->devid = devid;
> >> +
> >> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> >> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> >> +
> >> +        /* IOVA address space, untranslated addresses */
> >> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> >> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> >> +            OBJECT(as), name, UINT64_MAX);
> >> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
> >> +            TYPE_RISCV_IOMMU_PCI);
> >
> > Why do we use TYPE_RISCV_IOMMU_PCI as the address space name here?
> >
>
> This is an error. TYPE_RISCV_IOMMU_PCI is the name of the PCI IOMMU device.
>
> Seeing other iommus in QEMU it seems like the name of memory region is a simple
> string, e.g. "amd_iommu", and then the name of the address space of the device
> is something that includes the device identification.
>
> I'll change this to something like:
>
>          snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
>              PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>
>          /* IOVA address space, untranslated addresses */
>          memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
>              TYPE_RISCV_IOMMU_MEMORY_REGION,
>              OBJECT(as), "riscv_iommu", UINT64_MAX);
>          address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
>                             name);
>
> >> +
> >> +        qemu_mutex_lock(&s->core_lock);
>
> (...)
>
>
> >> +    }
> >> +
> >> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> >> +        MEMTXATTRS_UNSPECIFIED);
> >
> > We should also assert the interrupt when IOFENCE.WSI is true
> > and IOMMU is configured with wire-signaled interrupt.
>
>
> I believe that for the same reason I pointed earlier ("this implementation does not
> support wire-signalled interrupts") we're not checking for IOFENCE.WSI here.
>
> >
> >> +}
> >> +
>
> (...)
>
> >> +
> >> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> >> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    RISCVIOMMUState *s = opaque;
> >> +    uint32_t regb = addr & ~3;
> >> +    uint32_t busy = 0;
> >> +    uint32_t exec = 0;
> >> +
> >> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
> >
> > Is it ever possible to have size = 0 or size > 8 write access?
> > This should be guarded by .valid.min_access_size and .valid.max_access_size.
>
> Yes. And on this point:
>
>
> >
>
> (...)
>
> >> +
> >> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> >> +    .read_with_attrs = riscv_iommu_mmio_read,
> >> +    .write_with_attrs = riscv_iommu_mmio_write,
> >> +    .endianness = DEVICE_NATIVE_ENDIAN,
> >> +    .impl = {
> >> +        .min_access_size = 1,
> >> +        .max_access_size = 8,
> >> +        .unaligned = false,
> >> +    },
> >> +    .valid = {
> >> +        .min_access_size = 1,
> >> +        .max_access_size = 8,
> >> +    }
> >
> > Spec says:
> > "The IOMMU behavior for register accesses where the address is not aligned
> > to the size of the access, or if the access spans multiple registers,
> > or if the size
> > of the access is not 4 bytes or 8 bytes, is UNSPECIFIED."
> >
> > Section 6.1. Reading and writing IOMMU registers also says:
> > "Registers that are 64-bit wide may be accessed using either a 32-bit
> > or a 64-bit access.
> > Registers that are 32-bit wide must only be accessed using a 32-bit access."
> >
> > Should we limit the access sizes to only 4 and 8 bytes?
>
> Yes. We should set min = 4, max = 8, and use min and max to validate the
> access  in riscv_iommu_mmio_write().
>
>
> >
> >> +};
> >> +
> >> +/*
>
> (...)
>
> >> +
> >> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> >> +    .read_with_attrs = riscv_iommu_trap_read,
> >> +    .write_with_attrs = riscv_iommu_trap_write,
> >> +    .endianness = DEVICE_LITTLE_ENDIAN,
> >> +    .impl = {
> >> +        .min_access_size = 1,
> >> +        .max_access_size = 8,
> >> +        .unaligned = true,
> >> +    },
> >> +    .valid = {
> >> +        .min_access_size = 1,
> >> +        .max_access_size = 8,
> >> +    }
> >> +};
>
> We'll also want set min = 4 and max = 8 in these ops too.
>
> >> +
> >> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>
> (...)
>
> >> +
> >> +    /* Memory region for untranslated MRIF/MSI writes */
> >> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> >> +            "riscv-iommu-trap", ~0ULL);
> >> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> >> +
> >> +    /* Device translation context cache */
> >> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> >> +                                         g_free, NULL);
> >> +
> >> +    s->iommus.le_next = NULL;
> >> +    s->iommus.le_prev = NULL;
> >> +    QLIST_INIT(&s->spaces);
> >> +    qemu_cond_init(&s->core_cond);
> >> +    qemu_mutex_init(&s->core_lock);
> >> +    qemu_spin_init(&s->regs_lock);
> >> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> >> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> >
> > In our experience, using QEMU thread increases the latency of command
> > queue processing,
> > which leads to the potential IOMMU fence timeout in the Linux driver
> > when using IOMMU with KVM,
> > e.g. booting the guest Linux.
> >
> > Is it possible to remove the thread from the IOMMU just like ARM, AMD,
> > and Intel IOMMU models?
>
> Interesting. We've been using this emulation internally in Ventana, with
> KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
> more about it.

We've developed an IOFENCE timeout detection mechanism internally in our Linux
driver to detect long-running IOFENCE commands on the hardware.

We hit that assertion when running on QEMU, and the issue was resolved after we
removed the thread from the IOMMU model. The assertion never triggered on our
hardware.

Regards,
Frank Chang

>
> That said, I don't mind this change, assuming it's feasible to make it for this
> first version.  I'll need to check it how other IOMMUs are doing it.
>
>
>
> >
> >> +}
> >> +
>
> (...)
>
> >> +
> >> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> >> +{
> >> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> >> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> >> +    AddressSpace *as = NULL;
> >> +
> >> +    if (pdev && pci_is_iommu(pdev)) {
> >> +        return s->target_as;
> >> +    }
> >> +
> >> +    /* Find first registered IOMMU device */
> >> +    while (s->iommus.le_prev) {
> >> +        s = *(s->iommus.le_prev);
> >> +    }
> >> +
> >> +    /* Find first matching IOMMU */
> >> +    while (s != NULL && as == NULL) {
> >> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> >
> > For pci_bus_num(),
> > riscv_iommu_find_as() can be called at the very early stage
> > where software has no chance to enumerate the bus numbers.
>
> I'll see how other IOMMUs are handling their iommu_find_as()
>
>
> Thanks,
>
>
> Daniel
>
>
> >
> >
> >
> >
> >> +        s = s->iommus.le_next;
> >> +    }
> >> +
> >> +    return as ? as : &address_space_memory;
> >> +}
> >> +
> >> +static const PCIIOMMUOps riscv_iommu_ops = {
> >> +    .get_address_space = riscv_iommu_find_as,
> >> +};
> >> +
> >> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> >> +        Error **errp)
> >> +{
> >> +    if (bus->iommu_ops &&
> >> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> >> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> >> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> >> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> >> +    } else if (bus->iommu_ops == NULL) {
> >> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> >> +    } else {
> >> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> >> +            pci_bus_num(bus));
> >> +    }
> >> +}
> >> +
> >> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> >> +    MemTxAttrs attrs)
> >> +{
> >> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
> >> +}
> >> +
> >> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> >> +{
> >> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> >> +    return 1 << as->iommu->pasid_bits;
> >> +}
> >> +
> >> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> >> +{
> >> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> >> +
> >> +    imrc->translate = riscv_iommu_memory_region_translate;
> >> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> >> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> >> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> >> +}
> >> +
> >> +static const TypeInfo riscv_iommu_memory_region_info = {
> >> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> >> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> >> +    .class_init = riscv_iommu_memory_region_init,
> >> +};
> >> +
> >> +static void riscv_iommu_register_mr_types(void)
> >> +{
> >> +    type_register_static(&riscv_iommu_memory_region_info);
> >> +    type_register_static(&riscv_iommu_info);
> >> +}
> >> +
> >> +type_init(riscv_iommu_register_mr_types);
> >> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> >> new file mode 100644
> >> index 0000000000..6f740de690
> >> --- /dev/null
> >> +++ b/hw/riscv/riscv-iommu.h
> >> @@ -0,0 +1,141 @@
> >> +/*
> >> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> >> + *
> >> + * Copyright (C) 2022-2023 Rivos Inc.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License along
> >> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#ifndef HW_RISCV_IOMMU_STATE_H
> >> +#define HW_RISCV_IOMMU_STATE_H
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qom/object.h"
> >> +
> >> +#include "hw/riscv/iommu.h"
> >> +
> >> +struct RISCVIOMMUState {
> >> +    /*< private >*/
> >> +    DeviceState parent_obj;
> >> +
> >> +    /*< public >*/
> >> +    uint32_t version;     /* Reported interface version number */
> >> +    uint32_t pasid_bits;  /* process identifier width */
> >> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> >> +
> >> +    uint64_t cap;         /* IOMMU supported capabilities */
> >> +    uint64_t fctl;        /* IOMMU enabled features */
> >> +
> >> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> >> +    bool enable_msi;      /* Enable MSI remapping */
> >> +
> >> +    /* IOMMU Internal State */
> >> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> >> +
> >> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> >> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> >> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> >> +
> >> +    uint32_t cq_mask;     /* Command queue index bit mask */
> >> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> >> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> >> +
> >> +    /* interrupt notifier */
> >> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> >> +
> >> +    /* IOMMU State Machine */
> >> +    QemuThread core_proc; /* Background processing thread */
> >> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> >> +    QemuCond core_cond;   /* Background processing wake up signal */
> >> +    unsigned core_exec;   /* Processing thread execution actions */
> >> +
> >> +    /* IOMMU target address space */
> >> +    AddressSpace *target_as;
> >> +    MemoryRegion *target_mr;
> >> +
> >> +    /* MSI / MRIF access trap */
> >> +    AddressSpace trap_as;
> >> +    MemoryRegion trap_mr;
> >> +
> >> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> >> +
> >> +    /* MMIO Hardware Interface */
> >> +    MemoryRegion regs_mr;
> >> +    QemuSpin regs_lock;
> >> +    uint8_t *regs_rw;  /* register state (user write) */
> >> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> >> +    uint8_t *regs_ro;  /* read-only mask */
> >> +
> >> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> >> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> >> +};
> >> +
> >> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> >> +         Error **errp);
> >> +
> >> +/* private helpers */
> >> +
> >> +/* Register helper functions */
> >> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> >> +    unsigned idx, uint32_t set, uint32_t clr)
> >> +{
> >> +    uint32_t val;
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    val = ldl_le_p(s->regs_rw + idx);
> >> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +    return val;
> >> +}
> >> +
> >> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> >> +    unsigned idx, uint32_t set)
> >> +{
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    stl_le_p(s->regs_rw + idx, set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +}
> >> +
> >> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> >> +    unsigned idx)
> >> +{
> >> +    return ldl_le_p(s->regs_rw + idx);
> >> +}
> >> +
> >> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> >> +    unsigned idx, uint64_t set, uint64_t clr)
> >> +{
> >> +    uint64_t val;
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    val = ldq_le_p(s->regs_rw + idx);
> >> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +    return val;
> >> +}
> >> +
> >> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> >> +    unsigned idx, uint64_t set)
> >> +{
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    stq_le_p(s->regs_rw + idx, set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +}
> >> +
> >> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> >> +    unsigned idx)
> >> +{
> >> +    return ldq_le_p(s->regs_rw + idx);
> >> +}
> >> +
> >> +
> >> +
> >> +#endif
> >> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> >> new file mode 100644
> >> index 0000000000..42a97caffa
> >> --- /dev/null
> >> +++ b/hw/riscv/trace-events
> >> @@ -0,0 +1,11 @@
> >> +# See documentation at docs/devel/tracing.rst
> >> +
> >> +# riscv-iommu.c
> >> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> >> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> >> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> >> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> >> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> >> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> >> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> >> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> >> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> >> new file mode 100644
> >> index 0000000000..b88504b750
> >> --- /dev/null
> >> +++ b/hw/riscv/trace.h
> >> @@ -0,0 +1,2 @@
> >> +#include "trace/trace-hw_riscv.h"
> >> +
> >> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> >> new file mode 100644
> >> index 0000000000..403b365893
> >> --- /dev/null
> >> +++ b/include/hw/riscv/iommu.h
> >> @@ -0,0 +1,36 @@
> >> +/*
> >> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> >> + *
> >> + * Copyright (C) 2022-2023 Rivos Inc.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License along
> >> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#ifndef HW_RISCV_IOMMU_H
> >> +#define HW_RISCV_IOMMU_H
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qom/object.h"
> >> +
> >> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> >> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> >> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> >> +
> >> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> >> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> >> +
> >> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> >> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> >> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> >> +
> >> +#endif
> >> diff --git a/meson.build b/meson.build
> >> index c59ca496f2..75e56f3282 100644
> >> --- a/meson.build
> >> +++ b/meson.build
> >> @@ -3361,6 +3361,7 @@ if have_system
> >>       'hw/rdma',
> >>       'hw/rdma/vmw',
> >>       'hw/rtc',
> >> +    'hw/riscv',
> >>       'hw/s390x',
> >>       'hw/scsi',
> >>       'hw/sd',
> >> --
> >> 2.43.2
> >>
> >>
>
Daniel Henrique Barboza May 13, 2024, 12:37 p.m. UTC | #5
Hi Frank,


On 5/8/24 08:15, Daniel Henrique Barboza wrote:
> Hi Frank,
> 
> I'll reply with that I've done so far. Still missing some stuff:
> 
> On 5/2/24 08:37, Frank Chang wrote:
>> Hi Daniel,
>>
>> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:
>>>
>>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>>
>>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>>> international process. The latest frozen specifcation can be found
>>> at:
>>>
>>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>>
>>> Add the foundation of the device emulation for RISC-V IOMMU, which
>>> includes an IOMMU that has no capabilities but MSI interrupt support and
>>> fault queue interfaces. We'll add add more features incrementally in the
>>> next patches.
>>>
>>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>>> ---
>>>   hw/riscv/Kconfig         |    4 +

(...)

>>> +
>>> +    s->iommus.le_next = NULL;
>>> +    s->iommus.le_prev = NULL;
>>> +    QLIST_INIT(&s->spaces);
>>> +    qemu_cond_init(&s->core_cond);
>>> +    qemu_mutex_init(&s->core_lock);
>>> +    qemu_spin_init(&s->regs_lock);
>>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
>>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
>>
>> In our experience, using QEMU thread increases the latency of command
>> queue processing,
>> which leads to the potential IOMMU fence timeout in the Linux driver
>> when using IOMMU with KVM,
>> e.g. booting the guest Linux.
>>
>> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
>> and Intel IOMMU models?
> 
> Interesting. We've been using this emulation internally in Ventana, with
> KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
> more about it.
> 
> That said, I don't mind this change, assuming it's feasible to make it for this
> first version.  I'll need to check it how other IOMMUs are doing it.


I removed the threading and it seems to be working fine without it. I'll commit this
change for v3.
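
Roughly, the v3 direction is to run the queue processing directly from the
MMIO write path (with the BQL held) instead of waking a background thread.
A sketch only; the process_*() helper names below stand in for the logic
that riscv_iommu_core_proc() used to run and are assumptions here:

    /* Dispatch register-triggered work synchronously from
     * riscv_iommu_mmio_write() instead of signalling core_proc. */
    static void riscv_iommu_process_pending(RISCVIOMMUState *s, uint32_t regb)
    {
        switch (regb) {
        case RISCV_IOMMU_REG_DDTP:
            riscv_iommu_process_ddtp(s);
            break;
        case RISCV_IOMMU_REG_CQT:
            riscv_iommu_process_cq_tail(s);
            break;
        case RISCV_IOMMU_REG_CQCSR:
            riscv_iommu_process_cq_control(s);
            break;
        case RISCV_IOMMU_REG_FQCSR:
            riscv_iommu_process_fq_control(s);
            break;
        case RISCV_IOMMU_REG_PQCSR:
            riscv_iommu_process_pq_control(s);
            break;
        default:
            break;
        }
    }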

> 
> 
> 
>>
>>> +}
>>> +
> 
> (...)
> 
>>> +
>>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>>> +{
>>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>> +    AddressSpace *as = NULL;
>>> +
>>> +    if (pdev && pci_is_iommu(pdev)) {
>>> +        return s->target_as;
>>> +    }
>>> +
>>> +    /* Find first registered IOMMU device */
>>> +    while (s->iommus.le_prev) {
>>> +        s = *(s->iommus.le_prev);
>>> +    }
>>> +
>>> +    /* Find first matching IOMMU */
>>> +    while (s != NULL && as == NULL) {
>>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>>
>> For pci_bus_num(),
>> riscv_iommu_find_as() can be called at the very early stage
>> where software has no chance to enumerate the bus numbers.

I investigated and this doesn't seem to be a problem. This function is called at
the last step of both riscv_iommu_pci_realize() and riscv_iommu_sys_realize(),
and by that time the bus number returned by pci_bus_num() is already assigned.
Other IOMMUs use pci_bus_num() in their own get_address_space() callbacks like
this too.


Thanks,


Daniel


> 
> I'll see how other IOMMUs are handling their iommu_find_as()
> 
> 
> Thanks,
> 
> 
> Daniel
> 
> 
>>
>>
>>
>>
>>> +        s = s->iommus.le_next;
>>> +    }
>>> +
>>> +    return as ? as : &address_space_memory;
>>> +}
>>> +
>>> +static const PCIIOMMUOps riscv_iommu_ops = {
>>> +    .get_address_space = riscv_iommu_find_as,
>>> +};
>>> +
>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>> +        Error **errp)
>>> +{
>>> +    if (bus->iommu_ops &&
>>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>>> +    } else if (bus->iommu_ops == NULL) {
>>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>>> +    } else {
>>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>>> +            pci_bus_num(bus));
>>> +    }
>>> +}
>>> +
>>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>>> +    MemTxAttrs attrs)
>>> +{
>>> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
>>> +}
>>> +
>>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>>> +{
>>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>>> +    return 1 << as->iommu->pasid_bits;
>>> +}
>>> +
>>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>>> +{
>>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>> +
>>> +    imrc->translate = riscv_iommu_memory_region_translate;
>>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>>> +}
>>> +
>>> +static const TypeInfo riscv_iommu_memory_region_info = {
>>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>>> +    .class_init = riscv_iommu_memory_region_init,
>>> +};
>>> +
>>> +static void riscv_iommu_register_mr_types(void)
>>> +{
>>> +    type_register_static(&riscv_iommu_memory_region_info);
>>> +    type_register_static(&riscv_iommu_info);
>>> +}
>>> +
>>> +type_init(riscv_iommu_register_mr_types);
>>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>>> new file mode 100644
>>> index 0000000000..6f740de690
>>> --- /dev/null
>>> +++ b/hw/riscv/riscv-iommu.h
>>> @@ -0,0 +1,141 @@
>>> +/*
>>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>>> + *
>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License as published by
>>> + * the Free Software Foundation; either version 2 of the License.
>>> + *
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU General Public License along
>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#ifndef HW_RISCV_IOMMU_STATE_H
>>> +#define HW_RISCV_IOMMU_STATE_H
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qom/object.h"
>>> +
>>> +#include "hw/riscv/iommu.h"
>>> +
>>> +struct RISCVIOMMUState {
>>> +    /*< private >*/
>>> +    DeviceState parent_obj;
>>> +
>>> +    /*< public >*/
>>> +    uint32_t version;     /* Reported interface version number */
>>> +    uint32_t pasid_bits;  /* process identifier width */
>>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>>> +
>>> +    uint64_t cap;         /* IOMMU supported capabilities */
>>> +    uint64_t fctl;        /* IOMMU enabled features */
>>> +
>>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>>> +    bool enable_msi;      /* Enable MSI remapping */
>>> +
>>> +    /* IOMMU Internal State */
>>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>>> +
>>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>>> +
>>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>>> +
>>> +    /* interrupt notifier */
>>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>>> +
>>> +    /* IOMMU State Machine */
>>> +    QemuThread core_proc; /* Background processing thread */
>>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>>> +    QemuCond core_cond;   /* Background processing wake up signal */
>>> +    unsigned core_exec;   /* Processing thread execution actions */
>>> +
>>> +    /* IOMMU target address space */
>>> +    AddressSpace *target_as;
>>> +    MemoryRegion *target_mr;
>>> +
>>> +    /* MSI / MRIF access trap */
>>> +    AddressSpace trap_as;
>>> +    MemoryRegion trap_mr;
>>> +
>>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>>> +
>>> +    /* MMIO Hardware Interface */
>>> +    MemoryRegion regs_mr;
>>> +    QemuSpin regs_lock;
>>> +    uint8_t *regs_rw;  /* register state (user write) */
>>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>>> +    uint8_t *regs_ro;  /* read-only mask */
>>> +
>>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>>> +};
>>> +
>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>> +         Error **errp);
>>> +
>>> +/* private helpers */
>>> +
>>> +/* Register helper functions */
>>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>>> +    unsigned idx, uint32_t set, uint32_t clr)
>>> +{
>>> +    uint32_t val;
>>> +    qemu_spin_lock(&s->regs_lock);
>>> +    val = ldl_le_p(s->regs_rw + idx);
>>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>> +    qemu_spin_unlock(&s->regs_lock);
>>> +    return val;
>>> +}
>>> +
>>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>>> +    unsigned idx, uint32_t set)
>>> +{
>>> +    qemu_spin_lock(&s->regs_lock);
>>> +    stl_le_p(s->regs_rw + idx, set);
>>> +    qemu_spin_unlock(&s->regs_lock);
>>> +}
>>> +
>>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>>> +    unsigned idx)
>>> +{
>>> +    return ldl_le_p(s->regs_rw + idx);
>>> +}
>>> +
>>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>>> +    unsigned idx, uint64_t set, uint64_t clr)
>>> +{
>>> +    uint64_t val;
>>> +    qemu_spin_lock(&s->regs_lock);
>>> +    val = ldq_le_p(s->regs_rw + idx);
>>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>> +    qemu_spin_unlock(&s->regs_lock);
>>> +    return val;
>>> +}
>>> +
>>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>>> +    unsigned idx, uint64_t set)
>>> +{
>>> +    qemu_spin_lock(&s->regs_lock);
>>> +    stq_le_p(s->regs_rw + idx, set);
>>> +    qemu_spin_unlock(&s->regs_lock);
>>> +}
>>> +
>>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>>> +    unsigned idx)
>>> +{
>>> +    return ldq_le_p(s->regs_rw + idx);
>>> +}
>>> +
>>> +
>>> +
>>> +#endif
>>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>>> new file mode 100644
>>> index 0000000000..42a97caffa
>>> --- /dev/null
>>> +++ b/hw/riscv/trace-events
>>> @@ -0,0 +1,11 @@
>>> +# See documentation at docs/devel/tracing.rst
>>> +
>>> +# riscv-iommu.c
>>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>>> new file mode 100644
>>> index 0000000000..b88504b750
>>> --- /dev/null
>>> +++ b/hw/riscv/trace.h
>>> @@ -0,0 +1,2 @@
>>> +#include "trace/trace-hw_riscv.h"
>>> +
>>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>>> new file mode 100644
>>> index 0000000000..403b365893
>>> --- /dev/null
>>> +++ b/include/hw/riscv/iommu.h
>>> @@ -0,0 +1,36 @@
>>> +/*
>>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>>> + *
>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License as published by
>>> + * the Free Software Foundation; either version 2 of the License.
>>> + *
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU General Public License along
>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#ifndef HW_RISCV_IOMMU_H
>>> +#define HW_RISCV_IOMMU_H
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qom/object.h"
>>> +
>>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>>> +
>>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>>> +
>>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>>> +
>>> +#endif
>>> diff --git a/meson.build b/meson.build
>>> index c59ca496f2..75e56f3282 100644
>>> --- a/meson.build
>>> +++ b/meson.build
>>> @@ -3361,6 +3361,7 @@ if have_system
>>>       'hw/rdma',
>>>       'hw/rdma/vmw',
>>>       'hw/rtc',
>>> +    'hw/riscv',
>>>       'hw/s390x',
>>>       'hw/scsi',
>>>       'hw/sd',
>>> -- 
>>> 2.43.2
>>>
>>>
Daniel Henrique Barboza May 13, 2024, 12:41 p.m. UTC | #6
Hi Frank,

On 5/10/24 07:58, Frank Chang wrote:
> Hi Daniel,
> 
> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Wednesday, May 8, 2024 at 7:16 PM:
>>
>> Hi Frank,
>>
>> I'll reply with that I've done so far. Still missing some stuff:
>>
>> On 5/2/24 08:37, Frank Chang wrote:
>>> Hi Daniel,
>>>
>>> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:


(...)


>>> In our experience, using QEMU thread increases the latency of command
>>> queue processing,
>>> which leads to the potential IOMMU fence timeout in the Linux driver
>>> when using IOMMU with KVM,
>>> e.g. booting the guest Linux.
>>>
>>> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
>>> and Intel IOMMU models?
>>
>> Interesting. We've been using this emulation internally in Ventana, with
>> KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
>> more about it.
> 
> We've developed IOFENCE timeout detection mechanism in our Linux
> driver internally
> to detect the long-run IOFENCE command on the hardware.
> 
> However, we hit the assertion when running on QEMU
> and the issue was resolved after we removed the thread from IOMMU model.
> However, the assertion didn't happen on our hardware.
> 
> Regards,
> Frank CHang


I see. Well, one more reason to remove the threading for v3 then. I removed it and
it seems to be working as usual in my tests, i.e. no perceptible performance or
behavior impacts. Thanks,


Daniel


> 
>>
>> That said, I don't mind this change, assuming it's feasible to make it for this
>> first version.  I'll need to check it how other IOMMUs are doing it.
>>
>>
>>
>>>
>>>> +}
>>>> +
>>
>> (...)
>>
>>>> +
>>>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>>>> +{
>>>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>>>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>>> +    AddressSpace *as = NULL;
>>>> +
>>>> +    if (pdev && pci_is_iommu(pdev)) {
>>>> +        return s->target_as;
>>>> +    }
>>>> +
>>>> +    /* Find first registered IOMMU device */
>>>> +    while (s->iommus.le_prev) {
>>>> +        s = *(s->iommus.le_prev);
>>>> +    }
>>>> +
>>>> +    /* Find first matching IOMMU */
>>>> +    while (s != NULL && as == NULL) {
>>>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>>>
>>> For pci_bus_num(),
>>> riscv_iommu_find_as() can be called at the very early stage
>>> where software has no chance to enumerate the bus numbers.
>>
>> I'll see how other IOMMUs are handling their iommu_find_as()
>>
>>
>> Thanks,
>>
>>
>> Daniel
>>
>>
>>>
>>>
>>>
>>>
>>>> +        s = s->iommus.le_next;
>>>> +    }
>>>> +
>>>> +    return as ? as : &address_space_memory;
>>>> +}
>>>> +
>>>> +static const PCIIOMMUOps riscv_iommu_ops = {
>>>> +    .get_address_space = riscv_iommu_find_as,
>>>> +};
>>>> +
>>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>>> +        Error **errp)
>>>> +{
>>>> +    if (bus->iommu_ops &&
>>>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>>>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>>>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>>>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>>>> +    } else if (bus->iommu_ops == NULL) {
>>>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>>>> +    } else {
>>>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>>>> +            pci_bus_num(bus));
>>>> +    }
>>>> +}
>>>> +
>>>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>>>> +    MemTxAttrs attrs)
>>>> +{
>>>> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
>>>> +}
>>>> +
>>>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>>>> +{
>>>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>>>> +    return 1 << as->iommu->pasid_bits;
>>>> +}
>>>> +
>>>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>>>> +{
>>>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>>> +
>>>> +    imrc->translate = riscv_iommu_memory_region_translate;
>>>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>>>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>>>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>>>> +}
>>>> +
>>>> +static const TypeInfo riscv_iommu_memory_region_info = {
>>>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>>>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>>>> +    .class_init = riscv_iommu_memory_region_init,
>>>> +};
>>>> +
>>>> +static void riscv_iommu_register_mr_types(void)
>>>> +{
>>>> +    type_register_static(&riscv_iommu_memory_region_info);
>>>> +    type_register_static(&riscv_iommu_info);
>>>> +}
>>>> +
>>>> +type_init(riscv_iommu_register_mr_types);
>>>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>>>> new file mode 100644
>>>> index 0000000000..6f740de690
>>>> --- /dev/null
>>>> +++ b/hw/riscv/riscv-iommu.h
>>>> @@ -0,0 +1,141 @@
>>>> +/*
>>>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>>>> + *
>>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License as published by
>>>> + * the Free Software Foundation; either version 2 of the License.
>>>> + *
>>>> + * This program is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> + * GNU General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License along
>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#ifndef HW_RISCV_IOMMU_STATE_H
>>>> +#define HW_RISCV_IOMMU_STATE_H
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qom/object.h"
>>>> +
>>>> +#include "hw/riscv/iommu.h"
>>>> +
>>>> +struct RISCVIOMMUState {
>>>> +    /*< private >*/
>>>> +    DeviceState parent_obj;
>>>> +
>>>> +    /*< public >*/
>>>> +    uint32_t version;     /* Reported interface version number */
>>>> +    uint32_t pasid_bits;  /* process identifier width */
>>>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>>>> +
>>>> +    uint64_t cap;         /* IOMMU supported capabilities */
>>>> +    uint64_t fctl;        /* IOMMU enabled features */
>>>> +
>>>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>>>> +    bool enable_msi;      /* Enable MSI remapping */
>>>> +
>>>> +    /* IOMMU Internal State */
>>>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>>>> +
>>>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>>>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>>>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>>>> +
>>>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>>>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>>>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>>>> +
>>>> +    /* interrupt notifier */
>>>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>>>> +
>>>> +    /* IOMMU State Machine */
>>>> +    QemuThread core_proc; /* Background processing thread */
>>>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>>>> +    QemuCond core_cond;   /* Background processing wake up signal */
>>>> +    unsigned core_exec;   /* Processing thread execution actions */
>>>> +
>>>> +    /* IOMMU target address space */
>>>> +    AddressSpace *target_as;
>>>> +    MemoryRegion *target_mr;
>>>> +
>>>> +    /* MSI / MRIF access trap */
>>>> +    AddressSpace trap_as;
>>>> +    MemoryRegion trap_mr;
>>>> +
>>>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>>>> +
>>>> +    /* MMIO Hardware Interface */
>>>> +    MemoryRegion regs_mr;
>>>> +    QemuSpin regs_lock;
>>>> +    uint8_t *regs_rw;  /* register state (user write) */
>>>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>>>> +    uint8_t *regs_ro;  /* read-only mask */
>>>> +
>>>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>>>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>>>> +};
>>>> +
>>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>>> +         Error **errp);
>>>> +
>>>> +/* private helpers */
>>>> +
>>>> +/* Register helper functions */
>>>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint32_t set, uint32_t clr)
>>>> +{
>>>> +    uint32_t val;
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    val = ldl_le_p(s->regs_rw + idx);
>>>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +    return val;
>>>> +}
>>>> +
>>>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint32_t set)
>>>> +{
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    stl_le_p(s->regs_rw + idx, set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +}
>>>> +
>>>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>>>> +    unsigned idx)
>>>> +{
>>>> +    return ldl_le_p(s->regs_rw + idx);
>>>> +}
>>>> +
>>>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint64_t set, uint64_t clr)
>>>> +{
>>>> +    uint64_t val;
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    val = ldq_le_p(s->regs_rw + idx);
>>>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +    return val;
>>>> +}
>>>> +
>>>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint64_t set)
>>>> +{
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    stq_le_p(s->regs_rw + idx, set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +}
>>>> +
>>>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>>>> +    unsigned idx)
>>>> +{
>>>> +    return ldq_le_p(s->regs_rw + idx);
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +#endif
>>>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>>>> new file mode 100644
>>>> index 0000000000..42a97caffa
>>>> --- /dev/null
>>>> +++ b/hw/riscv/trace-events
>>>> @@ -0,0 +1,11 @@
>>>> +# See documentation at docs/devel/tracing.rst
>>>> +
>>>> +# riscv-iommu.c
>>>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>>>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>>>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>>>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>>>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>>>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>>>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>>>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>>>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>>>> new file mode 100644
>>>> index 0000000000..b88504b750
>>>> --- /dev/null
>>>> +++ b/hw/riscv/trace.h
>>>> @@ -0,0 +1,2 @@
>>>> +#include "trace/trace-hw_riscv.h"
>>>> +
>>>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>>>> new file mode 100644
>>>> index 0000000000..403b365893
>>>> --- /dev/null
>>>> +++ b/include/hw/riscv/iommu.h
>>>> @@ -0,0 +1,36 @@
>>>> +/*
>>>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>>>> + *
>>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License as published by
>>>> + * the Free Software Foundation; either version 2 of the License.
>>>> + *
>>>> + * This program is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> + * GNU General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License along
>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#ifndef HW_RISCV_IOMMU_H
>>>> +#define HW_RISCV_IOMMU_H
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qom/object.h"
>>>> +
>>>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>>>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>>>> +
>>>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>>>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>>>> +
>>>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>>>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>>>> +
>>>> +#endif
>>>> diff --git a/meson.build b/meson.build
>>>> index c59ca496f2..75e56f3282 100644
>>>> --- a/meson.build
>>>> +++ b/meson.build
>>>> @@ -3361,6 +3361,7 @@ if have_system
>>>>        'hw/rdma',
>>>>        'hw/rdma/vmw',
>>>>        'hw/rtc',
>>>> +    'hw/riscv',
>>>>        'hw/s390x',
>>>>        'hw/scsi',
>>>>        'hw/sd',
>>>> --
>>>> 2.43.2
>>>>
>>>>
>>
Daniel Henrique Barboza May 14, 2024, 8:06 p.m. UTC | #7
Hi Jason,

On 5/1/24 08:57, Jason Chien wrote:
> Daniel Henrique Barboza wrote on 2024/3/8 at 12:03 AM:
>> From: Tomasz Jeznach<tjeznach@rivosinc.com>
>>
>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>> international process. The latest frozen specifcation can be found
>> at:
>>
>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>
>> Add the foundation of the device emulation for RISC-V IOMMU, which
>> includes an IOMMU that has no capabilities but MSI interrupt support and
>> fault queue interfaces. We'll add add more features incrementally in the
>> next patches.
>>
>> Co-developed-by: Sebastien Boeuf<seb@rivosinc.com>
>> Signed-off-by: Sebastien Boeuf<seb@rivosinc.com>
>> Signed-off-by: Tomasz Jeznach<tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza<dbarboza@ventanamicro.com>
>> ---
>>   hw/riscv/Kconfig         |    4 +
>>   hw/riscv/meson.build     |    1 +
>>   hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
>>   hw/riscv/riscv-iommu.h   |  141 ++++
>>   hw/riscv/trace-events    |   11 +
>>   hw/riscv/trace.h         |    2 +
>>   include/hw/riscv/iommu.h |   36 +
>>   meson.build              |    1 +
>>   8 files changed, 1688 insertions(+)
>>   create mode 100644 hw/riscv/riscv-iommu.c
>>   create mode 100644 hw/riscv/riscv-iommu.h
>>   create mode 100644 hw/riscv/trace-events
>>   create mode 100644 hw/riscv/trace.h
>>   create mode 100644 include/hw/riscv/iommu.h
>>
>> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
>> index 5d644eb7b1..faf6a10029 100644
>> --- a/hw/riscv/Kconfig
>> +++ b/hw/riscv/Kconfig
>> @@ -1,3 +1,6 @@
>> +config RISCV_IOMMU
>> +    bool
>> +

(...)

>> +
>> +/* IOMMU index for transactions without PASID specified. */
>> +#define RISCV_IOMMU_NOPASID 0
>> +
>> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
>> +{
>> +    const uint32_t ipsr =
>> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
>> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
>> +    if (s->notify && !(ipsr & (1 << vec))) {
>> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
>> +    }
>> +}
> The RISC-V IOMMU also supports WSI.
>> +

I mentioned in the review with Frank that this impl does not support WSI, but
it really seems clearer to do the check here nevertheless. I'll add it.
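
Something along these lines; a sketch only, where the RISCV_IOMMU_FCTL_WSI
field name and the s->irqs[] wired-interrupt plumbing are assumptions to be
filled in by the platform device wrapper:

    static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
    {
        const uint32_t ipsr =
            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
        const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
        const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);

        if (ipsr & (1 << vec)) {
            return; /* interrupt for this cause is already pending */
        }

        if (fctl & RISCV_IOMMU_FCTL_WSI) {
            /* Wired-signaled interrupts: pulse the per-vector IRQ line.
             * (s->irqs[] is an assumption, wired up by the sys device.) */
            qemu_irq_pulse(s->irqs[(ivec >> (vec * 4)) & 0x0F]);
        } else if (s->notify) {
            /* MSI delivery through the registered notifier */
            s->notify(s, (ivec >> (vec * 4)) & 0x0F);
        }
    }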


>> +static void riscv_iommu_fault(RISCVIOMMUState *s,
>> +                              struct riscv_iommu_fq_record *ev)
>> +{

(...)

>> +
>> +    /*
>> +     * Check supported device id width (in bits).
>> +     * See IOMMU Specification, Chapter 6. Software guidelines.
>> +     * - if extended device-context format is used:
>> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
>> +     * - if base device-context format is used:
>> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
>> +     */
>> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> 
> The cause should be 260 not 258.
> 
>  From the RISC-V IOMMU Architecture Spec v1.0.0 section 2.3:
> If the device_id is wider than that supported by the IOMMU mode, as determined by the following checks then stop and report "Transaction type disallowed" (cause = 260).
> a. ddtp.iommu_mode is 2LVL and DDI[2] is not 0
> b. ddtp.iommu_mode is 1LVL and either DDI[2] is not 0 or DDI[1] is not 0
> 

Changed.

>> +    }
>> +
>> +    /* Device directory tree walk */
>> +    for (; depth-- > 0; ) {
>> +        /*
>> +         * Select device id index bits based on device directory tree level
>> +         * and device context format.
>> +         * See IOMMU Specification, Chapter 2. Data Structures.
>> +         * - if extended device-context format is used:
>> +         *   device index: [23:15][14:6][5:0]
>> +         * - if base device-context format is used:
>> +         *   device index: [23:16][15:7][6:0]
>> +         */
>> +        const int split = depth * 9 + 6 + dc_fmt;
>> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
>> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>> +        }
>> +        le64_to_cpus(&de);
>> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
>> +            /* invalid directory entry */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +        }
>> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
>> +            /* reserved bits set */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> 
> The cause should be 259 not 258.
> 
>  From RISC-V IOMMU Architecture Spec v1.0.0 section 2.3.1:
> If any bits or encoding that are reserved for future standard use are set within ddte, stop and report "DDT entry misconfigured" (cause = 259).

Changed

> 
>> +        }
>> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
>> +    }
>> +
>> +    /* index into device context entry page */
>> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
>> +
>> +    memset(&dc, 0, sizeof(dc));
>> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
>> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>> +    }
>> +
>> +    /* Set translation context. */
>> +    ctx->tc = le64_to_cpu(dc.tc);
>> +    ctx->ta = le64_to_cpu(dc.ta);
>> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
>> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
>> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
>> +
> According to RISC-V IOMMU Architecture spec v1.0.0 section 2.1.4, we should do some checks for the found device context.

I added a new helper to validate the device context at this point, following
the steps in section 2.1.4.
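
For reference, it looks roughly like this. This is a partial sketch with only
a couple of the section 2.1.4 checks; RISCV_IOMMU_CAP_ATS,
RISCV_IOMMU_DC_TC_EN_ATS and the msiptp mode macros are assumed names in the
riscv-iommu-bits.h style. Callers report "DDT entry misconfigured" (cause 259)
when it fails:

    static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
                                                RISCVIOMMUContext *ctx)
    {
        uint32_t msi_mode;

        /* ATS-related enables require ATS support in capabilities */
        if (!(s->cap & RISCV_IOMMU_CAP_ATS) &&
            (ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
            return false;
        }

        /* msiptp.MODE must be Off or Flat when MSI translation is reported */
        if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
            msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
            if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
                msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
                return false;
            }
        }

        return true;
    }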

>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +    }
>> +
>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>> +        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
>> +            /* PASID is disabled */
>> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>> +        }
>> +        return 0;
>> +    }
>> +

(...)

>> +
>> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
>> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
>> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
>> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
>> +            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
> cqcsr.fence_w_ip should be set to 0 as well.

Done.


>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
>> +    } else {

(...)

>> +}
>> +
>> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState *s = opaque;
>> +    uint32_t regb = addr & ~3;
>> +    uint32_t busy = 0;
>> +    uint32_t exec = 0;
>> +
>> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
>> +        /* Unsupported MMIO alignment or access size */
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>> +        /* Unsupported MMIO access location. */
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    /* Track actionable MMIO write. */
>> +    switch (regb) {
> 
> There should be a case for IPSR register.
> 
>  From RISC-V IOMMU Architecture Spec v1.0.0 section 5.18:
> If a bit in ipsr is 1 then a write of 1 to the bit transitions the bit from 1→0. If the conditions to set that bit are still present (See [IPSR_FIELDS]) or if they occur after the bit is cleared then that bit transitions again from 0→1.


A new helper to handle ipsr updates via mmio_write was created.
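
A sketch of what it does: write-1-to-clear the written bits, but keep a bit
set if its source condition is still pending. Only the CQ and FQ causes are
shown, and the IPSR/FQCSR field macros are assumed names in the bits-header
style:

    static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
    {
        uint32_t cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
        uint32_t fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
        uint32_t ipsr_set = 0;
        uint32_t ipsr_clr = 0;

        if (data & RISCV_IOMMU_IPSR_CIP) {
            if (cqcsr & (RISCV_IOMMU_CQCSR_FENCE_W_IP |
                         RISCV_IOMMU_CQCSR_CMD_ILL |
                         RISCV_IOMMU_CQCSR_CMD_TO |
                         RISCV_IOMMU_CQCSR_CQMF)) {
                ipsr_set |= RISCV_IOMMU_IPSR_CIP;   /* condition still present */
            } else {
                ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
            }
        }

        if (data & RISCV_IOMMU_IPSR_FIP) {
            if (fqcsr & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
                ipsr_set |= RISCV_IOMMU_IPSR_FIP;
            } else {
                ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
            }
        }

        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
    }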

>> +    case RISCV_IOMMU_REG_DDTP:
>> +    case RISCV_IOMMU_REG_DDTP + 4:
>> +        exec = BIT(RISCV_IOMMU_EXEC_DDTP);

(...)

>> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>> +{
>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>> +
>> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
>> +    if (s->enable_msi) {
>> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>> +    }
>> +    /* Report QEMU target physical address space limits */
>> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>> +                       TARGET_PHYS_ADDR_SPACE_BITS);
>> +
>> +    /* TODO: method to report supported PASID bits */
>> +    s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
>> +    s->cap |= RISCV_IOMMU_CAP_PD8;
>> +
>> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
>> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
>> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
>> +
>> +    /* register storage */
>> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +
>> +     /* Mark all registers read-only */
>> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
>> +
>> +    /*
>> +     * Register complete MMIO space, including MSI/PBA registers.
>> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
>> +     * managed directly by the PCIDevice implementation.
>> +     */
>> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
>> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
>> +
>> +    /* Set power-on register state */
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
> s->fctl is not initialized.

I believe the idea is to initialize it to zero. I'll change it to do that
explicitly.

>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
>> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
>> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],

(...)

>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +        Error **errp)
>> +{
>> +    if (bus->iommu_ops &&
>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>> +    } else if (bus->iommu_ops == NULL) {
>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> The original bus->iommu_op and bus->iommu_opaque will be lost.

I'm not sure what you meant by 'iommu_op'. We have 'iommu_ops', which is checked
for NULL before calling pci_setup_iommu().

As for overwriting the original bus->iommu_opaque, I added an extra iommu_opaque == NULL
check, so the condition becomes:

     } else if (!bus->iommu_ops && !bus->iommu_opaque) {
         pci_setup_iommu(bus, &riscv_iommu_ops, iommu);

This will guarantee that we're not overwriting any existing ops or opaque by accident.



Thanks,


Daniel
Frank Chang May 16, 2024, 7:13 a.m. UTC | #8
On Mon, May 13, 2024 at 8:37 PM Daniel Henrique Barboza <
dbarboza@ventanamicro.com> wrote:

> Hi Frank,
>
>
> On 5/8/24 08:15, Daniel Henrique Barboza wrote:
> > Hi Frank,
> >
> > I'll reply with that I've done so far. Still missing some stuff:
> >
> > On 5/2/24 08:37, Frank Chang wrote:
> >> Hi Daniel,
> >>
> >> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:
> >>>
> >>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
> >>>
> >>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> >>> international process. The latest frozen specifcation can be found
> >>> at:
> >>>
> >>>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
> >>>
> >>> Add the foundation of the device emulation for RISC-V IOMMU, which
> >>> includes an IOMMU that has no capabilities but MSI interrupt support
> and
> >>> fault queue interfaces. We'll add add more features incrementally in
> the
> >>> next patches.
> >>>
> >>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> >>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> >>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> >>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> >>> ---
> >>>   hw/riscv/Kconfig         |    4 +
>
> (...)
>
> >>> +
> >>> +    s->iommus.le_next = NULL;
> >>> +    s->iommus.le_prev = NULL;
> >>> +    QLIST_INIT(&s->spaces);
> >>> +    qemu_cond_init(&s->core_cond);
> >>> +    qemu_mutex_init(&s->core_lock);
> >>> +    qemu_spin_init(&s->regs_lock);
> >>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> >>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> >>
> >> In our experience, using QEMU thread increases the latency of command
> >> queue processing,
> >> which leads to the potential IOMMU fence timeout in the Linux driver
> >> when using IOMMU with KVM,
> >> e.g. booting the guest Linux.
> >>
> >> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
> >> and Intel IOMMU models?
> >
> > Interesting. We've been using this emulation internally in Ventana, with
> > KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
> > more about it.
> >
> > That said, I don't mind this change, assuming it's feasible to make it for this
> > first version.  I'll need to check it how other IOMMUs are doing it.
>
>
> I removed the threading and it seems to be working fine without it. I'll commit this
> change for v3.
>
> >
> >
> >
> >>
> >>> +}
> >>> +
> >
> > (...)
> >
> >>> +
> >>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque,
> int devfn)
> >>> +{
> >>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> >>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> >>> +    AddressSpace *as = NULL;
> >>> +
> >>> +    if (pdev && pci_is_iommu(pdev)) {
> >>> +        return s->target_as;
> >>> +    }
> >>> +
> >>> +    /* Find first registered IOMMU device */
> >>> +    while (s->iommus.le_prev) {
> >>> +        s = *(s->iommus.le_prev);
> >>> +    }
> >>> +
> >>> +    /* Find first matching IOMMU */
> >>> +    while (s != NULL && as == NULL) {
> >>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus),
> devfn));
> >>
> >> For pci_bus_num(),
> >> riscv_iommu_find_as() can be called at the very early stage
> >> where software has no chance to enumerate the bus numbers.
>
> I investigated and this doesn't seem to be a problem. This function is called at the
> last step of the realize() steps of both riscv_iommu_pci_realize() and
> riscv_iommu_sys_realize(), and by that time the pci_bus_num() is already assigned.
> Other iommus use pci_bus_num() into their own get_address_space() callbacks like
> this too.
>

Hi Daniel,

IIUC, pci_bus_num() by default resolves to pcibus_num():

static int pcibus_num(PCIBus *bus)
{
    if (pci_bus_is_root(bus)) {
        return 0; /* pci host bridge */
    }
    return bus->parent_dev->config[PCI_SECONDARY_BUS];
}

If the bus is not the root bus, it reads the bus' parent device's secondary
bus number (PCI_SECONDARY_BUS) field in the PCI configuration space.
This field is supposed to be programmed by software during PCIe enumeration.
But I don't think software has a chance to run before riscv_iommu_sys_realize()
is called, since realize happens very early, before the CPUs start executing,
unless the RISC-V IOMMU is hot-plugged.
Even if the RISC-V IOMMU is hot-plugged, I think riscv_iommu_sys_realize() is
still called before software is aware of the IOMMU in the PCI topology tree.

Do you think this matches your observation?

Regards,
Frank Chang


Daniel Henrique Barboza May 20, 2024, 4:17 p.m. UTC | #9
Hi Frank,

On 5/16/24 04:13, Frank Chang wrote:
> On Mon, May 13, 2024 at 8:37 PM Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote:
> 
>     Hi Frank,
> 
> 
>     On 5/8/24 08:15, Daniel Henrique Barboza wrote:
>      > Hi Frank,
>      >
>      > I'll reply with that I've done so far. Still missing some stuff:
>      >
>      > On 5/2/24 08:37, Frank Chang wrote:
>      >> Hi Daniel,
>      >>
>      >> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:
>      >>>
>      >>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>      >>>
>      >>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>      >>> international process. The latest frozen specifcation can be found
>      >>> at:
>      >>>
>      >>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>      >>>
>      >>> Add the foundation of the device emulation for RISC-V IOMMU, which
>      >>> includes an IOMMU that has no capabilities but MSI interrupt support and
>      >>> fault queue interfaces. We'll add add more features incrementally in the
>      >>> next patches.
>      >>>
>      >>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>      >>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>      >>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>      >>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>      >>> ---
>      >>>   hw/riscv/Kconfig         |    4 +
> 
>     (...)
> 
>      >>> +
>      >>> +    s->iommus.le_next = NULL;
>      >>> +    s->iommus.le_prev = NULL;
>      >>> +    QLIST_INIT(&s->spaces);
>      >>> +    qemu_cond_init(&s->core_cond);
>      >>> +    qemu_mutex_init(&s->core_lock);
>      >>> +    qemu_spin_init(&s->regs_lock);
>      >>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
>      >>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
>      >>
>      >> In our experience, using QEMU thread increases the latency of command
>      >> queue processing,
>      >> which leads to the potential IOMMU fence timeout in the Linux driver
>      >> when using IOMMU with KVM,
>      >> e.g. booting the guest Linux.
>      >>
>      >> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
>      >> and Intel IOMMU models?
>      >
>      > Interesting. We've been using this emulation internally in Ventana, with
>      > KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
>      > more about it.
>      >
>      > That said, I don't mind this change, assuming it's feasible to make it for this
>      > first version.  I'll need to check it how other IOMMUs are doing it.
> 
> 
>     I removed the threading and it seems to be working fine without it. I'll commit this
>     change for v3.
> 
>      >
>      >
>      >
>      >>
>      >>> +}
>      >>> +
>      >
>      > (...)
>      >
>      >>> +
>      >>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>      >>> +{
>      >>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>      >>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>      >>> +    AddressSpace *as = NULL;
>      >>> +
>      >>> +    if (pdev && pci_is_iommu(pdev)) {
>      >>> +        return s->target_as;
>      >>> +    }
>      >>> +
>      >>> +    /* Find first registered IOMMU device */
>      >>> +    while (s->iommus.le_prev) {
>      >>> +        s = *(s->iommus.le_prev);
>      >>> +    }
>      >>> +
>      >>> +    /* Find first matching IOMMU */
>      >>> +    while (s != NULL && as == NULL) {
>      >>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>      >>
>      >> For pci_bus_num(),
>      >> riscv_iommu_find_as() can be called at the very early stage
>      >> where software has no chance to enumerate the bus numbers.
> 
>     I investigated and this doesn't seem to be a problem. This function is called at the
>     last step of the realize() steps of both riscv_iommu_pci_realize() and
>     riscv_iommu_sys_realize(), and by that time the pci_bus_num() is already assigned.
>     Other iommus use pci_bus_num() into their own get_address_space() callbacks like
>     this too.
> 
> 
> Hi Daniel,
> 
> IIUC, pci_bus_num() by default is assigned to pcibus_num():
> 
> static int pcibus_num(PCIBus *bus)
> {
>      if (pci_bus_is_root(bus)) {
>          return 0; /* pci host bridge */
>      }
>      return bus->parent_dev->config[PCI_SECONDARY_BUS];
> }
> 
> If the bus is not the root bus, it tries to read the bus' parent device's
> secondary bus number (PCI_SECONDARY_BUS) field in the PCI configuration space.
> This field should be programmable by the SW during PCIe enumeration.
> But I don't think SW has a chance to be executed before riscv_iommu_sys_realize() is called,
> since it's pretty early before CPU's execution unless RISC-V IOMMU is hot-plugged.
> Even if RISC-V IOMMU is hot-plugged, I think riscv_iommu_sys_realize() is still called
> before SW aware of the existence of IOMMU on the PCI topology tree.
> 
> Do you think this matches your observation?

It does. You have a good point on how pcibus_num() can vary if SW wants to
change PCI_SECONDARY_BUS and the IOMMU isn't on a root bus. Note that this
will not happen with the existing riscv-iommu-pci device as it is now, since it
has code to prevent the device from being attached to non-root PCI buses, but
there are no such restrictions in the riscv-iommu-sys device.
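
For reference, this is a sketch of the kind of root-bus guard I mean, assuming
QEMU's pci_get_bus()/pci_bus_is_root() helpers. It is not the literal code from
the series, just the shape of the check:

#include "qemu/osdep.h"
#include "hw/pci/pci.h"
#include "qapi/error.h"

static void riscv_iommu_pci_realize_sketch(PCIDevice *dev, Error **errp)
{
    /* reject attachment anywhere but the root bus */
    if (!pci_bus_is_root(pci_get_bus(dev))) {
        error_setg(errp, "riscv-iommu-pci must be plugged into the root bus");
        return;
    }

    /* ... the rest of realize would follow here ... */
}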

And speaking of riscv-iommu-sys, the current device we have in this series is too
bare bones, without an actual use case for it (e.g. code to add it to the 'virt'
machine), but it's getting in the way nevertheless.

I'll remove the riscv-iommu-sys device from v3 and re-introduce it in a later
revision or as a follow-up series. Sunil has a handful of patches that add the
riscv-iommu-sys device to the 'virt' machine along with the proper ACPI support
for it [1], and I intend to use them as a base. We'll then need some minor
adjustments in the existing code to make it fully functional, like we're doing
with riscv-iommu-pci.


Thanks,

Daniel


[1] https://github.com/vlsunil/qemu/commits/acpi_rimt_poc_v1/
Frank Chang May 21, 2024, 10:52 a.m. UTC | #10
Hi Daniel,

On Tue, May 21, 2024 at 12:17 AM Daniel Henrique Barboza <
dbarboza@ventanamicro.com> wrote:

> Hi Frank,
>
> On 5/16/24 04:13, Frank Chang wrote:
> > On Mon, May 13, 2024 at 8:37 PM Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote:
> >
> >     Hi Frank,
> >
> >
> >     On 5/8/24 08:15, Daniel Henrique Barboza wrote:
> >      > Hi Frank,
> >      >
> >      > I'll reply with that I've done so far. Still missing some stuff:
> >      >
> >      > On 5/2/24 08:37, Frank Chang wrote:
> >      >> Hi Daniel,
> >      >>
> >      >> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:
> >      >>>
> >      >>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
> >      >>>
> >      >>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> >      >>> international process. The latest frozen specifcation can be
> found
> >      >>> at:
> >      >>>
> >      >>>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
> >      >>>
> >      >>> Add the foundation of the device emulation for RISC-V IOMMU,
> which
> >      >>> includes an IOMMU that has no capabilities but MSI interrupt
> support and
> >      >>> fault queue interfaces. We'll add add more features
> incrementally in the
> >      >>> next patches.
> >      >>>
> >      >>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> >      >>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> >      >>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> >      >>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> >      >>> ---
> >      >>>   hw/riscv/Kconfig         |    4 +
> >
> >     (...)
> >
> >      >>> +
> >      >>> +    s->iommus.le_next = NULL;
> >      >>> +    s->iommus.le_prev = NULL;
> >      >>> +    QLIST_INIT(&s->spaces);
> >      >>> +    qemu_cond_init(&s->core_cond);
> >      >>> +    qemu_mutex_init(&s->core_lock);
> >      >>> +    qemu_spin_init(&s->regs_lock);
> >      >>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> >      >>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> >      >>
> >      >> In our experience, using QEMU thread increases the latency of
> command
> >      >> queue processing,
> >      >> which leads to the potential IOMMU fence timeout in the Linux
> driver
> >      >> when using IOMMU with KVM,
> >      >> e.g. booting the guest Linux.
> >      >>
> >      >> Is it possible to remove the thread from the IOMMU just like
> ARM, AMD,
> >      >> and Intel IOMMU models?
> >      >
> >      > Interesting. We've been using this emulation internally in
> Ventana, with
> >      > KVM and VFIO, and didn't experience this issue. Drew is on CC and
> can talk
> >      > more about it.
> >      >
> >      > That said, I don't mind this change, assuming it's feasible to
> make it for this
> >      > first version.  I'll need to check it how other IOMMUs are doing
> it.
> >
> >
> >     I removed the threading and it seems to be working fine without it.
> I'll commit this
> >     change for v3.
> >
> >      >
> >      >
> >      >
> >      >>
> >      >>> +}
> >      >>> +
> >      >
> >      > (...)
> >      >
> >      >>> +
> >      >>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void
> *opaque, int devfn)
> >      >>> +{
> >      >>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> >      >>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus),
> devfn);
> >      >>> +    AddressSpace *as = NULL;
> >      >>> +
> >      >>> +    if (pdev && pci_is_iommu(pdev)) {
> >      >>> +        return s->target_as;
> >      >>> +    }
> >      >>> +
> >      >>> +    /* Find first registered IOMMU device */
> >      >>> +    while (s->iommus.le_prev) {
> >      >>> +        s = *(s->iommus.le_prev);
> >      >>> +    }
> >      >>> +
> >      >>> +    /* Find first matching IOMMU */
> >      >>> +    while (s != NULL && as == NULL) {
> >      >>> +        as = riscv_iommu_space(s,
> PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> >      >>
> >      >> For pci_bus_num(),
> >      >> riscv_iommu_find_as() can be called at the very early stage
> >      >> where software has no chance to enumerate the bus numbers.
> >
> >     I investigated and this doesn't seem to be a problem. This function
> is called at the
> >     last step of the realize() steps of both riscv_iommu_pci_realize()
> and
> >     riscv_iommu_sys_realize(), and by that time the pci_bus_num() is
> already assigned.
> >     Other iommus use pci_bus_num() into their own get_address_space()
> callbacks like
> >     this too.
> >
> >
> > Hi Daniel,
> >
> > IIUC, pci_bus_num() by default is assigned to pcibus_num():
> >
> > static int pcibus_num(PCIBus *bus)
> > {
> >      if (pci_bus_is_root(bus)) {
> >          return 0; /* pci host bridge */
> >      }
> >      return bus->parent_dev->config[PCI_SECONDARY_BUS];
> > }
> >
> > If the bus is not the root bus, it tries to read the bus' parent device's
> > secondary bus number (PCI_SECONDARY_BUS) field in the PCI configuration
> space.
> > This field should be programmable by the SW during PCIe enumeration.
> > But I don't think SW has a chance to be executed before
> riscv_iommu_sys_realize() is called,
> > since it's pretty early before CPU's execution unless RISC-V IOMMU is
> hot-plugged.
> > Even if RISC-V IOMMU is hot-plugged, I think riscv_iommu_sys_realize()
> is still called
> > before SW aware of the existence of IOMMU on the PCI topology tree.
> >
> > Do you think this matches your observation?
>
> It does. You have a good point on how the pcibus_num() can vary if SW
> wants to
> change the PCI_SECONDARY_BUS and the IOMMU isn't in a root bus. Note that
> this
> will not happen with the existing riscv-iommu-pci device as it is now,
> since it
> has code to prevent the device to be attached to non-pci root buses, but
> there's
> no restrictions in the riscv-iommu-sys device.
>

Thanks for the explanation.

Do you know where this limitation comes from?
Is it in this patchset, or is it somewhere else in the Linux RISC-V IOMMU
driver?

BTW, for a case like the DesignWare PCIe host controller [1],
we cannot connect the RISC-V IOMMU to the root bus ("pcie") [2]
because it already has a child bus ("dw-pcie") connected to it [3].

If we try to connect the RISC-V IOMMU to the root bus ("pcie"),
it can't be discovered by the Linux PCIe driver, as a PCIe Downstream Port
normally leads to a Link with only Device 0 on it.

PCIe spec 6.0, section 7.3.1 states:
"Downstream Ports that do not have ARI Forwarding enabled must associate
only Device 0 with the device attached to the Logical Bus representing the
Link from the Port."

The PCI slot scan returns early in the Linux PCIe driver [4][5].
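
To make the rule concrete, here is a tiny standalone model of what that scan
logic boils down to. It is a paraphrase of the Device-0-only rule quoted above,
not the kernel code itself; the helper name and flags are made up for the
example:

#include <stdio.h>

#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)

/* below a Downstream Port without ARI Forwarding, only Device 0 is scanned */
static int slot_is_scanned(int below_downstream_port, int ari_forwarding,
                           int devfn)
{
    if (below_downstream_port && !ari_forwarding) {
        return PCI_SLOT(devfn) == 0;
    }
    return 1;
}

int main(void)
{
    printf("devfn 0x00 scanned: %d\n", slot_is_scanned(1, 0, 0x00)); /* 1 */
    printf("devfn 0x08 scanned: %d\n", slot_is_scanned(1, 0, 0x08)); /* 0 */
    return 0;
}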

Do you think it's possible to remove this limitation?

[1] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c
[2] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c#L695
[3] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c#L409
[4] https://github.com/torvalds/linux/blob/master/drivers/pci/probe.c#L2674
[5] https://github.com/torvalds/linux/blob/master/drivers/pci/probe.c#L2652

Regards,
Frank Chang

Daniel Henrique Barboza May 21, 2024, 12:28 p.m. UTC | #11
On 5/21/24 07:52, Frank Chang wrote:
> Hi Daniel,
> 
> On Tue, May 21, 2024 at 12:17 AM Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote:
> 
>     Hi Frank,
> 
>     On 5/16/24 04:13, Frank Chang wrote:
>      > On Mon, May 13, 2024 at 8:37 PM Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote:
>      >
>      >     Hi Frank,
>      >
>      >
>      >     On 5/8/24 08:15, Daniel Henrique Barboza wrote:
>      >      > Hi Frank,
>      >      >
>      >      > I'll reply with that I've done so far. Still missing some stuff:
>      >      >
>      >      > On 5/2/24 08:37, Frank Chang wrote:
>      >      >> Hi Daniel,
>      >      >>
>      >      >> Daniel Henrique Barboza <dbarboza@ventanamicro.com> wrote on Friday, March 8, 2024 at 12:04 AM:
>      >      >>>
>      >      >>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>      >      >>>
>      >      >>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>      >      >>> international process. The latest frozen specifcation can be found
>      >      >>> at:
>      >      >>>
>      >      >>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>      >      >>>
>      >      >>> Add the foundation of the device emulation for RISC-V IOMMU, which
>      >      >>> includes an IOMMU that has no capabilities but MSI interrupt support and
>      >      >>> fault queue interfaces. We'll add add more features incrementally in the
>      >      >>> next patches.
>      >      >>>
>      >      >>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>      >      >>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>      >      >>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>      >      >>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>      >      >>> ---
>      >      >>>   hw/riscv/Kconfig         |    4 +
>      >
>      >     (...)
>      >
>      >      >>> +
>      >      >>> +    s->iommus.le_next = NULL;
>      >      >>> +    s->iommus.le_prev = NULL;
>      >      >>> +    QLIST_INIT(&s->spaces);
>      >      >>> +    qemu_cond_init(&s->core_cond);
>      >      >>> +    qemu_mutex_init(&s->core_lock);
>      >      >>> +    qemu_spin_init(&s->regs_lock);
>      >      >>> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
>      >      >>> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
>      >      >>
>      >      >> In our experience, using QEMU thread increases the latency of command
>      >      >> queue processing,
>      >      >> which leads to the potential IOMMU fence timeout in the Linux driver
>      >      >> when using IOMMU with KVM,
>      >      >> e.g. booting the guest Linux.
>      >      >>
>      >      >> Is it possible to remove the thread from the IOMMU just like ARM, AMD,
>      >      >> and Intel IOMMU models?
>      >      >
>      >      > Interesting. We've been using this emulation internally in Ventana, with
>      >      > KVM and VFIO, and didn't experience this issue. Drew is on CC and can talk
>      >      > more about it.
>      >      >
>      >      > That said, I don't mind this change, assuming it's feasible to make it for this
>      >      > first version.  I'll need to check it how other IOMMUs are doing it.
>      >
>      >
>      >     I removed the threading and it seems to be working fine without it. I'll commit this
>      >     change for v3.
>      >
>      >      >
>      >      >
>      >      >
>      >      >>
>      >      >>> +}
>      >      >>> +
>      >      >
>      >      > (...)
>      >      >
>      >      >>> +
>      >      >>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>      >      >>> +{
>      >      >>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>      >      >>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>      >      >>> +    AddressSpace *as = NULL;
>      >      >>> +
>      >      >>> +    if (pdev && pci_is_iommu(pdev)) {
>      >      >>> +        return s->target_as;
>      >      >>> +    }
>      >      >>> +
>      >      >>> +    /* Find first registered IOMMU device */
>      >      >>> +    while (s->iommus.le_prev) {
>      >      >>> +        s = *(s->iommus.le_prev);
>      >      >>> +    }
>      >      >>> +
>      >      >>> +    /* Find first matching IOMMU */
>      >      >>> +    while (s != NULL && as == NULL) {
>      >      >>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>      >      >>
>      >      >> For pci_bus_num(), riscv_iommu_find_as() can be called at a very early
>      >      >> stage, before software has had a chance to enumerate the bus numbers.
>      >
>      >     I investigated and this doesn't seem to be a problem. This function is called as
>      >     the last step of both riscv_iommu_pci_realize() and riscv_iommu_sys_realize(),
>      >     and by that time pci_bus_num() is already assigned. Other IOMMUs use
>      >     pci_bus_num() in their own get_address_space() callbacks like this too.
>      >
>      >
>      > Hi Daniel,
>      >
>      > IIUC, pci_bus_num() by default resolves to pcibus_num():
>      >
>      > static int pcibus_num(PCIBus *bus)
>      > {
>      >      if (pci_bus_is_root(bus)) {
>      >          return 0; /* pci host bridge */
>      >      }
>      >      return bus->parent_dev->config[PCI_SECONDARY_BUS];
>      > }
>      >
>      > If the bus is not the root bus, it reads the parent device's secondary bus
>      > number (PCI_SECONDARY_BUS) field in the PCI configuration space.
>      > This field is programmed by SW during PCIe enumeration.
>      > But I don't think SW has a chance to run before riscv_iommu_sys_realize() is called,
>      > since that happens well before the CPUs start executing, unless the RISC-V IOMMU is hot-plugged.
>      > Even if the RISC-V IOMMU is hot-plugged, I think riscv_iommu_sys_realize() is still
>      > called before SW is aware of the IOMMU's existence in the PCI topology tree.
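>      > To make the timing concern concrete, the requester id the IOMMU derives is just
>      > the bus number and devfn packed together, e.g. (hypothetical helper, not code
>      > from this series):
>      >
>      > static uint32_t example_requester_id(PCIBus *bus, int devfn)
>      > {
>      >     /*
>      >      * pci_bus_num() reads PCI_SECONDARY_BUS for non-root buses, which is
>      >      * still 0 until the guest or firmware enumerates the hierarchy, so an
>      >      * id computed this early would alias with devices really on bus 0.
>      >      */
>      >     return PCI_BUILD_BDF(pci_bus_num(bus), devfn);
>      > }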
>      >
>      > Do you think this matches your observation?
> 
>     It does. You have a good point on how pcibus_num() can vary if SW wants to
>     change PCI_SECONDARY_BUS and the IOMMU isn't on a root bus. Note that this
>     will not happen with the existing riscv-iommu-pci device as it is now, since it
>     has code to prevent the device from being attached to non-root PCI buses, but
>     there are no such restrictions in the riscv-iommu-sys device.
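>     For context, the kind of guard I mean is roughly the following (a sketch only,
>     the actual check in the series isn't quoted here):
>
>     static void riscv_iommu_pci_realize(PCIDevice *pdev, Error **errp)
>     {
>         /* Sketch: refuse attachment behind a bridge, so that pci_bus_num() is
>          * stable by the time the IOMMU address spaces are created. */
>         if (!pci_bus_is_root(pci_get_bus(pdev))) {
>             error_setg(errp, "riscv-iommu-pci must be attached to a root PCI bus");
>             return;
>         }
>         /* ... rest of realize ... */
>     }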
> 
> 
> Thanks for the explanation.
> 
> Do you know where this limitation is from?
> Is it in this patchset, or is it somewhere else in the Linux RISC-V IOMMU driver?

I don't know. I know that the riscv-iommu spec does not impose these restrictions, so
I assume that this is a design choice from Tomasz when designing both the first
QEMU impl and the Linux driver.


> 
> BTW, for a case like the DesignWare PCIe host controller [1],
> we cannot connect the RISC-V IOMMU to the root bus ("pcie") [2]
> because it already has a child bus ("dw-pcie") connected to it [3].
> 
> If we try to connect the RISC-V IOMMU to the root bus ("pcie"),
> it can't be discovered by the Linux PCIe driver, as a PCIe Downstream Port
> normally leads to a Link with only Device 0 on it.
> 
> PCIe spec 6.0, section 7.3.1 states:
> "Downstream Ports that do not have ARI Forwarding enabled must associate
> only Device 0 with the device attached to the Logical Bus representing the Link
> from the Port."
> 
> The PCI slot scan returns early in the Linux PCIe driver [4][5].
> 
> Do you think it's possible to remove this limitation?

I'm pretty sure it is. It's only a matter of how much code and effort we're
willing to put into it.

For this initial impl I believe we can live with this restriction. We will enable more
use cases as we go along on both Linux and QEMU.


Thanks,

Daniel



> 
> [1] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c
> [2] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c#L695
> [3] https://github.com/qemu/qemu/blob/master/hw/pci-host/designware.c#L409
> [4] https://github.com/torvalds/linux/blob/master/drivers/pci/probe.c#L2674
> [5] https://github.com/torvalds/linux/blob/master/drivers/pci/probe.c#L2652
> 
> Regards,
> Frank Chang
> 
> 
>     And speaking of riscv-iommu-sys, the current device we have in this series is too
>     bare bones, without an actual use case for it (e.g. code to add it to the 'virt'
>     machine), but it's getting in the way nevertheless.
> 
>     I'll remove the riscv-iommu-sys device from v3 and re-introduce it in a later
>     revision or as a follow-up series. Sunil has a handful of patches that add the
>     riscv-iommu-sys device to the 'virt' machine and the proper ACPI support for it [1],
>     and I intend to use them as a base. We'll then need some minor adjustments in the
>     existing code to make it fully functional, like we're doing with riscv-iommu-pci.
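>     For reference, wiring riscv-iommu-sys into hw/riscv/virt.c would presumably look
>     something like the sketch below, assuming the QOM type string is "riscv-iommu-sys".
>     The memmap entry, IRQ number and irqchip variable are made-up placeholders here,
>     not taken from Sunil's branch:
>
>     DeviceState *iommu_sys = qdev_new("riscv-iommu-sys");
>
>     sysbus_realize_and_unref(SYS_BUS_DEVICE(iommu_sys), &error_fatal);
>     sysbus_mmio_map(SYS_BUS_DEVICE(iommu_sys), 0, memmap[VIRT_IOMMU_SYS].base);
>     sysbus_connect_irq(SYS_BUS_DEVICE(iommu_sys), 0,
>                        qdev_get_gpio_in(irqchip, IOMMU_SYS_IRQ));
>
>     plus the corresponding device tree node and the ACPI (RIMT) description.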
> 
> 
>     Thanks,
> 
>     Daniel
> 
> 
>     [1] https://github.com/vlsunil/qemu/commits/acpi_rimt_poc_v1/
>      >
>      > Regards,
>      > Frank Chang
>      >
>      >
>      >
>      >     Thanks,
>      >
>      >
>      >     Daniel
>      >
>      >
>      >      >
>      >      > I'll see how other IOMMUs are handling their iommu_find_as()
>      >      >
>      >      >
>      >      > Thanks,
>      >      >
>      >      >
>      >      > Daniel
>      >      >
>      >      >
>      >      >>
>      >      >>
>      >      >>
>      >      >>
>      >      >>> +        s = s->iommus.le_next;
>      >      >>> +    }
>      >      >>> +
>      >      >>> +    return as ? as : &address_space_memory;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static const PCIIOMMUOps riscv_iommu_ops = {
>      >      >>> +    .get_address_space = riscv_iommu_find_as,
>      >      >>> +};
>      >      >>> +
>      >      >>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>      >      >>> +        Error **errp)
>      >      >>> +{
>      >      >>> +    if (bus->iommu_ops &&
>      >      >>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>      >      >>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>      >      >>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>      >      >>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>      >      >>> +    } else if (bus->iommu_ops == NULL) {
>      >      >>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>      >      >>> +    } else {
>      >      >>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>      >      >>> +            pci_bus_num(bus));
>      >      >>> +    }
>      >      >>> +}
>      >      >>> +
>      >      >>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>      >      >>> +    MemTxAttrs attrs)
>      >      >>> +{
>      >      >>> +    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>      >      >>> +{
>      >      >>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>      >      >>> +    return 1 << as->iommu->pasid_bits;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>      >      >>> +{
>      >      >>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>      >      >>> +
>      >      >>> +    imrc->translate = riscv_iommu_memory_region_translate;
>      >      >>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>      >      >>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>      >      >>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static const TypeInfo riscv_iommu_memory_region_info = {
>      >      >>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>      >      >>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>      >      >>> +    .class_init = riscv_iommu_memory_region_init,
>      >      >>> +};
>      >      >>> +
>      >      >>> +static void riscv_iommu_register_mr_types(void)
>      >      >>> +{
>      >      >>> +    type_register_static(&riscv_iommu_memory_region_info);
>      >      >>> +    type_register_static(&riscv_iommu_info);
>      >      >>> +}
>      >      >>> +
>      >      >>> +type_init(riscv_iommu_register_mr_types);
>      >      >>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>      >      >>> new file mode 100644
>      >      >>> index 0000000000..6f740de690
>      >      >>> --- /dev/null
>      >      >>> +++ b/hw/riscv/riscv-iommu.h
>      >      >>> @@ -0,0 +1,141 @@
>      >      >>> +/*
>      >      >>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>      >      >>> + *
>      >      >>> + * Copyright (C) 2022-2023 Rivos Inc.
>      >      >>> + *
>      >      >>> + * This program is free software; you can redistribute it and/or modify
>      >      >>> + * it under the terms of the GNU General Public License as published by
>      >      >>> + * the Free Software Foundation; either version 2 of the License.
>      >      >>> + *
>      >      >>> + * This program is distributed in the hope that it will be useful,
>      >      >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>      >      >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>      >      >>> + * GNU General Public License for more details.
>      >      >>> + *
>      >      >>> + * You should have received a copy of the GNU General Public License along
>      >      >>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>      >      >>> + */
>      >      >>> +
>      >      >>> +#ifndef HW_RISCV_IOMMU_STATE_H
>      >      >>> +#define HW_RISCV_IOMMU_STATE_H
>      >      >>> +
>      >      >>> +#include "qemu/osdep.h"
>      >      >>> +#include "qom/object.h"
>      >      >>> +
>      >      >>> +#include "hw/riscv/iommu.h"
>      >      >>> +
>      >      >>> +struct RISCVIOMMUState {
>      >      >>> +    /*< private >*/
>      >      >>> +    DeviceState parent_obj;
>      >      >>> +
>      >      >>> +    /*< public >*/
>      >      >>> +    uint32_t version;     /* Reported interface version number */
>      >      >>> +    uint32_t pasid_bits;  /* process identifier width */
>      >      >>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>      >      >>> +
>      >      >>> +    uint64_t cap;         /* IOMMU supported capabilities */
>      >      >>> +    uint64_t fctl;        /* IOMMU enabled features */
>      >      >>> +
>      >      >>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>      >      >>> +    bool enable_msi;      /* Enable MSI remapping */
>      >      >>> +
>      >      >>> +    /* IOMMU Internal State */
>      >      >>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>      >      >>> +
>      >      >>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>      >      >>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>      >      >>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>      >      >>> +
>      >      >>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>      >      >>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>      >      >>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>      >      >>> +
>      >      >>> +    /* interrupt notifier */
>      >      >>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>      >      >>> +
>      >      >>> +    /* IOMMU State Machine */
>      >      >>> +    QemuThread core_proc; /* Background processing thread */
>      >      >>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>      >      >>> +    QemuCond core_cond;   /* Background processing wake up signal */
>      >      >>> +    unsigned core_exec;   /* Processing thread execution actions */
>      >      >>> +
>      >      >>> +    /* IOMMU target address space */
>      >      >>> +    AddressSpace *target_as;
>      >      >>> +    MemoryRegion *target_mr;
>      >      >>> +
>      >      >>> +    /* MSI / MRIF access trap */
>      >      >>> +    AddressSpace trap_as;
>      >      >>> +    MemoryRegion trap_mr;
>      >      >>> +
>      >      >>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>      >      >>> +
>      >      >>> +    /* MMIO Hardware Interface */
>      >      >>> +    MemoryRegion regs_mr;
>      >      >>> +    QemuSpin regs_lock;
>      >      >>> +    uint8_t *regs_rw;  /* register state (user write) */
>      >      >>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>      >      >>> +    uint8_t *regs_ro;  /* read-only mask */
>      >      >>> +
>      >      >>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>      >      >>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>      >      >>> +};
>      >      >>> +
>      >      >>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>      >      >>> +         Error **errp);
>      >      >>> +
>      >      >>> +/* private helpers */
>      >      >>> +
>      >      >>> +/* Register helper functions */
>      >      >>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx, uint32_t set, uint32_t clr)
>      >      >>> +{
>      >      >>> +    uint32_t val;
>      >      >>> +    qemu_spin_lock(&s->regs_lock);
>      >      >>> +    val = ldl_le_p(s->regs_rw + idx);
>      >      >>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>      >      >>> +    qemu_spin_unlock(&s->regs_lock);
>      >      >>> +    return val;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx, uint32_t set)
>      >      >>> +{
>      >      >>> +    qemu_spin_lock(&s->regs_lock);
>      >      >>> +    stl_le_p(s->regs_rw + idx, set);
>      >      >>> +    qemu_spin_unlock(&s->regs_lock);
>      >      >>> +}
>      >      >>> +
>      >      >>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx)
>      >      >>> +{
>      >      >>> +    return ldl_le_p(s->regs_rw + idx);
>      >      >>> +}
>      >      >>> +
>      >      >>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx, uint64_t set, uint64_t clr)
>      >      >>> +{
>      >      >>> +    uint64_t val;
>      >      >>> +    qemu_spin_lock(&s->regs_lock);
>      >      >>> +    val = ldq_le_p(s->regs_rw + idx);
>      >      >>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>      >      >>> +    qemu_spin_unlock(&s->regs_lock);
>      >      >>> +    return val;
>      >      >>> +}
>      >      >>> +
>      >      >>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx, uint64_t set)
>      >      >>> +{
>      >      >>> +    qemu_spin_lock(&s->regs_lock);
>      >      >>> +    stq_le_p(s->regs_rw + idx, set);
>      >      >>> +    qemu_spin_unlock(&s->regs_lock);
>      >      >>> +}
>      >      >>> +
>      >      >>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>      >      >>> +    unsigned idx)
>      >      >>> +{
>      >      >>> +    return ldq_le_p(s->regs_rw + idx);
>      >      >>> +}
>      >      >>> +
>      >      >>> +
>      >      >>> +
>      >      >>> +#endif
>      >      >>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>      >      >>> new file mode 100644
>      >      >>> index 0000000000..42a97caffa
>      >      >>> --- /dev/null
>      >      >>> +++ b/hw/riscv/trace-events
>      >      >>> @@ -0,0 +1,11 @@
>      >      >>> +# See documentation at docs/devel/tracing.rst
>      >      >>> +
>      >      >>> +# riscv-iommu.c
>      >      >>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>      >      >>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>      >      >>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>      >      >>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>      >      >>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>      >      >>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>      >      >>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>      >      >>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>      >      >>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>      >      >>> new file mode 100644
>      >      >>> index 0000000000..b88504b750
>      >      >>> --- /dev/null
>      >      >>> +++ b/hw/riscv/trace.h
>      >      >>> @@ -0,0 +1,2 @@
>      >      >>> +#include "trace/trace-hw_riscv.h"
>      >      >>> +
>      >      >>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>      >      >>> new file mode 100644
>      >      >>> index 0000000000..403b365893
>      >      >>> --- /dev/null
>      >      >>> +++ b/include/hw/riscv/iommu.h
>      >      >>> @@ -0,0 +1,36 @@
>      >      >>> +/*
>      >      >>> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
>      >      >>> + *
>      >      >>> + * Copyright (C) 2022-2023 Rivos Inc.
>      >      >>> + *
>      >      >>> + * This program is free software; you can redistribute it and/or modify
>      >      >>> + * it under the terms of the GNU General Public License as published by
>      >      >>> + * the Free Software Foundation; either version 2 of the License.
>      >      >>> + *
>      >      >>> + * This program is distributed in the hope that it will be useful,
>      >      >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>      >      >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>      >      >>> + * GNU General Public License for more details.
>      >      >>> + *
>      >      >>> + * You should have received a copy of the GNU General Public License along
>      >      >>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>      >      >>> + */
>      >      >>> +
>      >      >>> +#ifndef HW_RISCV_IOMMU_H
>      >      >>> +#define HW_RISCV_IOMMU_H
>      >      >>> +
>      >      >>> +#include "qemu/osdep.h"
>      >      >>> +#include "qom/object.h"
>      >      >>> +
>      >      >>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>      >      >>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>      >      >>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>      >      >>> +
>      >      >>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>      >      >>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>      >      >>> +
>      >      >>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>      >      >>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>      >      >>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>      >      >>> +
>      >      >>> +#endif
>      >      >>> diff --git a/meson.build b/meson.build
>      >      >>> index c59ca496f2..75e56f3282 100644
>      >      >>> --- a/meson.build
>      >      >>> +++ b/meson.build
>      >      >>> @@ -3361,6 +3361,7 @@ if have_system
>      >      >>>       'hw/rdma',
>      >      >>>       'hw/rdma/vmw',
>      >      >>>       'hw/rtc',
>      >      >>> +    'hw/riscv',
>      >      >>>       'hw/s390x',
>      >      >>>       'hw/scsi',
>      >      >>>       'hw/sd',
>      >      >>> --
>      >      >>> 2.43.2
>      >      >>>
>      >      >>>
>      >
>
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 5d644eb7b1..faf6a10029 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@ 
+config RISCV_IOMMU
+    bool
+
 config RISCV_NUMA
     bool
 
@@ -38,6 +41,7 @@  config RISCV_VIRT
     select SERIAL
     select RISCV_ACLINT
     select RISCV_APLIC
+    select RISCV_IOMMU
     select RISCV_IMSIC
     select SIFIVE_PLIC
     select SIFIVE_TEST
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index 2f7ee81be3..ba9eebd605 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -10,5 +10,6 @@  riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
 riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
+riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
new file mode 100644
index 0000000000..df534b99b0
--- /dev/null
+++ b/hw/riscv/riscv-iommu.c
@@ -0,0 +1,1492 @@ 
+/*
+ * QEMU emulation of a RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2021-2023, Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_device.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/timer.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+#include "trace.h"
+
+#define LIMIT_CACHE_CTX               (1U << 7)
+#define LIMIT_CACHE_IOT               (1U << 20)
+
+/* Physical page number conversions */
+#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
+#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
+
+typedef struct RISCVIOMMUContext RISCVIOMMUContext;
+typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
+
+/* Device assigned I/O address space */
+struct RISCVIOMMUSpace {
+    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
+    AddressSpace iova_as;       /* IOVA address space for attached device */
+    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
+    uint32_t devid;             /* Requester identifier, AKA device_id */
+    bool notifier;              /* IOMMU unmap notifier enabled */
+    QLIST_ENTRY(RISCVIOMMUSpace) list;
+};
+
+/* Device translation context state. */
+struct RISCVIOMMUContext {
+    uint64_t devid:24;          /* Requester Id, AKA device_id */
+    uint64_t pasid:20;          /* Process Address Space ID */
+    uint64_t __rfu:20;          /* reserved */
+    uint64_t tc;                /* Translation Control */
+    uint64_t ta;                /* Translation Attributes */
+    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
+    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
+    uint64_t msiptp;            /* MSI redirection page table pointer */
+};
+
+/* IOMMU index for transactions without PASID specified. */
+#define RISCV_IOMMU_NOPASID 0
+
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
+{
+    const uint32_t ipsr =
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
+    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
+    if (s->notify && !(ipsr & (1 << vec))) {
+        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
+    }
+}
+
+static void riscv_iommu_fault(RISCVIOMMUState *s,
+                              struct riscv_iommu_fq_record *ev)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
+    uint32_t next = (tail + 1) & s->fq_mask;
+    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
+
+    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), ev->hdr, ev->iotval);
+
+    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
+        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                              RISCV_IOMMU_FQCSR_FQOF, 0);
+    } else {
+        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
+        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                                  RISCV_IOMMU_FQCSR_FQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
+    }
+}
+
+static void riscv_iommu_pri(RISCVIOMMUState *s,
+    struct riscv_iommu_pq_record *pr)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
+    uint32_t next = (tail + 1) & s->pq_mask;
+    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
+
+    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), pr->payload);
+
+    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
+        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                              RISCV_IOMMU_PQCSR_PQOF, 0);
+    } else {
+        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
+        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                                  RISCV_IOMMU_PQCSR_PQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
+    }
+}
+
+/* Portable implementation of pext_u64, bit-mask extraction. */
+static uint64_t _pext_u64(uint64_t val, uint64_t ext)
+{
+    uint64_t ret = 0;
+    uint64_t rot = 1;
+
+    while (ext) {
+        if (ext & 1) {
+            if (val & 1) {
+                ret |= rot;
+            }
+            rot <<= 1;
+        }
+        val >>= 1;
+        ext >>= 1;
+    }
+
+    return ret;
+}
+
+/* Check if GPA matches MSI/MRIF pattern. */
+static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    dma_addr_t gpa)
+{
+    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
+        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+        return false; /* Invalid MSI/MRIF mode */
+    }
+
+    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
+        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
+    }
+
+    return true;
+}
+
+/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
+static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    /* Early check for MSI address match when IOVA == GPA */
+    if (iotlb->perm & IOMMU_WO &&
+        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
+        iotlb->target_as = &s->trap_as;
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        return 0;
+    }
+
+    /* Exit early for pass-through mode. */
+    iotlb->translated_addr = iotlb->iova;
+    iotlb->addr_mask = ~TARGET_PAGE_MASK;
+    /* Allow R/W in pass-through mode */
+    iotlb->perm = IOMMU_RW;
+    return 0;
+}
+
+/* Redirect MSI write for given GPA. */
+static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
+    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
+    unsigned size, MemTxAttrs attrs)
+{
+    MemTxResult res;
+    dma_addr_t addr;
+    uint64_t intn;
+    uint32_t n190;
+    uint64_t pte[2];
+
+    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Interrupt File Number */
+    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
+    if (intn >= 256) {
+        /* Interrupt file number out of range */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* fetch MSI PTE */
+    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
+    addr = addr | (intn * sizeof(pte));
+    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
+            MEMTXATTRS_UNSPECIFIED);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    le64_to_cpus(&pte[0]);
+    le64_to_cpus(&pte[1]);
+
+    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
+    case RISCV_IOMMU_MSI_PTE_M_BASIC:
+        /* MSI Pass-through mode */
+        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
+        addr = addr | (gpa & TARGET_PAGE_MASK);
+
+        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                              gpa, addr);
+
+        return dma_memory_write(s->target_as, addr, &data, size, attrs);
+    case RISCV_IOMMU_MSI_PTE_M_MRIF:
+        /* MRIF mode, continue. */
+        break;
+    default:
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /*
+     * Report an error for interrupt identities exceeding the maximum allowed
+     * for an IMSIC interrupt file (2047), or if the destination address is not
+     * 32-bit aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
+     */
+    if ((data > 2047) || (gpa & 3)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* MSI MRIF mode, non atomic pending bit update */
+
+    /* MRIF pending bit address */
+    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
+    addr = addr | ((data & 0x7c0) >> 3);
+
+    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                          gpa, addr);
+
+    /* MRIF pending bit mask */
+    data = 1ULL << (data & 0x03f);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+    intn = intn | data;
+    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    /* Get MRIF enable bits */
+    addr = addr + sizeof(intn);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+    if (!(intn & data)) {
+        /* notification disabled, MRIF update completed. */
+        return MEMTX_OK;
+    }
+
+    /* Send notification message */
+    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
+    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
+          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
+
+    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    return MEMTX_OK;
+}
+
+/*
+ * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
+ *
+ * @s         : IOMMU Device State
+ * @ctx       : Device Translation Context with devid and pasid set.
+ * @return    : success or fault code.
+ */
+static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
+{
+    const uint64_t ddtp = s->ddtp;
+    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
+    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
+    struct riscv_iommu_dc dc;
+    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
+    const int dc_fmt = !s->enable_msi;
+    const size_t dc_len = sizeof(dc) >> dc_fmt;
+    unsigned depth;
+    uint64_t de;
+
+    switch (mode) {
+    case RISCV_IOMMU_DDTP_MODE_OFF:
+        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+
+    case RISCV_IOMMU_DDTP_MODE_BARE:
+        /* mock up pass-through translation context */
+        ctx->tc = RISCV_IOMMU_DC_TC_V;
+        ctx->ta = 0;
+        ctx->msiptp = 0;
+        return 0;
+
+    case RISCV_IOMMU_DDTP_MODE_1LVL:
+        depth = 0;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_2LVL:
+        depth = 1;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_3LVL:
+        depth = 2;
+        break;
+
+    default:
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /*
+     * Check supported device id width (in bits).
+     * See IOMMU Specification, Chapter 6. Software guidelines.
+     * - if extended device-context format is used:
+     *   1LVL: 6, 2LVL: 15, 3LVL: 24
+     * - if base device-context format is used:
+     *   1LVL: 7, 2LVL: 16, 3LVL: 24
+     */
+    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    /* Device directory tree walk */
+    for (; depth-- > 0; ) {
+        /*
+         * Select device id index bits based on device directory tree level
+         * and device context format.
+         * See IOMMU Specification, Chapter 2. Data Structures.
+         * - if extended device-context format is used:
+         *   device index: [23:15][14:6][5:0]
+         * - if base device-context format is used:
+         *   device index: [23:16][15:7][6:0]
+         */
+        const int split = depth * 9 + 6 + dc_fmt;
+        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+            /* invalid directory entry */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+            /* reserved bits set */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+    }
+
+    /* index into device context entry page */
+    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+    memset(&dc, 0, sizeof(dc));
+    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+    }
+
+    /* Set translation context. */
+    ctx->tc = le64_to_cpu(dc.tc);
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->msiptp = le64_to_cpu(dc.msiptp);
+    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
+            /* PASID is disabled */
+            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+        }
+        return 0;
+    }
+
+    /* FSC.TC.PDTV enabled */
+    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
+        /* Invalid PDTP.MODE */
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
+        /*
+         * Select process id index bits based on process directory tree
+         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
+         */
+        const int split = depth * 9 + 8;
+        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_PC_TA_V)) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
+    }
+
+    /* Leaf entry in PDT */
+    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
+    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+    }
+
+    /* Use FSC and TA from process directory entry. */
+    ctx->ta = le64_to_cpu(dc.ta);
+
+    return 0;
+}
+
+/* Translation Context cache support */
+static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
+{
+    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
+    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
+    return c1->devid == c2->devid && c1->pasid == c2->pasid;
+}
+
+static guint __ctx_hash(gconstpointer v)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
+    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
+    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
+}
+
+static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid &&
+        ctx->pasid == arg->pasid) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
+    uint32_t devid, uint32_t pasid)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .pasid = pasid,
+    };
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    g_hash_table_foreach(ctx_cache, func, &key);
+    g_hash_table_unref(ctx_cache);
+}
+
+/* Find or allocate translation context for a given {device_id, process_id} */
+static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
+    unsigned devid, unsigned pasid, void **ref)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext *ctx;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .pasid = pasid,
+    };
+
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    ctx = g_hash_table_lookup(ctx_cache, &key);
+
+    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
+        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                          g_free, NULL);
+        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
+    }
+
+    ctx = g_new0(RISCVIOMMUContext, 1);
+    ctx->devid = devid;
+    ctx->pasid = pasid;
+
+    int fault = riscv_iommu_ctx_fetch(s, ctx);
+    if (!fault) {
+        g_hash_table_add(ctx_cache, ctx);
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    g_hash_table_unref(ctx_cache);
+    *ref = NULL;
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
+        struct riscv_iommu_fq_record ev = { 0 };
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
+            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
+        riscv_iommu_fault(s, &ev);
+    }
+
+    g_free(ctx);
+    return NULL;
+}
+
+static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
+{
+    if (ref) {
+        g_hash_table_unref((GHashTable *)ref);
+    }
+}
+
+/* Find or allocate address space for a given device */
+static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
+{
+    RISCVIOMMUSpace *as;
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    if (as == NULL) {
+        char name[64];
+        as = g_new0(RISCVIOMMUSpace, 1);
+
+        as->iommu = s;
+        as->devid = devid;
+
+        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
+            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+        /* IOVA address space, untranslated addresses */
+        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
+            TYPE_RISCV_IOMMU_MEMORY_REGION,
+            OBJECT(as), name, UINT64_MAX);
+        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
+            TYPE_RISCV_IOMMU_PCI);
+
+        qemu_mutex_lock(&s->core_lock);
+        QLIST_INSERT_HEAD(&s->spaces, as, list);
+        qemu_mutex_unlock(&s->core_lock);
+
+        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
+                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+    }
+    return &as->iova_as;
+}
+
+static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    bool enable_faults;
+    bool enable_pasid;
+    bool enable_pri;
+    int fault;
+
+    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
+    /*
+     * TC[32] is reserved for custom extensions, used here to temporarily
+     * enable automatic page-request generation for ATS queries.
+     */
+    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
+    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
+
+    /* Translate using device directory / page table information. */
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
+
+    if (enable_pri && fault) {
+        struct riscv_iommu_pq_record pr = {0};
+        if (enable_pasid) {
+            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
+                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
+        }
+        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
+        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
+                     RISCV_IOMMU_PREQ_PAYLOAD_M;
+        riscv_iommu_pri(s, &pr);
+        return fault;
+    }
+
+    if (enable_faults && fault) {
+        struct riscv_iommu_fq_record ev;
+        unsigned ttype;
+
+        if (iotlb->perm & IOMMU_RW) {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
+        } else {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
+        }
+        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
+        ev.iotval    = iotlb->iova;
+        ev.iotval2   = iotlb->translated_addr;
+        ev._reserved = 0;
+        riscv_iommu_fault(s, &ev);
+        return fault;
+    }
+
+    return 0;
+}
+
+/* IOMMU Command Interface */
+static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
+    uint64_t addr, uint32_t data)
+{
+    /*
+     * ATS processing in this implementation of the IOMMU is synchronous,
+     * no need to wait for completions here.
+     */
+    if (!notify) {
+        return MEMTX_OK;
+    }
+
+    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
+        MEMTXATTRS_UNSPECIFIED);
+}
+
+static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
+{
+    uint64_t old_ddtp = s->ddtp;
+    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
+    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
+    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
+    bool ok = false;
+
+    /*
+     * Check for allowed DDTP.MODE transitions:
+     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
+     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
+     */
+    if (new_mode == old_mode ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
+        ok = true;
+    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
+        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
+    }
+
+    if (ok) {
+        /* clear reserved and busy bits, report back sanitized version */
+        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
+                             RISCV_IOMMU_DDTP_MODE, new_mode);
+    } else {
+        new_ddtp = old_ddtp;
+    }
+    s->ddtp = new_ddtp;
+
+    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
+}
+
+/* Command function and opcode field. */
+#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
+
+static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
+{
+    struct riscv_iommu_command cmd;
+    MemTxResult res;
+    dma_addr_t addr;
+    uint32_t tail, head, ctrl;
+    uint64_t cmd_opcode;
+    GHFunc func;
+
+    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
+    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
+
+    /* Check for pending error or queue processing disabled */
+    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
+        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
+        return;
+    }
+
+    while (tail != head) {
+        addr = s->cq_addr  + head * sizeof(cmd);
+        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
+                              MEMTXATTRS_UNSPECIFIED);
+
+        if (res != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                  RISCV_IOMMU_CQCSR_CQMF, 0);
+            goto fault;
+        }
+
+        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
+
+        cmd_opcode = get_field(cmd.dword0,
+                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
+
+        switch (cmd_opcode) {
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
+                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
+            res = riscv_iommu_iofence(s,
+                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
+
+            if (res != MEMTX_OK) {
+                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                      RISCV_IOMMU_CQCSR_CQMF, 0);
+                goto fault;
+            }
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
+                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
+                goto cmd_ill;
+            }
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* invalidate all device context cache mappings */
+                func = __ctx_inval_all;
+            } else {
+                /* invalidate all device context matching DID */
+                func = __ctx_inval_devid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* illegal command arguments IODIR_PDT & DV == 0 */
+                goto cmd_ill;
+            } else {
+                func = __ctx_inval_devid_pasid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
+            break;
+
+        default:
+        cmd_ill:
+            /* Invalid instruction, do not advance instruction index. */
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
+            goto fault;
+        }
+
+        /* Advance and update head pointer after command completes. */
+        head = (head + 1) & s->cq_mask;
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
+    }
+    return;
+
+fault:
+    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
+    }
+}
+
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
+        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
+        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
+        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
+            RISCV_IOMMU_FQCSR_FQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
+        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
+        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
+        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
+            RISCV_IOMMU_PQCSR_PQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
+}
+
+/* Core IOMMU execution activation */
+enum {
+    RISCV_IOMMU_EXEC_DDTP,
+    RISCV_IOMMU_EXEC_CQCSR,
+    RISCV_IOMMU_EXEC_CQT,
+    RISCV_IOMMU_EXEC_FQCSR,
+    RISCV_IOMMU_EXEC_FQH,
+    RISCV_IOMMU_EXEC_PQCSR,
+    RISCV_IOMMU_EXEC_PQH,
+    RISCV_IOMMU_EXEC_TR_REQUEST,
+    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
+    RISCV_IOMMU_EXEC_EXIT,
+};
+
+static void *riscv_iommu_core_proc(void* arg)
+{
+    RISCVIOMMUState *s = arg;
+    unsigned exec = 0;
+    unsigned mask = 0;
+
+    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
+        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
+        switch (exec & mask) {
+        case BIT(RISCV_IOMMU_EXEC_DDTP):
+            riscv_iommu_process_ddtp(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_CQCSR):
+            riscv_iommu_process_cq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_CQT):
+            riscv_iommu_process_cq_tail(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_FQCSR):
+            riscv_iommu_process_fq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_FQH):
+            /* NOP */
+            break;
+        case BIT(RISCV_IOMMU_EXEC_PQCSR):
+            riscv_iommu_process_pq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_PQH):
+            /* NOP */
+            break;
+        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
+            /* DBG support not implemented yet */
+            break;
+        }
+        exec &= ~mask;
+        if (!exec) {
+            qemu_mutex_lock(&s->core_lock);
+            exec = s->core_exec;
+            while (!exec) {
+                qemu_cond_wait(&s->core_cond, &s->core_lock);
+                exec = s->core_exec;
+            }
+            s->core_exec = 0;
+            qemu_mutex_unlock(&s->core_lock);
+        }
+    }
+
+    return NULL;
+}
+
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint32_t regb = addr & ~3;
+    uint32_t busy = 0;
+    uint32_t exec = 0;
+
+    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment or access size */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        /* Unsupported MMIO access location. */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Track actionable MMIO write. */
+    switch (regb) {
+    case RISCV_IOMMU_REG_DDTP:
+    case RISCV_IOMMU_REG_DDTP + 4:
+        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
+        regb = RISCV_IOMMU_REG_DDTP;
+        busy = RISCV_IOMMU_DDTP_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_CQT:
+        exec = BIT(RISCV_IOMMU_EXEC_CQT);
+        break;
+
+    case RISCV_IOMMU_REG_CQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
+        busy = RISCV_IOMMU_CQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_FQH:
+        exec = BIT(RISCV_IOMMU_EXEC_FQH);
+        break;
+
+    case RISCV_IOMMU_REG_FQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
+        busy = RISCV_IOMMU_FQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_PQH:
+        exec = BIT(RISCV_IOMMU_EXEC_PQH);
+        break;
+
+    case RISCV_IOMMU_REG_PQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
+        busy = RISCV_IOMMU_PQCSR_BUSY;
+        break;
+    }
+
+    /*
+     * Register updates might not be synchronized with the core logic.
+     * If system software updates a register while the relevant BUSY bit
+     * is set, the IOMMU behavior for such additional writes is UNSPECIFIED.
+     */
+
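+    /*
+     * Apply the write using the per-register masks: bits set in regs_ro
+     * keep their current value, bits set in regs_wc are cleared when the
+     * guest writes 1 to them, and the remaining bits take the new data.
+     */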
+    qemu_spin_lock(&s->regs_lock);
+    if (size == 1) {
+        uint8_t ro = s->regs_ro[addr];
+        uint8_t wc = s->regs_wc[addr];
+        uint8_t rw = s->regs_rw[addr];
+        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
+    } else if (size == 2) {
+        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
+        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
+        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
+        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 4) {
+        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
+        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
+        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
+        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 8) {
+        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
+        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
+        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
+        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    }
+
+    /* Set the BUSY flag in the tracked 32-bit control register. */
+    if (busy) {
+        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
+        stl_le_p(&s->regs_rw[regb], rw | busy);
+    }
+    qemu_spin_unlock(&s->regs_lock);
+
+    /* Wake up core processing thread. */
+    if (exec) {
+        qemu_mutex_lock(&s->core_lock);
+        s->core_exec |= exec;
+        qemu_cond_signal(&s->core_cond);
+        qemu_mutex_unlock(&s->core_lock);
+    }
+
+    return MEMTX_OK;
+}
+
+static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint64_t val = -1;
+    uint8_t *ptr;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment. */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    ptr = &s->regs_rw[addr];
+
+    if (size == 1) {
+        val = (uint64_t)*ptr;
+    } else if (size == 2) {
+        val = lduw_le_p(ptr);
+    } else if (size == 4) {
+        val = ldl_le_p(ptr);
+    } else if (size == 8) {
+        val = ldq_le_p(ptr);
+    } else {
+        return MEMTX_ERROR;
+    }
+
+    *data = val;
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps riscv_iommu_mmio_ops = {
+    .read_with_attrs = riscv_iommu_mmio_read,
+    .write_with_attrs = riscv_iommu_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    }
+};
+
+/*
+ * Translations matching the MSI pattern check are redirected to the
+ * "riscv-iommu-trap" memory region as untranslated addresses, for additional
+ * MSI/MRIF interception by the IOMMU interrupt remapping implementation.
+ * Note: device emulation code generating an MSI is expected to provide valid
+ * memory transaction attributes with requester_id set.
+ */
+static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
+    RISCVIOMMUContext *ctx;
+    MemTxResult res;
+    void *ref;
+    uint32_t devid = attrs.requester_id;
+
+    if (attrs.unspecified) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
+    if (ctx == NULL) {
+        res = MEMTX_ACCESS_ERROR;
+    } else {
+        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
+    }
+    riscv_iommu_ctx_put(s, ref);
+    return res;
+}
+
+static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    return MEMTX_ACCESS_ERROR;
+}
+
+static const MemoryRegionOps riscv_iommu_trap_ops = {
+    .read_with_attrs = riscv_iommu_trap_read,
+    .write_with_attrs = riscv_iommu_trap_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    }
+};
+
+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
+    if (s->enable_msi) {
+        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
+    }
+    /* Report QEMU target physical address space limits */
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
+                       TARGET_PHYS_ADDR_SPACE_BITS);
+
+    /* TODO: method to report supported PASID bits */
+    s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
+    s->cap |= RISCV_IOMMU_CAP_PD8;
+
+    /* Out-of-reset translation mode: OFF (DMA disabled) or BARE (passthrough) */
+    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
+
+    /* register storage */
+    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+    /* Mark all registers read-only */
+    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+    /*
+     * Register the complete MMIO space, including MSI/PBA registers.
+     * Note: the PCIDevice implementation will add an overlapping memory
+     * region for MSI/PBA, which it manages directly.
+     */
+    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+    /* Set power-on register state */
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
+        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
+        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
+        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
+        RISCV_IOMMU_CQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
+        RISCV_IOMMU_FQCSR_FQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
+        RISCV_IOMMU_FQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
+        RISCV_IOMMU_PQCSR_PQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
+        RISCV_IOMMU_PQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
+
+    /* Memory region for downstream access, if specified. */
+    if (s->target_mr) {
+        s->target_as = g_new0(AddressSpace, 1);
+        address_space_init(s->target_as, s->target_mr,
+            "riscv-iommu-downstream");
+    } else {
+        /* Fallback to global system memory. */
+        s->target_as = &address_space_memory;
+    }
+
+    /* Memory region for untranslated MRIF/MSI writes */
+    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
+            "riscv-iommu-trap", ~0ULL);
+    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
+
+    /* Device translation context cache */
+    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                         g_free, NULL);
+
+    s->iommus.le_next = NULL;
+    s->iommus.le_prev = NULL;
+    QLIST_INIT(&s->spaces);
+    qemu_cond_init(&s->core_cond);
+    qemu_mutex_init(&s->core_lock);
+    qemu_spin_init(&s->regs_lock);
+    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
+        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
+}
+
+static void riscv_iommu_unrealize(DeviceState *dev)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    qemu_mutex_lock(&s->core_lock);
+    /* cancel pending operations and stop */
+    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
+    qemu_cond_signal(&s->core_cond);
+    qemu_mutex_unlock(&s->core_lock);
+    qemu_thread_join(&s->core_proc);
+    qemu_cond_destroy(&s->core_cond);
+    qemu_mutex_destroy(&s->core_lock);
+    g_hash_table_unref(s->ctx_cache);
+}
+
+static Property riscv_iommu_properties[] = {
+    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
+        RISCV_IOMMU_SPEC_DOT_VER),
+    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
+    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
+    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
+    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
+        TYPE_MEMORY_REGION, MemoryRegion *),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void riscv_iommu_class_init(ObjectClass *klass, void* data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
+    dc->user_creatable = false;
+    dc->realize = riscv_iommu_realize;
+    dc->unrealize = riscv_iommu_unrealize;
+    device_class_set_props(dc, riscv_iommu_properties);
+}
+
+static const TypeInfo riscv_iommu_info = {
+    .name = TYPE_RISCV_IOMMU,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(RISCVIOMMUState),
+    .class_init = riscv_iommu_class_init,
+};
+
+static const char *IOMMU_FLAG_STR[] = {
+    "NA",
+    "RO",
+    "WR",
+    "RW",
+};
+
+/* RISC-V IOMMU Memory Region - Address Translation Space */
+static IOMMUTLBEntry riscv_iommu_memory_region_translate(
+    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
+    IOMMUAccessFlags flag, int iommu_idx)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    RISCVIOMMUContext *ctx;
+    void *ref;
+    IOMMUTLBEntry iotlb = {
+        .iova = addr,
+        .target_as = as->iommu->target_as,
+        .addr_mask = ~0ULL,
+        .perm = flag,
+    };
+
+    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
+    if (ctx == NULL) {
+        /* Translation disabled or invalid. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
+        /* Translation disabled or fault reported. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    }
+
+    /* Trace all dma translations with original access flags. */
+    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
+                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
+                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
+                          iotlb.translated_addr);
+
+    riscv_iommu_ctx_put(as->iommu, ref);
+
+    return iotlb;
+}
+
+static int riscv_iommu_memory_region_notify(
+    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
+    IOMMUNotifierFlag new, Error **errp)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+
+    if (old == IOMMU_NOTIFIER_NONE) {
+        as->notifier = true;
+        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
+    } else if (new == IOMMU_NOTIFIER_NONE) {
+        as->notifier = false;
+        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
+    }
+
+    return 0;
+}
+
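+/* PCI class 0x0806: base class 08h, subclass 06h (IOMMU) */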
+static inline bool pci_is_iommu(PCIDevice *pdev)
+{
+    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
+}
+
+static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
+{
+    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    AddressSpace *as = NULL;
+
+    if (pdev && pci_is_iommu(pdev)) {
+        return s->target_as;
+    }
+
+    /* Find first registered IOMMU device */
+    while (s->iommus.le_prev) {
+        s = *(s->iommus.le_prev);
+    }
+
+    /* Find first matching IOMMU */
+    while (s != NULL && as == NULL) {
+        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
+        s = s->iommus.le_next;
+    }
+
+    return as ? as : &address_space_memory;
+}
+
+static const PCIIOMMUOps riscv_iommu_ops = {
+    .get_address_space = riscv_iommu_find_as,
+};
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+        Error **errp)
+{
+    if (bus->iommu_ops &&
+        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
+        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
+        QLIST_INSERT_AFTER(last, iommu, iommus);
+    } else if (bus->iommu_ops == NULL) {
+        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
+    } else {
+        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
+            pci_bus_num(bus));
+    }
+}
+
+static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
+    MemTxAttrs attrs)
+{
+    return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
+}
+
+static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    return 1 << as->iommu->pasid_bits;
+}
+
+static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    imrc->translate = riscv_iommu_memory_region_translate;
+    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
+    imrc->attrs_to_index = riscv_iommu_memory_region_index;
+    imrc->num_indexes = riscv_iommu_memory_region_index_len;
+}
+
+static const TypeInfo riscv_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
+    .class_init = riscv_iommu_memory_region_init,
+};
+
+static void riscv_iommu_register_mr_types(void)
+{
+    type_register_static(&riscv_iommu_memory_region_info);
+    type_register_static(&riscv_iommu_info);
+}
+
+type_init(riscv_iommu_register_mr_types);
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
new file mode 100644
index 0000000000..6f740de690
--- /dev/null
+++ b/hw/riscv/riscv-iommu.h
@@ -0,0 +1,141 @@ 
+/*
+ * QEMU emulation of a RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_STATE_H
+#define HW_RISCV_IOMMU_STATE_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "hw/riscv/iommu.h"
+
+struct RISCVIOMMUState {
+    /*< private >*/
+    DeviceState parent_obj;
+
+    /*< public >*/
+    uint32_t version;     /* Reported interface version number */
+    uint32_t pasid_bits;  /* process identifier width */
+    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
+
+    uint64_t cap;         /* IOMMU supported capabilities */
+    uint64_t fctl;        /* IOMMU enabled features */
+
+    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
+    bool enable_msi;      /* Enable MSI remapping */
+
+    /* IOMMU Internal State */
+    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
+
+    dma_addr_t cq_addr;   /* Command queue base physical address */
+    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
+    dma_addr_t pq_addr;   /* Page request queue base physical address */
+
+    uint32_t cq_mask;     /* Command queue index bit mask */
+    uint32_t fq_mask;     /* Fault/event queue index bit mask */
+    uint32_t pq_mask;     /* Page request queue index bit mask */
+
+    /* interrupt notifier */
+    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
+
+    /* IOMMU State Machine */
+    QemuThread core_proc; /* Background processing thread */
+    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
+    QemuCond core_cond;   /* Background processing wake up signal */
+    unsigned core_exec;   /* Processing thread execution actions */
+
+    /* IOMMU target address space */
+    AddressSpace *target_as;
+    MemoryRegion *target_mr;
+
+    /* MSI / MRIF access trap */
+    AddressSpace trap_as;
+    MemoryRegion trap_mr;
+
+    GHashTable *ctx_cache;          /* Device translation Context Cache */
+
+    /* MMIO Hardware Interface */
+    MemoryRegion regs_mr;
+    QemuSpin regs_lock;
+    uint8_t *regs_rw;  /* register state (user write) */
+    uint8_t *regs_wc;  /* write-1-to-clear mask */
+    uint8_t *regs_ro;  /* read-only mask */
+
+    QLIST_ENTRY(RISCVIOMMUState) iommus;
+    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
+};
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+         Error **errp);
+
+/* private helpers */
+
+/* Register helper functions */
+static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set, uint32_t clr)
+{
+    uint32_t val;
+    qemu_spin_lock(&s->regs_lock);
+    val = ldl_le_p(s->regs_rw + idx);
+    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stl_le_p(s->regs_rw + idx, set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldl_le_p(s->regs_rw + idx);
+}
+
+static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set, uint64_t clr)
+{
+    uint64_t val;
+    qemu_spin_lock(&s->regs_lock);
+    val = ldq_le_p(s->regs_rw + idx);
+    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stq_le_p(s->regs_rw + idx, set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldq_le_p(s->regs_rw + idx);
+}
+
+#endif
diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
new file mode 100644
index 0000000000..42a97caffa
--- /dev/null
+++ b/hw/riscv/trace-events
@@ -0,0 +1,11 @@ 
+# See documentation at docs/devel/tracing.rst
+
+# riscv-iommu.c
+riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
+riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
+riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
+riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
+riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
+riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
new file mode 100644
index 0000000000..b88504b750
--- /dev/null
+++ b/hw/riscv/trace.h
@@ -0,0 +1,2 @@ 
+#include "trace/trace-hw_riscv.h"
+
diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
new file mode 100644
index 0000000000..403b365893
--- /dev/null
+++ b/include/hw/riscv/iommu.h
@@ -0,0 +1,36 @@ 
+/*
+ * QEMU emulation of a RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_H
+#define HW_RISCV_IOMMU_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#define TYPE_RISCV_IOMMU "riscv-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
+typedef struct RISCVIOMMUState RISCVIOMMUState;
+
+#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
+typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
+
+#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
+typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
+
+#endif
diff --git a/meson.build b/meson.build
index c59ca496f2..75e56f3282 100644
--- a/meson.build
+++ b/meson.build
@@ -3361,6 +3361,7 @@  if have_system
     'hw/rdma',
     'hw/rdma/vmw',
     'hw/rtc',
+    'hw/riscv',
     'hw/s390x',
     'hw/scsi',
     'hw/sd',