diff mbox series

[1/5] hw/riscv: Introduction of RISC-V IOMMU device

Message ID 16cf403dcaa44903c6148c5027b18bd9b6bff6a2.1689819031.git.tjeznach@rivosinc.com (mailing list archive)
State New, archived
Headers show
Series QEMU RISC-V IOMMU Support | expand

Commit Message

Tomasz Jeznach July 20, 2023, 2:32 a.m. UTC
The RISC-V IOMMU specification is now ratified as-per the RISC-V international
process [1]. The latest frozen specifcation can be found at:
https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

The patch add device emulation for RISC-V IOMMU which supports device and process
context lookups, command and fault queue interfaces, two stage address translation
logic with Sv32, Sv39, Sv48, Sv57 addressing modes, address translation cache,
MSI remapping with FLAT/MRIF modes, initial ATS and PRI interfaces, debug capabilities,
hardware performance counters. Platform and PCIe device instantiation is supported,
with wire-signaled and message-signaled interrupt capabilities.

Hardware interface definition file is shared with Linux kernel driver implementation,
available in the maintainer's branch riscv_iommu_v1 at https://github.com/tjeznach/linux.

Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 hw/riscv/Kconfig            |    3 +
 hw/riscv/meson.build        |    1 +
 hw/riscv/riscv-iommu-bits.h |  749 +++++++++++
 hw/riscv/riscv-iommu-pci.c  |  181 +++
 hw/riscv/riscv-iommu-sys.c  |  123 ++
 hw/riscv/riscv-iommu.c      | 2539 +++++++++++++++++++++++++++++++++++
 hw/riscv/riscv-iommu.h      |  152 +++
 hw/riscv/trace-events       |   14 +
 hw/riscv/trace.h            |    2 +
 include/hw/riscv/iommu.h    |   40 +
 meson.build                 |    1 +
 11 files changed, 3805 insertions(+)
 create mode 100644 hw/riscv/riscv-iommu-bits.h
 create mode 100644 hw/riscv/riscv-iommu-pci.c
 create mode 100644 hw/riscv/riscv-iommu-sys.c
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

Comments

Alistair Francis July 24, 2023, 2:32 a.m. UTC | #1
On Thu, Jul 20, 2023 at 12:34 PM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> The RISC-V IOMMU specification is now ratified as-per the RISC-V international
> process [1]. The latest frozen specifcation can be found at:
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

Exciting!

>
> The patch add device emulation for RISC-V IOMMU which supports device and process
> context lookups, command and fault queue interfaces, two stage address translation
> logic with Sv32, Sv39, Sv48, Sv57 addressing modes, address translation cache,
> MSI remapping with FLAT/MRIF modes, initial ATS and PRI interfaces, debug capabilities,
> hardware performance counters. Platform and PCIe device instantiation is supported,
> with wire-signaled and message-signaled interrupt capabilities.
>
> Hardware interface definition file is shared with Linux kernel driver implementation,
> available in the maintainer's branch riscv_iommu_v1 at https://github.com/tjeznach/linux.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  hw/riscv/Kconfig            |    3 +
>  hw/riscv/meson.build        |    1 +
>  hw/riscv/riscv-iommu-bits.h |  749 +++++++++++
>  hw/riscv/riscv-iommu-pci.c  |  181 +++
>  hw/riscv/riscv-iommu-sys.c  |  123 ++
>  hw/riscv/riscv-iommu.c      | 2539 +++++++++++++++++++++++++++++++++++
>  hw/riscv/riscv-iommu.h      |  152 +++
>  hw/riscv/trace-events       |   14 +
>  hw/riscv/trace.h            |    2 +
>  include/hw/riscv/iommu.h    |   40 +
>  meson.build                 |    1 +

This is a really long patch!

I think this should at least be split up to rougly each file (as long
as it compiles). For example the header files could be added in a
patch each. Which  would reduce some of the review burden.

>  11 files changed, 3805 insertions(+)
>  create mode 100644 hw/riscv/riscv-iommu-bits.h
>  create mode 100644 hw/riscv/riscv-iommu-pci.c
>  create mode 100644 hw/riscv/riscv-iommu-sys.c
>  create mode 100644 hw/riscv/riscv-iommu.c
>  create mode 100644 hw/riscv/riscv-iommu.h
>  create mode 100644 hw/riscv/trace-events
>  create mode 100644 hw/riscv/trace.h
>  create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index b6a5eb4452..617a509f1b 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>  config RISCV_NUMA
>      bool
>
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index 2f7ee81be3..e37c5d78e2 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>  riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c', 'riscv-iommu-pci.c', 'riscv-iommu-sys.c'))
>
>  hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> new file mode 100644
> index 0000000000..9ce713361f
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -0,0 +1,749 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2022-2023 Rivos Inc.
> + * Copyright © 2023 FORTH-ICS/CARV
> + * Copyright © 2023 RISC-V IOMMU Task Group
> + *
> + * RISC-V Ziommu - Register Layout and Data Structures.
> + *
> + * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
> + * Published at  https://github.com/riscv-non-isa/riscv-iommu
> + *
> + */
> +
> +#ifndef HW_RISCV_IOMMU_BITS_H
> +#define HW_RISCV_IOMMU_BITS_H
> +
> +/*
> + * This file is based on Linux RISC-V IOMMU file
> + * located at 'drivers/iommu/riscv/iommu-bits.h'
> + */
> +
> +#include "qemu/osdep.h"

This shouldn't be included in header files

> +
> +#define RISCV_IOMMU_SPEC_DOT_VER 0x010
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
> +#endif
> +
> +/*
> + * Chapter 5: Memory Mapped register interface
> + */
> +
> +/* Common field positions */
> +#define RISCV_IOMMU_PPN_FIELD           GENMASK_ULL(53, 10)
> +#define RISCV_IOMMU_QUEUE_LOGSZ_FIELD   GENMASK_ULL(4, 0)
> +#define RISCV_IOMMU_QUEUE_INDEX_FIELD   GENMASK_ULL(31, 0)
> +#define RISCV_IOMMU_QUEUE_ENABLE        BIT(0)
> +#define RISCV_IOMMU_QUEUE_INTR_ENABLE   BIT(1)
> +#define RISCV_IOMMU_QUEUE_MEM_FAULT     BIT(8)
> +#define RISCV_IOMMU_QUEUE_OVERFLOW      BIT(9)
> +#define RISCV_IOMMU_QUEUE_ACTIVE        BIT(16)
> +#define RISCV_IOMMU_QUEUE_BUSY          BIT(17)
> +#define RISCV_IOMMU_ATP_PPN_FIELD       GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_ATP_MODE_FIELD      GENMASK_ULL(63, 60)
> +
> +/* 5.3 IOMMU Capabilities (64bits) */
> +#define RISCV_IOMMU_REG_CAP             0x0000
> +#define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_S_SV32          BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_S_SV39          BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_S_SV48          BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_S_SV57          BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SVPBMT          BIT_ULL(15)
> +#define RISCV_IOMMU_CAP_G_SV32          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_G_SV39          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_G_SV48          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_G_SV57          BIT_ULL(19)
> +#define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
> +#define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
> +#define RISCV_IOMMU_CAP_AMO             BIT_ULL(24)
> +#define RISCV_IOMMU_CAP_ATS             BIT_ULL(25)
> +#define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> +#define RISCV_IOMMU_CAP_END             BIT_ULL(27)
> +#define RISCV_IOMMU_CAP_IGS             GENMASK_ULL(29, 28)
> +#define RISCV_IOMMU_CAP_HPM             BIT_ULL(30)
> +#define RISCV_IOMMU_CAP_DBG             BIT_ULL(31)
> +#define RISCV_IOMMU_CAP_PAS             GENMASK_ULL(37, 32)
> +#define RISCV_IOMMU_CAP_PD8             BIT_ULL(38)
> +#define RISCV_IOMMU_CAP_PD17            BIT_ULL(39)
> +#define RISCV_IOMMU_CAP_PD20            BIT_ULL(40)
> +
> +#define RISCV_IOMMU_CAP_VERSION_VER_MASK      0xF0
> +#define RISCV_IOMMU_CAP_VERSION_REV_MASK      0x0F
> +
> +/**
> + * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings
> + * @RISCV_IOMMU_CAP_IGS_MSI: I/O MMU supports only MSI generation
> + * @RISCV_IOMMU_CAP_IGS_WSI: I/O MMU supports only Wired-Signaled interrupt
> + * @RISCV_IOMMU_CAP_IGS_BOTH: I/O MMU supports both MSI and WSI generation
> + * @RISCV_IOMMU_CAP_IGS_RSRV: Reserved for standard use
> + */
> +enum riscv_iommu_igs_settings {
> +      RISCV_IOMMU_CAP_IGS_MSI  = 0,
> +      RISCV_IOMMU_CAP_IGS_WSI  = 1,
> +      RISCV_IOMMU_CAP_IGS_BOTH = 2,
> +      RISCV_IOMMU_CAP_IGS_RSRV = 3
> +};
> +
> +
> +/* 5.4 Features control register (32bits) */
> +#define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
> +#define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
> +
> +
> +/* 5.5 Device-directory-table pointer (64bits) */
> +#define RISCV_IOMMU_REG_DDTP            0x0010
> +#define RISCV_IOMMU_DDTP_MODE           GENMASK_ULL(3, 0)
> +#define RISCV_IOMMU_DDTP_BUSY           BIT_ULL(4)
> +#define RISCV_IOMMU_DDTP_PPN            RISCV_IOMMU_PPN_FIELD
> +
> +/**
> + * enum riscv_iommu_ddtp_modes - I/O MMU translation modes
> + * @RISCV_IOMMU_DDTP_MODE_OFF: No inbound transactions allowed
> + * @RISCV_IOMMU_DDTP_MODE_BARE: Pass-through mode
> + * @RISCV_IOMMU_DDTP_MODE_1LVL: One-level DDT
> + * @RISCV_IOMMU_DDTP_MODE_2LVL: Two-level DDT
> + * @RISCV_IOMMU_DDTP_MODE_3LVL: Three-level DDT
> + */
> +enum riscv_iommu_ddtp_modes {
> +      RISCV_IOMMU_DDTP_MODE_OFF = 0,
> +      RISCV_IOMMU_DDTP_MODE_BARE = 1,
> +      RISCV_IOMMU_DDTP_MODE_1LVL = 2,
> +      RISCV_IOMMU_DDTP_MODE_2LVL = 3,
> +      RISCV_IOMMU_DDTP_MODE_3LVL = 4,
> +      RISCV_IOMMU_DDTP_MODE_MAX = 4
> +};
> +
> +
> +/* 5.6 Command Queue Base (64bits) */
> +#define RISCV_IOMMU_REG_CQB             0x0018
> +#define RISCV_IOMMU_CQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> +#define RISCV_IOMMU_CQB_PPN             RISCV_IOMMU_PPN_FIELD
> +
> +/* 5.7 Command Queue head (32bits) */
> +#define RISCV_IOMMU_REG_CQH             0x0020
> +#define RISCV_IOMMU_CQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +/* 5.8 Command Queue tail (32bits) */
> +#define RISCV_IOMMU_REG_CQT             0x0024
> +#define RISCV_IOMMU_CQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +
> +/* 5.9 Fault Queue Base (64bits) */
> +#define RISCV_IOMMU_REG_FQB             0x0028
> +#define RISCV_IOMMU_FQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> +#define RISCV_IOMMU_FQB_PPN             RISCV_IOMMU_PPN_FIELD
> +
> +/* 5.10 Fault Queue Head (32bits) */
> +#define RISCV_IOMMU_REG_FQH             0x0030
> +#define RISCV_IOMMU_FQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +/* 5.11 Fault Queue tail (32bits) */
> +#define RISCV_IOMMU_REG_FQT             0x0034
> +#define RISCV_IOMMU_FQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +
> +/* 5.12 Page Request Queue base (64bits) */
> +#define RISCV_IOMMU_REG_PQB             0x0038
> +#define RISCV_IOMMU_PQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> +#define RISCV_IOMMU_PQB_PPN             RISCV_IOMMU_PPN_FIELD
> +
> +/* 5.13 Page Request Queue head (32bits) */
> +#define RISCV_IOMMU_REG_PQH             0x0040
> +#define RISCV_IOMMU_PQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +/* 5.14 Page Request Queue tail (32bits) */
> +#define RISCV_IOMMU_REG_PQT             0x0044
> +#define RISCV_IOMMU_PQT_INDEX_MASK      RISCV_IOMMU_QUEUE_INDEX_FIELD
> +
> +/* 5.15 Command Queue CSR (32bits) */
> +#define RISCV_IOMMU_REG_CQCSR           0x0048
> +#define RISCV_IOMMU_CQCSR_CQEN          RISCV_IOMMU_QUEUE_ENABLE
> +#define RISCV_IOMMU_CQCSR_CIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> +#define RISCV_IOMMU_CQCSR_CQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> +#define RISCV_IOMMU_CQCSR_CMD_TO        BIT(9)
> +#define RISCV_IOMMU_CQCSR_CMD_ILL       BIT(10)
> +#define RISCV_IOMMU_CQCSR_FENCE_W_IP    BIT(11)
> +#define RISCV_IOMMU_CQCSR_CQON          RISCV_IOMMU_QUEUE_ACTIVE
> +#define RISCV_IOMMU_CQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> +
> +
> +/* 5.16 Fault Queue CSR (32bits) */
> +#define RISCV_IOMMU_REG_FQCSR           0x004C
> +#define RISCV_IOMMU_FQCSR_FQEN          RISCV_IOMMU_QUEUE_ENABLE
> +#define RISCV_IOMMU_FQCSR_FIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> +#define RISCV_IOMMU_FQCSR_FQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> +#define RISCV_IOMMU_FQCSR_FQOF          RISCV_IOMMU_QUEUE_OVERFLOW
> +#define RISCV_IOMMU_FQCSR_FQON          RISCV_IOMMU_QUEUE_ACTIVE
> +#define RISCV_IOMMU_FQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> +
> +
> +/* 5.17 Page Request Queue CSR (32bits) */
> +#define RISCV_IOMMU_REG_PQCSR           0x0050
> +#define RISCV_IOMMU_PQCSR_PQEN          RISCV_IOMMU_QUEUE_ENABLE
> +#define RISCV_IOMMU_PQCSR_PIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> +#define RISCV_IOMMU_PQCSR_PQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> +#define RISCV_IOMMU_PQCSR_PQOF          RISCV_IOMMU_QUEUE_OVERFLOW
> +#define RISCV_IOMMU_PQCSR_PQON          RISCV_IOMMU_QUEUE_ACTIVE
> +#define RISCV_IOMMU_PQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> +
> +
> +/* 5.18 Interrupt Pending Status (32bits) */
> +#define RISCV_IOMMU_REG_IPSR            0x0054
> +
> +#define RISCV_IOMMU_INTR_CQ             0
> +#define RISCV_IOMMU_INTR_FQ             1
> +#define RISCV_IOMMU_INTR_PM             2
> +#define RISCV_IOMMU_INTR_PQ             3
> +#define RISCV_IOMMU_INTR_COUNT          4
> +
> +#define RISCV_IOMMU_IPSR_CIP            BIT(RISCV_IOMMU_INTR_CQ)
> +#define RISCV_IOMMU_IPSR_FIP            BIT(RISCV_IOMMU_INTR_FQ)
> +#define RISCV_IOMMU_IPSR_PMIP           BIT(RISCV_IOMMU_INTR_PM)
> +#define RISCV_IOMMU_IPSR_PIP            BIT(RISCV_IOMMU_INTR_PQ)
> +
> +#define RISCV_IOMMU_IOCOUNT_NUM         31
> +
> +/* 5.19 Performance monitoring counter overflow status (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTOVF      0x0058
> +#define RISCV_IOMMU_IOCOUNTOVF_CY       BIT(0)
> +#define RISCV_IOMMU_IOCOUNTOVF_HPM      GENMASK(31, 1)
> +
> +/* 5.20 Performance monitoring counter inhibits (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTINH      0x005C
> +#define RISCV_IOMMU_IOCOUNTINH_CY       BIT(0)
> +#define RISCV_IOMMU_IOCOUNTINH_HPM      GENMASK(31, 1)
> +
> +/* 5.21 Performance monitoring cycles counter (64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCYCLES     0x0060
> +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0)
> +#define RISCV_IOMMU_IOHPMCYCLES_OVF     BIT_ULL(63)
> +
> +/* 5.22 Performance monitoring event counters (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCTR_BASE   0x0068
> +#define RISCV_IOMMU_REG_IOHPMCTR(_n)    \
> +    (RISCV_IOMMU_REG_IOHPMCTR_BASE + (_n * 0x8))
> +
> +/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMEVT_BASE   0x0160
> +#define RISCV_IOMMU_REG_IOHPMEVT(_n)    \
> +    (RISCV_IOMMU_REG_IOHPMEVT_BASE + (_n * 0x8))
> +#define RISCV_IOMMU_IOHPMEVT_EVENT_ID   GENMASK_ULL(14, 0)
> +#define RISCV_IOMMU_IOHPMEVT_DMASK      BIT_ULL(15)
> +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID  GENMASK_ULL(35, 16)
> +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID  GENMASK_ULL(59, 36)
> +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV    BIT_ULL(60)
> +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV    BIT_ULL(61)
> +#define RISCV_IOMMU_IOHPMEVT_IDT        BIT_ULL(62)
> +#define RISCV_IOMMU_IOHPMEVT_OF         BIT_ULL(63)
> +
> +/**
> + * enum RISCV_IOMMU_HPMEVENT_id - Performance-monitoring event identifier
> + *
> + * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
> + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: S/VS-Stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_G_WALKS: G-Stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> + */
> +enum RISCV_IOMMU_HPMEVENT_id {
> +    RISCV_IOMMU_HPMEVENT_INVALID    = 0,
> +    RISCV_IOMMU_HPMEVENT_URQ        = 1,
> +    RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> +    RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> +    RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> +    RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> +    RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> +    RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> +    RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> +    RISCV_IOMMU_HPMEVENT_MAX        = 9
> +};
> +
> +/* 5.24 Translation request IOVA (64bits) */
> +#define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
> +#define RISCV_IOMMU_TR_REQ_IOVA_VPN     GENMASK_ULL(63, 12)
> +
> +/* 5.25 Translation request control (64bits) */
> +#define RISCV_IOMMU_REG_TR_REQ_CTL      0x0260
> +#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY  BIT_ULL(0)
> +#define RISCV_IOMMU_TR_REQ_CTL_PRIV     BIT_ULL(1)
> +#define RISCV_IOMMU_TR_REQ_CTL_EXE      BIT_ULL(2)
> +#define RISCV_IOMMU_TR_REQ_CTL_NW       BIT_ULL(3)
> +#define RISCV_IOMMU_TR_REQ_CTL_PID      GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_TR_REQ_CTL_PV       BIT_ULL(32)
> +#define RISCV_IOMMU_TR_REQ_CTL_DID      GENMASK_ULL(63, 40)
> +
> +/* 5.26 Translation request response (64bits) */
> +#define RISCV_IOMMU_REG_TR_RESPONSE     0x0268
> +#define RISCV_IOMMU_TR_RESPONSE_FAULT   BIT_ULL(0)
> +#define RISCV_IOMMU_TR_RESPONSE_PBMT    GENMASK_ULL(8, 7)
> +#define RISCV_IOMMU_TR_RESPONSE_SZ      BIT_ULL(9)
> +#define RISCV_IOMMU_TR_RESPONSE_PPN     RISCV_IOMMU_PPN_FIELD
> +
> +
> +/* 5.27 Interrupt cause to vector (64bits) */
> +#define RISCV_IOMMU_REG_IVEC            0x02F8
> +#define RISCV_IOMMU_IVEC_CIV            GENMASK_ULL(3, 0)
> +#define RISCV_IOMMU_IVEC_FIV            GENMASK_ULL(7, 4)
> +#define RISCV_IOMMU_IVEC_PMIV           GENMASK_ULL(11, 8)
> +#define RISCV_IOMMU_IVEC_PIV            GENMASK_ULL(15, 12)
> +
> +
> +/* 5.28 MSI Configuration table (32 * 64bits) */
> +#define RISCV_IOMMU_REG_MSI_CONFIG      0x0300
> +#define RISCV_IOMMU_REG_MSI_ADDR(_n)    \
> +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10))
> +#define RISCV_IOMMU_MSI_ADDR            GENMASK_ULL(55, 2)
> +#define RISCV_IOMMU_REG_MSI_DATA(_n)    \
> +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x08)
> +#define RISCV_IOMMU_MSI_DATA            GENMASK_ULL(31, 0)
> +#define RISCV_IOMMU_REG_MSI_VEC_CTL(_n) \
> +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x0C)
> +#define RISCV_IOMMU_MSI_VEC_CTL_M      BIT_ULL(0)
> +
> +
> +#define RISCV_IOMMU_REG_SIZE           0x1000
> +
> +/*
> + * Chapter 2: Data structures
> + */
> +
> +/*
> + * Device Directory Table macros for non-leaf nodes
> + */
> +#define RISCV_IOMMU_DDTE_VALID          BIT_ULL(0)
> +#define RISCV_IOMMU_DDTE_PPN            RISCV_IOMMU_PPN_FIELD
> +
> +/**
> + * struct riscv_iommu_dc - Device Context
> + * @tc: Translation Control
> + * @iohgatp: I/O Hypervisor guest address translation and protection
> + *           (Second stage context)
> + * @ta: Translation Attributes
> + * @fsc: First stage context
> + * @msiptpt: MSI page table pointer
> + * @msi_addr_mask: MSI address mask
> + * @msi_addr_pattern: MSI address pattern
> + *
> + * This structure is used for leaf nodes on the Device Directory Table,
> + * in case RISCV_IOMMU_CAP_MSI_FLAT is not set, the bottom 4 fields are
> + * not present and are skipped with pointer arithmetic to avoid
> + * casting, check out riscv_iommu_get_dc().
> + * See section 2.1 for more details
> + */
> +struct riscv_iommu_dc {
> +      uint64_t tc;
> +      uint64_t iohgatp;
> +      uint64_t ta;
> +      uint64_t fsc;
> +      uint64_t msiptp;
> +      uint64_t msi_addr_mask;
> +      uint64_t msi_addr_pattern;
> +      uint64_t _reserved;
> +};
> +
> +/* Translation control fields */
> +#define RISCV_IOMMU_DC_TC_V             BIT_ULL(0)
> +#define RISCV_IOMMU_DC_TC_EN_ATS        BIT_ULL(1)
> +#define RISCV_IOMMU_DC_TC_EN_PRI        BIT_ULL(2)
> +#define RISCV_IOMMU_DC_TC_T2GPA         BIT_ULL(3)
> +#define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
> +#define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
> +#define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
> +#define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
> +#define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
> +#define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> +
> +/* Second-stage (aka G-stage) context fields */
> +#define RISCV_IOMMU_DC_IOHGATP_PPN      RISCV_IOMMU_ATP_PPN_FIELD
> +#define RISCV_IOMMU_DC_IOHGATP_GSCID    GENMASK_ULL(59, 44)
> +#define RISCV_IOMMU_DC_IOHGATP_MODE     RISCV_IOMMU_ATP_MODE_FIELD
> +
> +/**
> + * enum riscv_iommu_dc_iohgatp_modes - Guest address
> + * translation/protection modes
> + *
> + * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> + *      No translation/protection
> + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> + *      Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1
> + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> + *      Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0
> + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> + *      Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0
> + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> + *      Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0
> + */
> +enum riscv_iommu_dc_iohgatp_modes {
> +      RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0,
> +      RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8,
> +      RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8,
> +      RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9,
> +      RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10
> +};
> +
> +/* Translation attributes fields */
> +#define RISCV_IOMMU_DC_TA_PSCID         GENMASK_ULL(31, 12)
> +
> +/* First-stage context fields */
> +#define RISCV_IOMMU_DC_FSC_PPN          RISCV_IOMMU_ATP_PPN_FIELD
> +#define RISCV_IOMMU_DC_FSC_MODE         RISCV_IOMMU_ATP_MODE_FIELD
> +
> +/**
> + * enum riscv_iommu_dc_fsc_atp_modes - First stage address
> + * translation/protection modes
> + *
> + * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection
> + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1
> + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0
> + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0
> + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0
> + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids
> + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids
> + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids
> + *
> + * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise.
> + * IOSATP controls the first stage address translation (same as the satp
> + * register on the RISC-V MMU), and PDTP holds the process directory table,
> + * used to select a first stage page table based on a process id (for devices
> + * that support multiple process ids).
> + */
> +enum riscv_iommu_dc_fsc_atp_modes {
> +      RISCV_IOMMU_DC_FSC_MODE_BARE = 0,
> +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8,
> +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8,
> +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9,
> +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10,
> +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1,
> +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2,
> +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3
> +};
> +
> +/* MSI page table pointer */
> +#define RISCV_IOMMU_DC_MSIPTP_PPN       RISCV_IOMMU_ATP_PPN_FIELD
> +#define RISCV_IOMMU_DC_MSIPTP_MODE      RISCV_IOMMU_ATP_MODE_FIELD
> +#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF  0
> +#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1
> +
> +/* MSI address mask */
> +#define RISCV_IOMMU_DC_MSI_ADDR_MASK    GENMASK_ULL(51, 0)
> +
> +/* MSI address pattern */
> +#define RISCV_IOMMU_DC_MSI_PATTERN      GENMASK_ULL(51, 0)
> +
> +
> +/**
> + * struct riscv_iommu_pc - Process Context
> + * @ta: Translation Attributes
> + * @fsc: First stage context
> + *
> + * This structure is used for leaf nodes on the Process Directory Table
> + * See section 2.3 for more details
> + */
> +struct riscv_iommu_pc {
> +      uint64_t ta;
> +      uint64_t fsc;
> +};
> +
> +/* Translation attributes fields */
> +#define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> +#define RISCV_IOMMU_PC_TA_ENS           BIT_ULL(1)
> +#define RISCV_IOMMU_PC_TA_SUM           BIT_ULL(2)
> +#define RISCV_IOMMU_PC_TA_PSCID         GENMASK_ULL(31, 12)
> +
> +/* First stage context fields */
> +#define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_PC_FSC_MODE         GENMASK_ULL(63, 60)
> +
> +
> +/*
> + * Chapter 3: In-memory queue interface
> + */
> +
> +/**
> + * struct riscv_iommu_cmd - Generic I/O MMU command structure
> + * @dword0: Includes the opcode and the function identifier
> + * @dword1: Opcode specific data
> + *
> + * The commands are interpreted as two 64bit fields, where the first
> + * 7bits of the first field are the opcode which also defines the
> + * command's format, followed by a 3bit field that specifies the
> + * function invoked by that command, and the rest is opcode-specific.
> + * This is a generic struct which will be populated differently
> + * according to each command. For more infos on the commands and
> + * the command queue check section 3.1.
> + */
> +struct riscv_iommu_command {
> +      uint64_t dword0;
> +      uint64_t dword1;
> +};
> +
> +/* Fields on dword0, common for all commands */
> +#define RISCV_IOMMU_CMD_OPCODE          GENMASK_ULL(6, 0)
> +#define RISCV_IOMMU_CMD_FUNC            GENMASK_ULL(9, 7)
> +
> +/* 3.1.1 I/O MMU Page-table cache invalidation */
> +/* Fields on dword0 */
> +#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE         1
> +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA       0
> +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA      1
> +#define RISCV_IOMMU_CMD_IOTINVAL_AV     BIT_ULL(10)
> +#define RISCV_IOMMU_CMD_IOTINVAL_PSCID  GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_CMD_IOTINVAL_PSCV   BIT_ULL(32)
> +#define RISCV_IOMMU_CMD_IOTINVAL_GV     BIT_ULL(33)
> +#define RISCV_IOMMU_CMD_IOTINVAL_GSCID  GENMASK_ULL(59, 44)
> +/* dword1 is the address, 4K-alligned and shifted to the right by two bits. */
> +
> +/* 3.1.2 I/O MMU Command Queue Fences */
> +/* Fields on dword0 */
> +#define RISCV_IOMMU_CMD_IOFENCE_OPCODE          2
> +#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C          0
> +#define RISCV_IOMMU_CMD_IOFENCE_AV      BIT_ULL(10)
> +#define RISCV_IOMMU_CMD_IOFENCE_WSI     BIT_ULL(11)
> +#define RISCV_IOMMU_CMD_IOFENCE_PR      BIT_ULL(12)
> +#define RISCV_IOMMU_CMD_IOFENCE_PW      BIT_ULL(13)
> +#define RISCV_IOMMU_CMD_IOFENCE_DATA    GENMASK_ULL(63, 32)
> +/* dword1 is the address, word-size alligned and shifted to the right by two bits. */
> +
> +/* 3.1.3 I/O MMU Directory cache invalidation */
> +/* Fields on dword0 */
> +#define RISCV_IOMMU_CMD_IODIR_OPCODE            3
> +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT    0
> +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT    1
> +#define RISCV_IOMMU_CMD_IODIR_PID       GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_CMD_IODIR_DV        BIT_ULL(33)
> +#define RISCV_IOMMU_CMD_IODIR_DID       GENMASK_ULL(63, 40)
> +/* dword1 is reserved for standard use */
> +
> +/* 3.1.4 I/O MMU PCIe ATS */
> +/* Fields on dword0 */
> +#define RISCV_IOMMU_CMD_ATS_OPCODE              4
> +#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL          0
> +#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR           1
> +#define RISCV_IOMMU_CMD_ATS_PID         GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_CMD_ATS_PV          BIT_ULL(32)
> +#define RISCV_IOMMU_CMD_ATS_DSV         BIT_ULL(33)
> +#define RISCV_IOMMU_CMD_ATS_RID         GENMASK_ULL(55, 40)
> +#define RISCV_IOMMU_CMD_ATS_DSEG        GENMASK_ULL(63, 56)
> +/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */
> +
> +/* ATS.INVAL payload*/
> +#define RISCV_IOMMU_CMD_ATS_INVAL_G     BIT_ULL(0)
> +/* Bits 1 - 10 are zeroed */
> +#define RISCV_IOMMU_CMD_ATS_INVAL_S     BIT_ULL(11)
> +#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12)
> +
> +/* ATS.PRGR payload */
> +/* Bits 0 - 31 are zeroed */
> +#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX      GENMASK_ULL(40, 32)
> +/* Bits 41 - 43 are zeroed */
> +#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE      GENMASK_ULL(47, 44)
> +#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID         GENMASK_ULL(63, 48)
> +
> +
> +/**
> + * struct riscv_iommu_fq_record - Fault/Event Queue Record
> + * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc
> + * @_reserved: Low 32bits for custom use, high 32bits for standard use
> + * @iotval: Transaction-type/cause specific format
> + * @iotval2: Cause specific format
> + *
> + * The fault/event queue reports events and failures raised when
> + * processing transactions. Each record is a 32byte structure where
> + * the first dword has a fixed format for providing generic infos
> + * regarding the fault/event, and two more dwords are there for
> + * fault/event-specific information. For more details see section
> + * 3.2.
> + */
> +struct riscv_iommu_fq_record {
> +      uint64_t hdr;
> +      uint64_t _reserved;
> +      uint64_t iotval;
> +      uint64_t iotval2;
> +};
> +
> +/* Fields on header */
> +#define RISCV_IOMMU_FQ_HDR_CAUSE        GENMASK_ULL(11, 0)
> +#define RISCV_IOMMU_FQ_HDR_PID          GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_FQ_HDR_PV           BIT_ULL(32)
> +#define RISCV_IOMMU_FQ_HDR_PRIV         BIT_ULL(33)
> +#define RISCV_IOMMU_FQ_HDR_TTYPE        GENMASK_ULL(39, 34)
> +#define RISCV_IOMMU_FQ_HDR_DID          GENMASK_ULL(63, 40)
> +
> +/**
> + * enum riscv_iommu_fq_causes - Fault/event cause values
> + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault
> + * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned
> + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault
> + * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned
> + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault
> + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault
> + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault
> + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault
> + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault
> + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault
> + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault
> + * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed
> + * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault
> + * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid
> + * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured
> + * @RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED: Transaction type disallowed
> + * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault
> + * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid
> + * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured
> + * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault
> + * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault
> + * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid
> + * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured
> + * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption
> + * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption
> + * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption
> + * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption
> + * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error
> + * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault
> + * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption
> + *
> + * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for
> + * standard use, and 2048 - 4095 for custom use.
> + */
> +enum riscv_iommu_fq_causes {
> +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT           = 1,
> +      RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED   = 4,
> +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT             = 5,
> +      RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED   = 6,
> +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT             = 7,
> +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S         = 12,
> +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S           = 13,
> +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S           = 15,
> +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS        = 20,
> +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS          = 21,
> +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS          = 23,
> +      RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED         = 256,
> +      RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT       = 257,
> +      RISCV_IOMMU_FQ_CAUSE_DDT_INVALID          = 258,
> +      RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED    = 259,
> +      RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED        = 260,
> +      RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT       = 261,
> +      RISCV_IOMMU_FQ_CAUSE_MSI_INVALID          = 262,
> +      RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED    = 263,
> +      RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT           = 264,
> +      RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT       = 265,
> +      RISCV_IOMMU_FQ_CAUSE_PDT_INVALID          = 266,
> +      RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED    = 267,
> +      RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED        = 268,
> +      RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED        = 269,
> +      RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED     = 270,
> +      RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED      = 271,
> +      RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR    = 272,
> +      RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT         = 273,
> +      RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED         = 274
> +};
> +
> +/**
> + * enum riscv_iommu_fq_ttypes: Fault/event transaction types
> + * @RISCV_IOMMU_FQ_TTYPE_NONE: None. Fault not caused by an inbound transaction.
> + * @RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH: Instruction fetch from untranslated address
> + * @RISCV_IOMMU_FQ_TTYPE_UADDR_RD: Read from untranslated address
> + * @RISCV_IOMMU_FQ_TTYPE_UADDR_WR: Write/AMO to untranslated address
> + * @RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH: Instruction fetch from translated address
> + * @RISCV_IOMMU_FQ_TTYPE_TADDR_RD: Read from translated address
> + * @RISCV_IOMMU_FQ_TTYPE_TADDR_WR: Write/AMO to translated address
> + * @RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ: PCIe ATS translation request
> + * @RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ: PCIe message request
> + *
> + * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for
> + * standard use and 31 - 63 for custom use.
> + */
> +enum riscv_iommu_fq_ttypes {
> +      RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> +      RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH = 1,
> +      RISCV_IOMMU_FQ_TTYPE_UADDR_RD = 2,
> +      RISCV_IOMMU_FQ_TTYPE_UADDR_WR = 3,
> +      RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH = 5,
> +      RISCV_IOMMU_FQ_TTYPE_TADDR_RD = 6,
> +      RISCV_IOMMU_FQ_TTYPE_TADDR_WR = 7,
> +      RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ = 8,
> +      RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ = 9,
> +};
> +
> +
> +/**
> + * struct riscv_iommu_pq_record - PCIe Page Request record
> + * @hdr: Header, includes PID, DID etc
> + * @payload: Holds the page address, request group and permission bits
> + *
> + * For more infos on the PCIe Page Request queue see chapter 3.3.
> + */
> +struct riscv_iommu_pq_record {
> +      uint64_t hdr;
> +      uint64_t payload;
> +};
> +
> +/* Header fields */
> +#define RISCV_IOMMU_PREQ_HDR_PID        GENMASK_ULL(31, 12)
> +#define RISCV_IOMMU_PREQ_HDR_PV         BIT_ULL(32)
> +#define RISCV_IOMMU_PREQ_HDR_PRIV       BIT_ULL(33)
> +#define RISCV_IOMMU_PREQ_HDR_EXEC       BIT_ULL(34)
> +#define RISCV_IOMMU_PREQ_HDR_DID        GENMASK_ULL(63, 40)
> +
> +/* Payload fields */
> +#define RISCV_IOMMU_PREQ_PAYLOAD_R      BIT_ULL(0)
> +#define RISCV_IOMMU_PREQ_PAYLOAD_W      BIT_ULL(1)
> +#define RISCV_IOMMU_PREQ_PAYLOAD_L      BIT_ULL(2)
> +#define RISCV_IOMMU_PREQ_PAYLOAD_M      GENMASK_ULL(2, 0)
> +#define RISCV_IOMMU_PREQ_PRG_INDEX      GENMASK_ULL(11, 3)
> +#define RISCV_IOMMU_PREQ_UADDR          GENMASK_ULL(63, 12)
> +
> +
> +/**
> + * struct riscv_iommu_msi_pte - MSI Page Table Entry
> + * @pte: MSI PTE
> + * @mrif_info: Memory-resident interrupt file info
> + *
> + * The MSI Page Table is used for virtualizing MSIs, so that when
> + * a device sends an MSI to a guest, the IOMMU can reroute it
> + * by translating the MSI address, either to a guest interrupt file
> + * or a memory resident interrupt file (MRIF). Note that this page table
> + * is an array of MSI PTEs, not a multi-level pt, each entry
> + * is a leaf entry. For more infos check out the the AIA spec, chapter 9.5.
> + *
> + * Also in basic mode the mrif_info field is ignored by the IOMMU and can
> + * be used by software, any other reserved fields on pte must be zeroed-out
> + * by software.
> + */
> +struct riscv_iommu_msi_pte {
> +      uint64_t pte;
> +      uint64_t mrif_info;
> +};
> +
> +/* Fields on pte */
> +#define RISCV_IOMMU_MSI_PTE_V           BIT_ULL(0)
> +#define RISCV_IOMMU_MSI_PTE_M           GENMASK_ULL(2, 1)
> +
> +#define RISCV_IOMMU_MSI_PTE_M_MRIF      1
> +#define RISCV_IOMMU_MSI_PTE_M_BASIC     3
> +
> +/* When M == 1 (MRIF mode) */
> +#define RISCV_IOMMU_MSI_PTE_MRIF_ADDR   GENMASK_ULL(53, 7)
> +/* When M == 3 (basic mode) */
> +#define RISCV_IOMMU_MSI_PTE_PPN         RISCV_IOMMU_PPN_FIELD
> +#define RISCV_IOMMU_MSI_PTE_C           BIT_ULL(63)
> +
> +/* Fields on mrif_info */
> +#define RISCV_IOMMU_MSI_MRIF_NID        GENMASK_ULL(9, 0)
> +#define RISCV_IOMMU_MSI_MRIF_NPPN       RISCV_IOMMU_PPN_FIELD
> +#define RISCV_IOMMU_MSI_MRIF_NID_MSB    BIT_ULL(60)
> +
> +
> +#endif /* _RISCV_IOMMU_BITS_H_ */
> diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c
> new file mode 100644
> index 0000000000..e205f806d6
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu-pci.c
> @@ -0,0 +1,181 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qemu/host-utils.h"
> +#include "qom/object.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +
> +#ifndef PCI_VENDOR_ID_RIVOS
> +#define PCI_VENDOR_ID_RIVOS           0x1efd
> +#endif
> +
> +#ifndef PCI_DEVICE_ID_RIVOS_IOMMU
> +#define PCI_DEVICE_ID_RIVOS_IOMMU     0xedf1
> +#endif

The file is the RISC-V IOMMU, but don't these IDs say Rivos IOMMU?

> +
> +/* RISC-V IOMMU PCI Device Emulation */
> +
> +typedef struct RISCVIOMMUStatePci {
> +    PCIDevice        pci;     /* Parent PCIe device state */
> +    MemoryRegion     bar0;    /* PCI BAR (including MSI-x config) */
> +    RISCVIOMMUState  iommu;   /* common IOMMU state */
> +} RISCVIOMMUStatePci;
> +
> +/* interrupt delivery callback */
> +static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector)
> +{
> +    RISCVIOMMUStatePci *s = container_of(iommu, RISCVIOMMUStatePci, iommu);
> +
> +    if (msix_enabled(&(s->pci))) {
> +        msix_notify(&(s->pci), vector);
> +    }
> +}
> +
> +static void riscv_iommu_pci_realize(PCIDevice *dev, Error **errp)
> +{
> +    RISCVIOMMUStatePci *s = DO_UPCAST(RISCVIOMMUStatePci, pci, dev);
> +    RISCVIOMMUState *iommu = &s->iommu;
> +    uint64_t cap = iommu->cap;
> +    Error *err = NULL;
> +
> +    /* Set device id for trace / debug */
> +    DEVICE(iommu)->id = g_strdup_printf("%02x:%02x.%01x",
> +        pci_dev_bus_num(dev), PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
> +
> +    /* Support MSI only */
> +    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_MSI);
> +    qdev_prop_set_uint64(DEVICE(dev), "capabilities", cap);
> +
> +    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
> +        return;
> +    }
> +
> +    memory_region_init(&s->bar0, OBJECT(s), "riscv-iommu-bar0",
> +        QEMU_ALIGN_UP(memory_region_size(&iommu->regs_mr), TARGET_PAGE_SIZE));
> +    memory_region_add_subregion(&s->bar0, 0, &iommu->regs_mr);
> +
> +    pcie_endpoint_cap_init(dev, 0);
> +
> +    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
> +                     PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0);
> +
> +    int ret = msix_init(dev, RISCV_IOMMU_INTR_COUNT,
> +                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG,
> +                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG + 256, 0, &err);
> +
> +    if (ret == -ENOTSUP) {
> +        /*
> +         * MSI-x is not supported by the platform.
> +         * Driver should use timer/polling based notification handlers.
> +         */
> +        warn_report_err(err);
> +    } else if (ret < 0) {
> +        error_propagate(errp, err);
> +        return;
> +    } else {
> +        /* mark all allocated MSIx vectors as used. */
> +        msix_vector_use(dev, RISCV_IOMMU_INTR_CQ);
> +        msix_vector_use(dev, RISCV_IOMMU_INTR_FQ);
> +        msix_vector_use(dev, RISCV_IOMMU_INTR_PM);
> +        msix_vector_use(dev, RISCV_IOMMU_INTR_PQ);
> +        iommu->notify = riscv_iommu_pci_notify;
> +    }
> +
> +    PCIBus *bus = pci_device_root_bus(dev);
> +    if (!bus) {
> +        error_setg(errp, "can't find PCIe root port for %02x:%02x.%x",
> +            pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn),
> +            PCI_FUNC(dev->devfn));
> +        return;
> +    }
> +
> +    riscv_iommu_pci_setup_iommu(iommu, bus, errp);
> +}
> +
> +static void riscv_iommu_pci_exit(PCIDevice *pci_dev)
> +{
> +    pci_setup_iommu(pci_device_root_bus(pci_dev), NULL, NULL);
> +}
> +
> +static const VMStateDescription riscv_iommu_vmstate = {
> +    .name = "riscv-iommu",
> +    .unmigratable = 1
> +};
> +
> +static void riscv_iommu_pci_init(Object *obj)
> +{
> +    RISCVIOMMUStatePci *s = RISCV_IOMMU_PCI(obj);
> +    RISCVIOMMUState *iommu = &s->iommu;
> +
> +    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
> +    qdev_alias_all_properties(DEVICE(iommu), obj);
> +}
> +
> +static Property riscv_iommu_pci_properties[] = {
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_pci_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> +
> +    k->realize = riscv_iommu_pci_realize;
> +    k->exit = riscv_iommu_pci_exit;
> +    k->vendor_id = PCI_VENDOR_ID_RIVOS;
> +    k->device_id = PCI_DEVICE_ID_RIVOS_IOMMU;
> +    k->revision = 0;
> +    k->class_id = 0x0806;
> +    dc->desc = "RISCV-IOMMU DMA Remapping device";
> +    dc->vmsd = &riscv_iommu_vmstate;
> +    dc->hotpluggable = false;
> +    dc->user_creatable = true;
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +    device_class_set_props(dc, riscv_iommu_pci_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_pci = {
> +    .name = TYPE_RISCV_IOMMU_PCI,
> +    .parent = TYPE_PCI_DEVICE,
> +    .class_init = riscv_iommu_pci_class_init,
> +    .instance_init = riscv_iommu_pci_init,
> +    .instance_size = sizeof(RISCVIOMMUStatePci),
> +    .interfaces = (InterfaceInfo[]) {
> +        { INTERFACE_PCIE_DEVICE },
> +        { },
> +    },
> +};
> +
> +static void riscv_iommu_register_pci_types(void)
> +{
> +    type_register_static(&riscv_iommu_pci);
> +}

The PCIe device should be a seperate patch

> +
> +type_init(riscv_iommu_register_pci_types);
> diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c
> new file mode 100644
> index 0000000000..7148588b59
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu-sys.c
> @@ -0,0 +1,123 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu) - Platform Device
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/irq.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/sysbus.h"
> +#include "qapi/error.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qemu/host-utils.h"
> +#include "qemu/module.h"
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +
> +/* RISC-V IOMMU System Platform Device Emulation */
> +
> +struct RISCVIOMMUStateSys {
> +    SysBusDevice     parent;
> +    uint64_t         addr;
> +    qemu_irq         irqs[4];
> +    RISCVIOMMUState  iommu;
> +};
> +
> +/* interrupt delivery callback */
> +static void riscv_iommu_sys_notify(RISCVIOMMUState *iommu, unsigned vector)
> +{
> +    RISCVIOMMUStateSys *s = container_of(iommu, RISCVIOMMUStateSys, iommu);
> +
> +    if (vector < RISCV_IOMMU_INTR_COUNT && s->irqs[vector]) {
> +        qemu_irq_pulse(s->irqs[vector]);
> +    }
> +}
> +
> +static void riscv_iommu_sys_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(dev);
> +    RISCVIOMMUState *iommu = &s->iommu;
> +    PCIBus *pci_bus;
> +    uint64_t cap = iommu->cap;
> +    int i;
> +
> +    /* Support WSI only */
> +    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_WSI);
> +    qdev_prop_set_uint64(dev, "capabilities", cap);
> +
> +    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
> +        return;
> +    }
> +
> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &iommu->regs_mr);
> +    if (s->addr) {
> +        sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, s->addr);
> +    }
> +
> +    for (i = 0; i < RISCV_IOMMU_INTR_COUNT; i++) {
> +        sysbus_init_irq(&s->parent, &s->irqs[i]);
> +    }
> +
> +    iommu->notify = riscv_iommu_sys_notify;
> +
> +    pci_bus = (PCIBus *) object_resolve_path_type("", TYPE_PCI_BUS, NULL);
> +    if (pci_bus) {
> +        riscv_iommu_pci_setup_iommu(iommu, pci_bus, errp);
> +    }
> +}
> +
> +static void riscv_iommu_sys_init(Object *obj)
> +{
> +    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(obj);
> +    RISCVIOMMUState *iommu = &s->iommu;
> +
> +    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
> +    qdev_alias_all_properties(DEVICE(iommu), obj);
> +}
> +
> +static Property riscv_iommu_sys_properties[] = {
> +    DEFINE_PROP_UINT64("addr", RISCVIOMMUStateSys, addr, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_sys_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    dc->realize = riscv_iommu_sys_realize;
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +    device_class_set_props(dc, riscv_iommu_sys_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_sys = {
> +    .name          = TYPE_RISCV_IOMMU_SYS,
> +    .parent        = TYPE_SYS_BUS_DEVICE,
> +    .class_init    = riscv_iommu_sys_class_init,
> +    .instance_init = riscv_iommu_sys_init,
> +    .instance_size = sizeof(RISCVIOMMUStateSys),
> +};
> +
> +static void riscv_iommu_register_sys(void)
> +{
> +    type_register_static(&riscv_iommu_sys);
> +}

Same here

> +
> +type_init(riscv_iommu_register_sys)
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..fd271b2988
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,2539 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number coversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t pasid:20;          /* Process Address Space ID */
> +    uint64_t __rfu:20;          /* reserved */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* Address translation cache entry */
> +struct RISCVIOMMUEntry {
> +    uint64_t iova:44;           /* IOVA Page Number */
> +    uint64_t pscid:20;          /* Process Soft-Context identifier */
> +    uint64_t phys:44;           /* Physical Page Number */
> +    uint64_t gscid:16;          /* Guest Soft-Context identifier */
> +    uint64_t perm:2;            /* IOMMU_RW flags */
> +    uint64_t __rfu:2;
> +};
> +
> +/* IOMMU index for transactions without PASID specified. */
> +#define RISCV_IOMMU_NOPASID 0
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t ipsr =
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
> +    if (s->notify && !(ipsr & (1 << vec))) {
> +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
> +    }
> +}
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s, struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +static void __hpm_incr_ctr(RISCVIOMMUState *s, uint32_t ctr_idx)
> +{
> +    const uint32_t off = ctr_idx << 3;
> +    uint64_t cntr_val;
> +
> +    qemu_spin_lock(&s->regs_lock);
> +    cntr_val = ldq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off]);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off], cntr_val + 1);
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    /* Handle the overflow scenario. */
> +    if (cntr_val == UINT64_MAX) {
> +        /*
> +         * Generate interrupt only if OF bit is clear. +1 to offset the cycle
> +         * register OF bit.
> +         */
> +        const uint32_t ovf =
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, BIT(ctr_idx + 1), 0);
> +        if (!get_field(ovf, BIT(ctr_idx + 1))) {
> +            riscv_iommu_reg_mod64(s,
> +                                  RISCV_IOMMU_REG_IOHPMEVT_BASE + off,
> +                                  RISCV_IOMMU_IOHPMEVT_OF,
> +                                  0);
> +            riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
> +        }
> +    }
> +}
> +
> +static void riscv_iommu_hpm_incr_ctr(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    unsigned event_id)
> +{
> +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> +    uint32_t did_gscid;
> +    uint32_t pid_pscid;
> +    uint32_t ctr_idx;
> +    gpointer value;
> +    uint32_t ctrs;
> +    uint64_t evt;
> +
> +    if (!(s->cap & RISCV_IOMMU_CAP_HPM)) {
> +        return;
> +    }
> +
> +    pthread_rwlock_rdlock(&s->ht_lock);
> +    value = g_hash_table_lookup(s->hpm_event_ctr_map,
> +                                GUINT_TO_POINTER(event_id));
> +    if (value == NULL) {
> +        pthread_rwlock_unlock(&s->ht_lock);
> +        return;
> +    }
> +
> +    for (ctrs = GPOINTER_TO_UINT(value); ctrs != 0; ctrs &= ctrs - 1) {
> +        ctr_idx = ctz32(ctrs);
> +        if (get_field(inhibit, BIT(ctr_idx + 1))) {
> +            continue;
> +        }
> +
> +        evt = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMEVT_BASE + (ctr_idx << 3));
> +
> +        /*
> +         * It's quite possible that event ID has been changed in counter
> +         * but hashtable hasn't been updated yet. We don't want to increment
> +         * counter for the old event ID.
> +         */
> +        if (event_id != get_field(evt, RISCV_IOMMU_IOHPMEVT_EVENT_ID)) {
> +            continue;
> +        }
> +
> +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_IDT)) {
> +            did_gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
> +            pid_pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
> +        } else {
> +            did_gscid = ctx->devid;
> +            pid_pscid = ctx->pasid;
> +        }
> +
> +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_PV_PSCV)) {
> +            /*
> +             * If the transaction does not have a valid process_id, counter
> +             * increments if device_id matches DID_GSCID. If the transaction has
> +             * a valid process_id, counter increments if device_id matches
> +             * DID_GSCID and process_id matches PID_PSCID. See IOMMU
> +             * Specification, Chapter 5.23. Performance-monitoring event
> +             * selector.
> +             */
> +            if (ctx->pasid &&
> +                get_field(evt, RISCV_IOMMU_IOHPMEVT_PID_PSCID) != pid_pscid) {
> +                continue;
> +            }
> +        }
> +
> +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DV_GSCV)) {
> +            uint32_t mask = ~0;
> +
> +            if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DMASK)) {
> +                /*
> +                 * 1001 1011   mask = GSCID
> +                 * 0000 0111   mask = mask ^ (mask + 1)
> +                 * 1111 1000   mask = ~mask;
> +                 */
> +                mask = get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> +                mask = mask ^ (mask + 1);
> +                mask = ~mask;
> +            }
> +
> +            if ((get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID) & mask) !=
> +                (did_gscid & mask)) {
> +                continue;
> +            }
> +        }
> +
> +        __hpm_incr_ctr(s, ctr_idx);
> +    }
> +
> +    pthread_rwlock_unlock(&s->ht_lock);
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @gpa      : provided IOVA is a guest physical address, use G-Stage only.
> + * @return   : success or fault cause code.
> + */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb, bool gpa)
> +{
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if (!en_s && (iotlb->perm & IOMMU_WO) &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                if (!(s->cap &
> +                    (pass ? RISCV_IOMMU_CAP_G_SV32 : RISCV_IOMMU_CAP_S_SV32))) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                if (!(s->cap &
> +                    (pass ? RISCV_IOMMU_CAP_G_SV39 : RISCV_IOMMU_CAP_S_SV39))) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                if (!(s->cap &
> +                    (pass ? RISCV_IOMMU_CAP_G_SV48 : RISCV_IOMMU_CAP_S_SV48))) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                if (!(s->cap &
> +                    (pass ? RISCV_IOMMU_CAP_G_SV57 : RISCV_IOMMU_CAP_S_SV57))) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (pass == S_STAGE) {
> +            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_S_VS_WALKS);
> +        } else {
> +            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_G_WALKS);
> +        }
> +
> +        if (sc[pass].ptesize == 4) {
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +
> +    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        return dma_memory_write(s->target_as, addr, &data, size, attrs);
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    /* Get MRIF enable bits */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        return res;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * Device Context format.
> + *
> + * @s         : IOMMU Device State
> + * @return    : 0: extended (64 bytes) | 1: base (32 bytes)
> + */
> +static int riscv_iommu_dc_is_base(RISCVIOMMUState *s)
> +{
> +    return !(s->cap & RISCV_IOMMU_CAP_MSI_FLAT);
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and pasid set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    const int dc_fmt = riscv_iommu_dc_is_base(s);
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> +        ctx->tc = RISCV_IOMMU_DC_TC_EN_ATS | RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
> +
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* invalid directory entry */
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* reserved bits set. */
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
> +            /* PASID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        return 0;
> +    }
> +
> +    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default PASID enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default PASID #0 */
> +            ctx->pasid = 0;
> +        }
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
> +
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> +    return c1->devid == c2->devid && c1->pasid == c2->pasid;
> +}
> +
> +static guint __ctx_hash(gconstpointer v)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> +    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
> +    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
> +}
> +
> +static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid &&
> +        ctx->pasid == arg->pasid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +    uint32_t devid, uint32_t pasid)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    g_hash_table_foreach(ctx_cache, func, &key);
> +    g_hash_table_unref(ctx_cache);
> +}
> +
> +/* Find or allocate translation context for a given {device_id, process_id} */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +    unsigned devid, unsigned pasid, void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .pasid = pasid,
> +    };
> +
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
> +        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                          g_free, NULL);
> +        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +    }
> +
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->pasid = pasid;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        g_hash_table_add(ctx_cache, ctx);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
> +        struct riscv_iommu_fq_record ev = { 0 };
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
> +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
> +        riscv_iommu_fault(s, &ev);
> +    }
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref) {
> +        g_hash_table_unref((GHashTable *)ref);
> +    }
> +}
> +
> +/* Find or allocate address space for a given device */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as == NULL) {
> +        char name[64];
> +        as = g_new0(RISCVIOMMUSpace, 1);
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        /* IOVA address space, untranslated addresses */
> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> +            OBJECT(as), name, UINT64_MAX);
> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
> +            TYPE_RISCV_IOMMU_PCI);
> +
> +        qemu_mutex_lock(&s->core_lock);
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +        qemu_mutex_unlock(&s->core_lock);
> +
> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +    }
> +    return &as->iova_as;
> +}
> +
> +/* Translation Object cache support */
> +static gboolean __iot_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
> +    RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
> +    return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
> +           t1->iova == t2->iova;
> +}
> +
> +static guint __iot_hash(gconstpointer v)
> +{
> +    RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
> +    return (guint)t->iova;
> +}
> +
> +/* GV: 1 PSCV: 1 AV: 1 */
> +static void __iot_inval_pscid_iova(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> +    if (iot->gscid == arg->gscid &&
> +        iot->pscid == arg->pscid &&
> +        iot->iova == arg->iova) {
> +        iot->perm = 0;
> +    }
> +}
> +
> +/* GV: 1 PSCV: 1 AV: 0 */
> +static void __iot_inval_pscid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> +    if (iot->gscid == arg->gscid &&
> +        iot->pscid == arg->pscid) {
> +        iot->perm = 0;
> +    }
> +}
> +
> +/* GV: 1 GVMA: 1 */
> +static void __iot_inval_gscid_gpa(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> +    if (iot->gscid == arg->gscid) {
> +        /* simplified cache, no GPA matching */
> +        iot->perm = 0;
> +    }
> +}
> +
> +/* GV: 1 GVMA: 0 */
> +static void __iot_inval_gscid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> +    if (iot->gscid == arg->gscid) {
> +        iot->perm = 0;
> +    }
> +}
> +
> +/* GV: 0 */
> +static void __iot_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> +    iot->perm = 0;
> +}
> +
> +/* caller should keep ref-count for iot_cache object */
> +static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
> +    GHashTable *iot_cache, hwaddr iova)
> +{
> +    RISCVIOMMUEntry key = {
> +        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
> +        .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
> +        .iova  = PPN_DOWN(iova),
> +    };
> +    return g_hash_table_lookup(iot_cache, &key);
> +}
> +
> +/* caller should keep ref-count for iot_cache object */
> +static void riscv_iommu_iot_update(RISCVIOMMUState *s,
> +    GHashTable *iot_cache, RISCVIOMMUEntry *iot)
> +{
> +    if (!s->iot_limit) {
> +        return;
> +    }
> +
> +    if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
> +        iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
> +                                          g_free, NULL);
> +        g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
> +    }
> +    g_hash_table_add(iot_cache, iot);
> +}
> +
> +static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
> +    uint32_t gscid, uint32_t pscid, hwaddr iova)
> +{
> +    GHashTable *iot_cache;
> +    RISCVIOMMUEntry key = {
> +        .gscid = gscid,
> +        .pscid = pscid,
> +        .iova  = PPN_DOWN(iova),
> +    };
> +
> +    iot_cache = g_hash_table_ref(s->iot_cache);
> +    g_hash_table_foreach(iot_cache, func, &key);
> +    g_hash_table_unref(iot_cache);
> +}
> +
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb, bool enable_cache)
> +{
> +    RISCVIOMMUEntry *iot;
> +    IOMMUAccessFlags perm;
> +    bool enable_faults;
> +    bool enable_pasid;
> +    bool enable_pri;
> +    GHashTable *iot_cache;
> +    int fault;
> +
> +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_URQ);
> +
> +    iot_cache = g_hash_table_ref(s->iot_cache);
> +
> +    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Check for ATS request. */
> +    if (iotlb->perm == IOMMU_NONE) {
> +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_ATS_RQ);
> +        /* Check if ATS is disabled. */
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
> +            enable_pri = false;
> +            fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +            goto done;
> +        }
> +        trace_riscv_iommu_ats(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid), iotlb->iova);
> +    }
> +
> +    iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
> +    perm = iot ? iot->perm : IOMMU_NONE;
> +    if (perm != IOMMU_NONE) {
> +        iotlb->translated_addr = PPN_PHYS(iot->phys);
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        iotlb->perm = perm;
> +        fault = 0;
> +        goto done;
> +    }
> +
> +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_TLB_MISS);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
> +
> +    if (!fault && iotlb->target_as == &s->trap_as) {
> +        /* Do not cache trapped MSI translations */
> +        goto done;
> +    }
> +
> +    if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
> +        iot = g_new0(RISCVIOMMUEntry, 1);
> +        iot->iova = PPN_DOWN(iotlb->iova);
> +        iot->phys = PPN_DOWN(iotlb->translated_addr);
> +        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
> +        iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
> +        iot->perm = iotlb->perm;
> +        riscv_iommu_iot_update(s, iot_cache, iot);
> +    }
> +
> +done:
> +    g_hash_table_unref(iot_cache);
> +
> +    if (enable_pri && fault) {
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pasid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) | RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +        return fault;
> +    }
> +
> +    if (enable_faults && fault) {
> +        struct riscv_iommu_fq_record ev;
> +        const unsigned ttype =
> +            (iotlb->perm & IOMMU_RW) ? RISCV_IOMMU_FQ_TTYPE_UADDR_WR :
> +            ((iotlb->perm & IOMMU_RO) ? RISCV_IOMMU_FQ_TTYPE_UADDR_RD :
> +            RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ);
> +        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +        ev.iotval    = iotlb->iova;
> +        ev.iotval2   = iotlb->translated_addr;
> +        ev._reserved = 0;
> +        riscv_iommu_fault(s, &ev);
> +        return fault;
> +    }
> +
> +    return 0;
> +}
> +
> +/* IOMMU Command Interface */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (!notify) {
> +        return MEMTX_OK;
> +    }
> +
> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +        MEMTXATTRS_UNSPECIFIED);
> +}
> +
> +static void riscv_iommu_ats(RISCVIOMMUState *s,
> +    struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
> +    IOMMUAccessFlags perm,
> +    void (*trace_fn)(const char *id))
> +{
> +    RISCVIOMMUSpace *as = NULL;
> +    IOMMUNotifier *n;
> +    IOMMUTLBEvent event;
> +    uint32_t pasid;
> +    uint32_t devid;
> +    const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
> +
> +    if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
> +        /* Use device segment and requester id */
> +        devid = get_field(cmd->dword0,
> +            RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
> +    } else {
> +        devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
> +    }
> +
> +    pasid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (!as || !as->notifier) {
> +        return;
> +    }
> +
> +    event.type = flag;
> +    event.entry.perm = perm;
> +    event.entry.target_as = s->target_as;
> +
> +    IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
> +        if (!pv || n->iommu_idx == pasid) {
> +            event.entry.iova = n->start;
> +            event.entry.addr_mask = n->end - n->start;
> +            trace_fn(as->iova_mr.parent_obj.name);
> +            memory_region_notify_iommu_one(n, &event);
> +        }
> +    }
> +}
> +
> +static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
> +    struct riscv_iommu_command *cmd)
> +{
> +    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
> +                           trace_riscv_iommu_ats_inval);
> +}
> +
> +static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
> +    struct riscv_iommu_command *cmd)
> +{
> +    unsigned resp_code = get_field(cmd->dword1, RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
> +    /* Using the access flag to carry response code information */
> +    IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
> +    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
> +                           trace_riscv_iommu_ats_prgr);
> +}
> +
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    uint64_t old_ddtp = s->ddtp;
> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    bool ok = false;
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    if (new_mode == old_mode ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> +        ok = true;
> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> +    } else {
> +        new_ddtp = old_ddtp;
> +    }
> +    s->ddtp = new_ddtp;
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> +}
> +
> +/* Command function and opcode field. */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    GHFunc func;
> +
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        switch (get_field(cmd.dword0, RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC)) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            res = riscv_iommu_iofence(s, cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV,
> +                cmd.dword1, get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
> +                /* invalidate all cache mappings */
> +                func = __iot_inval_all;
> +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
> +                /* invalidate cache matching GSCID */
> +                func = __iot_inval_gscid;
> +            } else {
> +                /* invalidate cache matching GSCID and ADDR (GPA) */
> +                func = __iot_inval_gscid_gpa;
> +            }
> +            riscv_iommu_iot_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
> +                cmd.dword1 & TARGET_PAGE_MASK);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
> +                /* invalidate all cache mappings, simplified model */
> +                func = __iot_inval_all;
> +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
> +                /* invalidate cache matching GSCID, simplified model */
> +                func = __iot_inval_gscid;
> +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
> +                /* invalidate cache matching GSCID and PSCID */
> +                func = __iot_inval_pscid;
> +            } else {
> +                /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
> +                func = __iot_inval_pscid_iova;
> +            }
> +            riscv_iommu_iot_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
> +                cmd.dword1 & TARGET_PAGE_MASK);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = __ctx_inval_all;
> +            } else {
> +                /* invalidate all device context matching DID */
> +                func = __ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = __ctx_inval_devid_pasid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        /* ATS commands */
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
> +                             RISCV_IOMMU_CMD_ATS_OPCODE):
> +            riscv_iommu_ats_inval(s, &cmd);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
> +                             RISCV_IOMMU_CMD_ATS_OPCODE):
> +            riscv_iommu_ats_prgr(s, &cmd);
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> +            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> +            RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> +            RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
> +{
> +    uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
> +    uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
> +    unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
> +    unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +
> +    if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
> +        return;
> +    }
> +
> +    ctx = riscv_iommu_ctx(s, devid, pid, &ref);
> +    if (ctx == NULL) {
> +        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
> +            RISCV_IOMMU_TR_RESPONSE_FAULT | (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
> +    } else {
> +        IOMMUTLBEntry iotlb = {
> +            .iova = iova,
> +            .perm = IOMMU_NONE,
> +            .addr_mask = ~0,
> +            .target_as = NULL,
> +        };
> +        int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
> +        if (fault) {
> +            iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
> +        } else {
> +            iova = ((iotlb.translated_addr & ~iotlb.addr_mask) >> 2) &
> +                RISCV_IOMMU_TR_RESPONSE_PPN;
> +        }
> +        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
> +    }
> +
> +    riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
> +        RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
> +    riscv_iommu_ctx_put(s, ref);
> +}
> +
> +/* Core IOMMU execution activation */
> +enum {
> +    RISCV_IOMMU_EXEC_DDTP,
> +    RISCV_IOMMU_EXEC_CQCSR,
> +    RISCV_IOMMU_EXEC_CQT,
> +    RISCV_IOMMU_EXEC_FQCSR,
> +    RISCV_IOMMU_EXEC_FQH,
> +    RISCV_IOMMU_EXEC_PQCSR,
> +    RISCV_IOMMU_EXEC_PQH,
> +    RISCV_IOMMU_EXEC_TR_REQUEST,
> +    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
> +    RISCV_IOMMU_EXEC_EXIT,
> +};
> +
> +static void *riscv_iommu_core_proc(void* arg)
> +{
> +    RISCVIOMMUState *s = arg;
> +    unsigned exec = 0;
> +    unsigned mask = 0;
> +
> +    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
> +        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
> +        switch (exec & mask) {
> +        case BIT(RISCV_IOMMU_EXEC_DDTP):
> +            riscv_iommu_process_ddtp(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQCSR):
> +            riscv_iommu_process_cq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_CQT):
> +            riscv_iommu_process_cq_tail(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQCSR):
> +            riscv_iommu_process_fq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_FQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQCSR):
> +            riscv_iommu_process_pq_control(s);
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_PQH):
> +            /* NOP */
> +            break;
> +        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
> +            riscv_iommu_process_dbg(s);
> +            break;
> +        }
> +        exec &= ~mask;
> +        if (!exec) {
> +            qemu_mutex_lock(&s->core_lock);
> +            exec = s->core_exec;
> +            while (!exec) {
> +                qemu_cond_wait(&s->core_cond, &s->core_lock);
> +                exec = s->core_exec;
> +            }
> +            s->core_exec = 0;
> +            qemu_mutex_unlock(&s->core_lock);
> +        }
> +    };
> +
> +    return NULL;
> +}
> +
> +/* For now we assume IOMMU HPM frequency to be 1GHz so 1-cycle is of 1-ns. */
> +static inline uint64_t __get_cycles(void)
> +{
> +    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
> +}
> +
> +static void __hpm_setup_timer(RISCVIOMMUState *s, uint64_t value)
> +{
> +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> +    uint64_t overflow_at, overflow_ns;
> +
> +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> +        return;
> +    }
> +
> +    /*
> +     * We are using INT64_MAX here instead to UINT64_MAX because cycle counter
> +     * has 63-bit precision and INT64_MAX is the maximum it can store.
> +     */
> +    if (value) {
> +        overflow_ns = INT64_MAX - value + 1;
> +    } else {
> +        overflow_ns = INT64_MAX;
> +    }
> +
> +    overflow_at = (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + overflow_ns;
> +
> +    if (overflow_at > INT64_MAX) {
> +        s->irq_overflow_left = overflow_at - INT64_MAX;
> +        overflow_at = INT64_MAX;
> +    }
> +
> +    timer_mod_anticipate_ns(s->hpm_timer, overflow_at);
> +}
> +
> +/* Updates the internal cycle counter state when iocntinh:CY is changed. */
> +static void riscv_iommu_process_iocntinh_cy(RISCVIOMMUState *s,
> +                                            bool prev_cy_inh)
> +{
> +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> +
> +    /* We only need to process CY bit toggle. */
> +    if (!(inhibit ^ prev_cy_inh)) {
> +        return;
> +    }
> +
> +    if (!(inhibit & RISCV_IOMMU_IOCOUNTINH_CY)) {
> +        /*
> +         * Cycle counter is enabled. Just start the timer again and update the
> +         * clock snapshot value to point to the current time to make sure
> +         * iohpmcycles read is correct.
> +         */
> +        s->hpmcycle_prev = __get_cycles();
> +        __hpm_setup_timer(s, s->hpmcycle_val);
> +    } else {
> +        /*
> +         * Cycle counter is disabled. Stop the timer and update the cycle
> +         * counter to record the current value which is last programmed
> +         * value + the cycles passed so far.
> +         */
> +        s->hpmcycle_val = s->hpmcycle_val + (__get_cycles() - s->hpmcycle_prev);
> +        timer_del(s->hpm_timer);
> +    }
> +}
> +
> +static void riscv_iommu_process_hpmcycle_write(RISCVIOMMUState *s)
> +{
> +    const uint64_t val = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
> +    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> +
> +    /*
> +     * Clear OF bit in IOCNTOVF if it's being cleared in IOHPMCYCLES register.
> +     */
> +    if (get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY) &&
> +        !get_field(val, RISCV_IOMMU_IOHPMCYCLES_OVF)) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0,
> +            RISCV_IOMMU_IOCOUNTOVF_CY);
> +    }
> +
> +    s->hpmcycle_val = val & ~RISCV_IOMMU_IOHPMCYCLES_OVF;
> +    s->hpmcycle_prev = __get_cycles();
> +    __hpm_setup_timer(s, s->hpmcycle_val);
> +}
> +
> +static inline bool __check_valid_event_id(unsigned event_id)
> +{
> +    return event_id > RISCV_IOMMU_HPMEVENT_INVALID &&
> +           event_id < RISCV_IOMMU_HPMEVENT_MAX;
> +}
> +
> +static gboolean __hpm_event_equal(gpointer key, gpointer value, gpointer udata)
> +{
> +    uint32_t *pair = udata;
> +
> +    if (GPOINTER_TO_UINT(value) & (1 << pair[0])) {
> +        pair[1] = GPOINTER_TO_UINT(key);
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
> +/* Caller must check ctr_idx against hpm_ctrs to see if its supported or not. */
> +static void __update_event_map(RISCVIOMMUState *s, uint64_t value,
> +    uint32_t ctr_idx)
> +{
> +    unsigned event_id = get_field(value, RISCV_IOMMU_IOHPMEVT_EVENT_ID);
> +    uint32_t pair[2] = { ctr_idx, RISCV_IOMMU_HPMEVENT_INVALID };
> +    uint32_t new_value = 1 << ctr_idx;
> +    gpointer data;
> +
> +    /* If EventID field is RISCV_IOMMU_HPMEVENT_INVALID remove the current mapping. */
> +    if (event_id == RISCV_IOMMU_HPMEVENT_INVALID) {
> +        data = g_hash_table_find(s->hpm_event_ctr_map, __hpm_event_equal, pair);
> +
> +        new_value = GPOINTER_TO_UINT(data) & ~(new_value);
> +        pthread_rwlock_wrlock(&s->ht_lock);
> +        if (new_value != 0) {
> +            g_hash_table_replace(s->hpm_event_ctr_map,
> +                                 GUINT_TO_POINTER(pair[1]),
> +                                 GUINT_TO_POINTER(new_value));
> +        } else {
> +            g_hash_table_remove(s->hpm_event_ctr_map,
> +                                GUINT_TO_POINTER(pair[1]));
> +        }
> +        pthread_rwlock_unlock(&s->ht_lock);
> +
> +        return;
> +    }
> +
> +    /* Update the counter mask if the event is already enabled. */
> +    if (g_hash_table_lookup_extended(s->hpm_event_ctr_map,
> +                                     GUINT_TO_POINTER(event_id),
> +                                     NULL,
> +                                     &data)) {
> +        new_value |= GPOINTER_TO_UINT(data);
> +    }
> +
> +    pthread_rwlock_wrlock(&s->ht_lock);
> +    g_hash_table_insert(s->hpm_event_ctr_map,
> +                        GUINT_TO_POINTER(event_id),
> +                        GUINT_TO_POINTER(new_value));
> +    pthread_rwlock_unlock(&s->ht_lock);
> +}
> +
> +static void riscv_iommu_process_hpmevt_write(RISCVIOMMUState *s,
> +                                             uint32_t evt_reg)
> +{
> +    const uint32_t ctr_idx = (evt_reg - RISCV_IOMMU_REG_IOHPMEVT_BASE) >> 3;
> +    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> +    uint64_t val = riscv_iommu_reg_get64(s, evt_reg);
> +
> +    if (ctr_idx >= s->hpm_cntrs) {
> +        return;
> +    }
> +
> +    /* Clear OF bit in IOCNTOVF if it's being cleared in IOHPMEVT register. */
> +    if (get_field(ovf, BIT(ctr_idx + 1)) && !get_field(val, RISCV_IOMMU_IOHPMEVT_OF)) {
> +        /* +1 to offset CYCLE register OF bit. */
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0, BIT(ctr_idx + 1));
> +    }
> +
> +    if (!__check_valid_event_id(get_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID))) {
> +        /* Reset EventID (WARL) field to invalid. */
> +        val = set_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID,
> +            RISCV_IOMMU_HPMEVENT_INVALID);
> +        riscv_iommu_reg_set64(s, evt_reg, val);
> +    }
> +
> +    __update_event_map(s, val, ctr_idx);
> +}
> +
> +static void riscv_iommu_process_hpm_writes(RISCVIOMMUState *s,
> +                                           uint32_t regb,
> +                                           bool prev_cy_inh)
> +{
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_IOCOUNTINH:
> +        riscv_iommu_process_iocntinh_cy(s, prev_cy_inh);
> +        break;
> +
> +    case RISCV_IOMMU_REG_IOHPMCYCLES:
> +    case RISCV_IOMMU_REG_IOHPMCYCLES + 4:
> +        riscv_iommu_process_hpmcycle_write(s);
> +        break;
> +
> +    case RISCV_IOMMU_REG_IOHPMEVT_BASE ...
> +        RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4:
> +        riscv_iommu_process_hpmevt_write(s, regb & ~7);
> +        break;
> +    }
> +}
> +
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    bool cy_inh = false;
> +    uint32_t busy = 0;
> +    uint32_t exec = 0;
> +
> +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQT);
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQH:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQH);
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_IOCOUNTINH:
> +        if (addr != RISCV_IOMMU_REG_IOCOUNTINH) {
> +            break;
> +        }
> +
> +        /* Store previous value of CY bit. */
> +        cy_inh = !!(riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH) &
> +            RISCV_IOMMU_IOCOUNTINH_CY);
> +        break;
> +
> +    case RISCV_IOMMU_REG_TR_REQ_CTL:
> +        exec = BIT(RISCV_IOMMU_EXEC_TR_REQUEST);
> +        regb = RISCV_IOMMU_REG_TR_REQ_CTL;
> +        busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
> +        break;
> +    }
> +
> +    /*
> +     * Registers update might be not synchronized with core logic.
> +     * If system software updates register when relevant BUSY bit is set
> +     * IOMMU behavior of additional writes to the register is UNSPECIFIED
> +     */
> +
> +    qemu_spin_lock(&s->regs_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    }
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    /* Process HPM writes and update any internal state if needed. */
> +    if (regb >= RISCV_IOMMU_REG_IOCOUNTOVF &&
> +        regb <= (RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4)) {
> +        riscv_iommu_process_hpm_writes(s, regb, cy_inh);
> +    }
> +
> +    /* Wake up core processing thread. */
> +    if (exec) {
> +        qemu_mutex_lock(&s->core_lock);
> +        s->core_exec |= exec;
> +        qemu_cond_signal(&s->core_cond);
> +        qemu_mutex_unlock(&s->core_lock);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static uint64_t riscv_iommu_hpmcycle_read(RISCVIOMMUState *s)
> +{
> +    const uint64_t cycle = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
> +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> +    const uint64_t ctr_prev = s->hpmcycle_prev;
> +    const uint64_t ctr_val = s->hpmcycle_val;
> +
> +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> +        /*
> +         * Counter should not increment if inhibit bit is set. We can't really
> +         * stop the QEMU_CLOCK_VIRTUAL, so we just return the last updated
> +         * counter value to indicate that counter was not incremented.
> +         */
> +        return (ctr_val & RISCV_IOMMU_IOHPMCYCLES_COUNTER) |
> +               (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
> +    }
> +
> +    return (ctr_val + __get_cycles() - ctr_prev) |
> +        (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
> +}
> +
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +    uint8_t *ptr;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Compute cycle register value. */
> +    if ((addr & ~7) == RISCV_IOMMU_REG_IOHPMCYCLES) {
> +        val = riscv_iommu_hpmcycle_read(s);
> +        ptr = (uint8_t *)&val + (addr & 7);
> +    } else if ((addr & ~3) == RISCV_IOMMU_REG_IOCOUNTOVF) {
> +        /*
> +         * Software can read RISCV_IOMMU_REG_IOCOUNTOVF before timer callback completes.
> +         * In which case CY_OF bit in RISCV_IOMMU_IOHPMCYCLES_OVF would be 0. Here we
> +         * take the CY_OF bit state from RISCV_IOMMU_REG_IOHPMCYCLES register as it's
> +         * not dependent over the timer callback and is computed from cycle
> +         * overflow.
> +         */
> +        val = ldq_le_p(&s->regs_rw[addr]);
> +        val |= (riscv_iommu_hpmcycle_read(s) & RISCV_IOMMU_IOHPMCYCLES_OVF)
> +                   ? RISCV_IOMMU_IOCOUNTOVF_CY
> +                   : 0;
> +        ptr = (uint8_t *)&val + (addr & 3);
> +    } else {
> +        ptr = &s->regs_rw[addr];
> +    }
> +
> +    if (size == 1) {
> +        val = (uint64_t)*ptr;
> +    } else if (size == 2) {
> +        val = lduw_le_p(ptr);
> +    } else if (size == 4) {
> +        val = ldl_le_p(ptr);
> +    } else if (size == 8) {
> +        val = ldq_le_p(ptr);
> +    } else {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *data = val;
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide a valid
> + * memory transaction attributes with requested_id set.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res;
> +    void *ref;
> +    uint32_t devid = attrs.requester_id;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx == NULL) {
> +        res = MEMTX_ACCESS_ERROR;
> +    } else {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    riscv_iommu_ctx_put(s, ref);
> +    return res;
> +}
> +
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/* Timer callback for cycle counter overflow. */
> +static void riscv_iommu_hpm_timer_cb(void *priv)
> +{
> +    RISCVIOMMUState *s = priv;
> +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> +    uint32_t ovf;
> +
> +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> +        return;
> +    }
> +
> +    if (s->irq_overflow_left > 0) {
> +        uint64_t irq_trigger_at =
> +            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->irq_overflow_left;
> +        timer_mod_anticipate_ns(s->hpm_timer, irq_trigger_at);
> +        s->irq_overflow_left = 0;
> +        return;
> +    }
> +
> +    ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> +    if (!get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY)) {
> +        /*
> +         * We don't need to set hpmcycle_val to zero and update hpmcycle_prev to
> +         * current clock value. The way we calculate iohpmcycs will overflow
> +         * and return the correct value. This avoids the need to synchronize
> +         * timer callback and write callback.
> +         */
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF,
> +            RISCV_IOMMU_IOCOUNTOVF_CY, 0);
> +        riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_IOHPMCYCLES,
> +            RISCV_IOMMU_IOHPMCYCLES_OVF, 0);
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
> +    }
> +}
> +
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    const uint64_t cap_implemented =
> +        RISCV_IOMMU_CAP_MSI_FLAT |
> +        RISCV_IOMMU_CAP_MSI_MRIF |
> +        RISCV_IOMMU_CAP_ATS |
> +        RISCV_IOMMU_CAP_S_SV32 |
> +        RISCV_IOMMU_CAP_S_SV39 |
> +        RISCV_IOMMU_CAP_S_SV48 |
> +        RISCV_IOMMU_CAP_S_SV57 |
> +        RISCV_IOMMU_CAP_G_SV32 |
> +        RISCV_IOMMU_CAP_G_SV39 |
> +        RISCV_IOMMU_CAP_G_SV48 |
> +        RISCV_IOMMU_CAP_G_SV57 |
> +        RISCV_IOMMU_CAP_MSI_FLAT |
> +        RISCV_IOMMU_CAP_MSI_MRIF |
> +        RISCV_IOMMU_CAP_ATS |
> +        RISCV_IOMMU_CAP_IGS |
> +        RISCV_IOMMU_CAP_HPM |
> +        RISCV_IOMMU_CAP_DBG |
> +        RISCV_IOMMU_CAP_PD8 |
> +        RISCV_IOMMU_CAP_PD17 |
> +        RISCV_IOMMU_CAP_PD20;
> +
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    s->cap &= cap_implemented;
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_VERSION, s->version);
> +
> +    if (s->hpm_cntrs > RISCV_IOMMU_IOCOUNT_NUM) {
> +        /* Clip number of HPM counters to maximum supported (31). */
> +        s->hpm_cntrs = RISCV_IOMMU_IOCOUNT_NUM;
> +    } else if (s->hpm_cntrs == 0) {
> +        /* Disable hardware performance monitor interface */
> +        s->cap |= RISCV_IOMMU_CAP_HPM;
> +    }
> +
> +    /* Verify supported IGS */
> +    switch (get_field(s->cap, RISCV_IOMMU_CAP_IGS)) {
> +    case RISCV_IOMMU_CAP_IGS_MSI:
> +    case RISCV_IOMMU_CAP_IGS_WSI:
> +        break;
> +    default:
> +        error_setg(errp, "can't support requested IGS mode: cap: %" PRIx64,
> +            s->cap);
> +        return;
> +    }
> +
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS, TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* Adjust reported PD capabilities */
> +    if (s->pasid_bits < 20) {
> +        s->cap &= ~RISCV_IOMMU_CAP_PD20;
> +    } else if (s->pasid_bits < 17) {
> +        s->cap &= ~RISCV_IOMMU_CAP_PD17;
> +    } else if (s->pasid_bits < 8) {
> +        s->cap &= ~RISCV_IOMMU_CAP_PD8;
> +    }
> +
> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +     /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    /* If HPM registers are enabled. */
> +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> +        /* +1 for cycle counter bit. */
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOCOUNTINH], ~((2 << s->hpm_cntrs) - 1));
> +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCYCLES], 0);
> +        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCTR_BASE], 0x00, s->hpm_cntrs * 8);
> +        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMEVT_BASE], 0x00, s->hpm_cntrs * 8);
> +    }
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +    /* If debug registers enabled. */
> +    if (s->cap & RISCV_IOMMU_CAP_DBG) {
> +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
> +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
> +            RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
> +    }
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache */
> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                         g_free, NULL);
> +    s->iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
> +                                         g_free, NULL);
> +
> +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> +        s->hpm_event_ctr_map = g_hash_table_new(g_direct_hash, g_direct_equal);
> +        pthread_rwlock_init(&s->ht_lock, NULL);
> +        s->hpm_timer =
> +            timer_new_ns(QEMU_CLOCK_VIRTUAL, riscv_iommu_hpm_timer_cb, s);
> +    }
> +
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +    qemu_cond_init(&s->core_cond);
> +    qemu_mutex_init(&s->core_lock);
> +    qemu_spin_init(&s->regs_lock);
> +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    /* cancel pending operations and stop */
> +    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
> +    qemu_cond_signal(&s->core_cond);
> +    qemu_mutex_unlock(&s->core_lock);
> +    qemu_thread_join(&s->core_proc);
> +    qemu_cond_destroy(&s->core_cond);
> +    qemu_mutex_destroy(&s->core_lock);
> +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> +        timer_free(s->hpm_timer);
> +        pthread_rwlock_destroy(&s->ht_lock);
> +        g_hash_table_unref(s->hpm_event_ctr_map);
> +    }
> +    g_hash_table_unref(s->iot_cache);
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT64("capabilities", RISCVIOMMUState, cap, ~0ULL),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
> +        LIMIT_CACHE_IOT),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_UINT8("hpm-counters", RISCVIOMMUState, hpm_cntrs,
> +        RISCV_IOMMU_IOCOUNT_NUM),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +    Error **errp)
> +{
> +    if (bus->iommu_fn == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (bus->iommu_fn == NULL) {
> +        pci_setup_iommu(bus, riscv_iommu_find_as, iommu);
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return RISCV_IOMMU_NOPASID;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    return 1;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..c68e09db58
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,152 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pasid_bits;  /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +    GHashTable *iot_cache;          /* IO Translated Address Cache */
> +    unsigned iot_limit;             /* IO Translation Cache size limit */
> +
> +    /* HPM cycle counter */
> +    QEMUTimer *hpm_timer;
> +    uint64_t hpmcycle_val;      /* Current value of cycle register */
> +    uint64_t hpmcycle_prev;     /* Saved value of QEMU_CLOCK_VIRTUAL clock */
> +    uint64_t irq_overflow_left; /* Value beyond INT64_MAX after overflow */
> +
> +    /* HPM event counters */
> +    uint8_t hpm_cntrs;
> +    GHashTable *hpm_event_ctr_map; /* Mapping of events to counters */
> +    pthread_rwlock_t ht_lock;      /* Lock used for hpm_event_ctr_map updates */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    QemuSpin regs_lock;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stl_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set, uint64_t clr)
> +{
> +    uint64_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stq_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +
> +
> +
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..fd5e21e3d4
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,14 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_ats(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: translate request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> +riscv_iommu_ats_inval(const char *id) "%s: dev-iotlb invalidate"
> +riscv_iommu_ats_prgr(const char *id) "%s: dev-iotlb page request group response"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..b88504b750
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1,2 @@
> +#include "trace/trace-hw_riscv.h"
> +
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..2a63a5cbf2
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,40 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "x-riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "x-riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "x-riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#define TYPE_RISCV_IOMMU_SYS "x-riscv-iommu-device"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStateSys, RISCV_IOMMU_SYS)
> +typedef struct RISCVIOMMUStateSys RISCVIOMMUStateSys;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index 5fcdb37a71..693ea3447d 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3268,6 +3268,7 @@ if have_system
>      'hw/rdma',
>      'hw/rdma/vmw',
>      'hw/rtc',
> +    'hw/riscv',
>      'hw/s390x',
>      'hw/scsi',
>      'hw/sd',
> --
> 2.34.1
>
>
Tomasz Jeznach Aug. 7, 2023, 4:16 p.m. UTC | #2
On Sun, Jul 23, 2023 at 7:33 PM Alistair Francis <alistair23@gmail.com> wrote:
>
> On Thu, Jul 20, 2023 at 12:34 PM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
> >
> > The RISC-V IOMMU specification is now ratified as-per the RISC-V international
> > process [1]. The latest frozen specifcation can be found at:
> > https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Exciting!
>
> >
> > The patch add device emulation for RISC-V IOMMU which supports device and process
> > context lookups, command and fault queue interfaces, two stage address translation
> > logic with Sv32, Sv39, Sv48, Sv57 addressing modes, address translation cache,
> > MSI remapping with FLAT/MRIF modes, initial ATS and PRI interfaces, debug capabilities,
> > hardware performance counters. Platform and PCIe device instantiation is supported,
> > with wire-signaled and message-signaled interrupt capabilities.
> >
> > Hardware interface definition file is shared with Linux kernel driver implementation,
> > available in the maintainer's branch riscv_iommu_v1 at https://github.com/tjeznach/linux.
> >
> > Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> > Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > ---
> >  hw/riscv/Kconfig            |    3 +
> >  hw/riscv/meson.build        |    1 +
> >  hw/riscv/riscv-iommu-bits.h |  749 +++++++++++
> >  hw/riscv/riscv-iommu-pci.c  |  181 +++
> >  hw/riscv/riscv-iommu-sys.c  |  123 ++
> >  hw/riscv/riscv-iommu.c      | 2539 +++++++++++++++++++++++++++++++++++
> >  hw/riscv/riscv-iommu.h      |  152 +++
> >  hw/riscv/trace-events       |   14 +
> >  hw/riscv/trace.h            |    2 +
> >  include/hw/riscv/iommu.h    |   40 +
> >  meson.build                 |    1 +
>
> This is a really long patch!
>
> I think this should at least be split up to rougly each file (as long
> as it compiles). For example the header files could be added in a
> patch each. Which  would reduce some of the review burden.
>

Thanks for the feedback. Definitely it will be easier for reviewers :)
I will split into more reasonable patches.

> >  11 files changed, 3805 insertions(+)
> >  create mode 100644 hw/riscv/riscv-iommu-bits.h
> >  create mode 100644 hw/riscv/riscv-iommu-pci.c
> >  create mode 100644 hw/riscv/riscv-iommu-sys.c
> >  create mode 100644 hw/riscv/riscv-iommu.c
> >  create mode 100644 hw/riscv/riscv-iommu.h
> >  create mode 100644 hw/riscv/trace-events
> >  create mode 100644 hw/riscv/trace.h
> >  create mode 100644 include/hw/riscv/iommu.h
> >
> > diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> > index b6a5eb4452..617a509f1b 100644
> > --- a/hw/riscv/Kconfig
> > +++ b/hw/riscv/Kconfig
> > @@ -1,3 +1,6 @@
> > +config RISCV_IOMMU
> > +    bool
> > +
> >  config RISCV_NUMA
> >      bool
> >
> > diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> > index 2f7ee81be3..e37c5d78e2 100644
> > --- a/hw/riscv/meson.build
> > +++ b/hw/riscv/meson.build
> > @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
> >  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
> >  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
> >  riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> > +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c', 'riscv-iommu-pci.c', 'riscv-iommu-sys.c'))
> >
> >  hw_arch += {'riscv': riscv_ss}
> > diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> > new file mode 100644
> > index 0000000000..9ce713361f
> > --- /dev/null
> > +++ b/hw/riscv/riscv-iommu-bits.h
> > @@ -0,0 +1,749 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright © 2022-2023 Rivos Inc.
> > + * Copyright © 2023 FORTH-ICS/CARV
> > + * Copyright © 2023 RISC-V IOMMU Task Group
> > + *
> > + * RISC-V Ziommu - Register Layout and Data Structures.
> > + *
> > + * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
> > + * Published at  https://github.com/riscv-non-isa/riscv-iommu
> > + *
> > + */
> > +
> > +#ifndef HW_RISCV_IOMMU_BITS_H
> > +#define HW_RISCV_IOMMU_BITS_H
> > +
> > +/*
> > + * This file is based on Linux RISC-V IOMMU file
> > + * located at 'drivers/iommu/riscv/iommu-bits.h'
> > + */
> > +
> > +#include "qemu/osdep.h"
>
> This shouldn't be included in header files
>
> > +
> > +#define RISCV_IOMMU_SPEC_DOT_VER 0x010
> > +
> > +#ifndef GENMASK_ULL
> > +#define GENMASK_ULL(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
> > +#endif
> > +
> > +/*
> > + * Chapter 5: Memory Mapped register interface
> > + */
> > +
> > +/* Common field positions */
> > +#define RISCV_IOMMU_PPN_FIELD           GENMASK_ULL(53, 10)
> > +#define RISCV_IOMMU_QUEUE_LOGSZ_FIELD   GENMASK_ULL(4, 0)
> > +#define RISCV_IOMMU_QUEUE_INDEX_FIELD   GENMASK_ULL(31, 0)
> > +#define RISCV_IOMMU_QUEUE_ENABLE        BIT(0)
> > +#define RISCV_IOMMU_QUEUE_INTR_ENABLE   BIT(1)
> > +#define RISCV_IOMMU_QUEUE_MEM_FAULT     BIT(8)
> > +#define RISCV_IOMMU_QUEUE_OVERFLOW      BIT(9)
> > +#define RISCV_IOMMU_QUEUE_ACTIVE        BIT(16)
> > +#define RISCV_IOMMU_QUEUE_BUSY          BIT(17)
> > +#define RISCV_IOMMU_ATP_PPN_FIELD       GENMASK_ULL(43, 0)
> > +#define RISCV_IOMMU_ATP_MODE_FIELD      GENMASK_ULL(63, 60)
> > +
> > +/* 5.3 IOMMU Capabilities (64bits) */
> > +#define RISCV_IOMMU_REG_CAP             0x0000
> > +#define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> > +#define RISCV_IOMMU_CAP_S_SV32          BIT_ULL(8)
> > +#define RISCV_IOMMU_CAP_S_SV39          BIT_ULL(9)
> > +#define RISCV_IOMMU_CAP_S_SV48          BIT_ULL(10)
> > +#define RISCV_IOMMU_CAP_S_SV57          BIT_ULL(11)
> > +#define RISCV_IOMMU_CAP_SVPBMT          BIT_ULL(15)
> > +#define RISCV_IOMMU_CAP_G_SV32          BIT_ULL(16)
> > +#define RISCV_IOMMU_CAP_G_SV39          BIT_ULL(17)
> > +#define RISCV_IOMMU_CAP_G_SV48          BIT_ULL(18)
> > +#define RISCV_IOMMU_CAP_G_SV57          BIT_ULL(19)
> > +#define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
> > +#define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
> > +#define RISCV_IOMMU_CAP_AMO             BIT_ULL(24)
> > +#define RISCV_IOMMU_CAP_ATS             BIT_ULL(25)
> > +#define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> > +#define RISCV_IOMMU_CAP_END             BIT_ULL(27)
> > +#define RISCV_IOMMU_CAP_IGS             GENMASK_ULL(29, 28)
> > +#define RISCV_IOMMU_CAP_HPM             BIT_ULL(30)
> > +#define RISCV_IOMMU_CAP_DBG             BIT_ULL(31)
> > +#define RISCV_IOMMU_CAP_PAS             GENMASK_ULL(37, 32)
> > +#define RISCV_IOMMU_CAP_PD8             BIT_ULL(38)
> > +#define RISCV_IOMMU_CAP_PD17            BIT_ULL(39)
> > +#define RISCV_IOMMU_CAP_PD20            BIT_ULL(40)
> > +
> > +#define RISCV_IOMMU_CAP_VERSION_VER_MASK      0xF0
> > +#define RISCV_IOMMU_CAP_VERSION_REV_MASK      0x0F
> > +
> > +/**
> > + * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings
> > + * @RISCV_IOMMU_CAP_IGS_MSI: I/O MMU supports only MSI generation
> > + * @RISCV_IOMMU_CAP_IGS_WSI: I/O MMU supports only Wired-Signaled interrupt
> > + * @RISCV_IOMMU_CAP_IGS_BOTH: I/O MMU supports both MSI and WSI generation
> > + * @RISCV_IOMMU_CAP_IGS_RSRV: Reserved for standard use
> > + */
> > +enum riscv_iommu_igs_settings {
> > +      RISCV_IOMMU_CAP_IGS_MSI  = 0,
> > +      RISCV_IOMMU_CAP_IGS_WSI  = 1,
> > +      RISCV_IOMMU_CAP_IGS_BOTH = 2,
> > +      RISCV_IOMMU_CAP_IGS_RSRV = 3
> > +};
> > +
> > +
> > +/* 5.4 Features control register (32bits) */
> > +#define RISCV_IOMMU_REG_FCTL            0x0008
> > +#define RISCV_IOMMU_FCTL_BE             BIT(0)
> > +#define RISCV_IOMMU_FCTL_WSI            BIT(1)
> > +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
> > +
> > +
> > +/* 5.5 Device-directory-table pointer (64bits) */
> > +#define RISCV_IOMMU_REG_DDTP            0x0010
> > +#define RISCV_IOMMU_DDTP_MODE           GENMASK_ULL(3, 0)
> > +#define RISCV_IOMMU_DDTP_BUSY           BIT_ULL(4)
> > +#define RISCV_IOMMU_DDTP_PPN            RISCV_IOMMU_PPN_FIELD
> > +
> > +/**
> > + * enum riscv_iommu_ddtp_modes - I/O MMU translation modes
> > + * @RISCV_IOMMU_DDTP_MODE_OFF: No inbound transactions allowed
> > + * @RISCV_IOMMU_DDTP_MODE_BARE: Pass-through mode
> > + * @RISCV_IOMMU_DDTP_MODE_1LVL: One-level DDT
> > + * @RISCV_IOMMU_DDTP_MODE_2LVL: Two-level DDT
> > + * @RISCV_IOMMU_DDTP_MODE_3LVL: Three-level DDT
> > + */
> > +enum riscv_iommu_ddtp_modes {
> > +      RISCV_IOMMU_DDTP_MODE_OFF = 0,
> > +      RISCV_IOMMU_DDTP_MODE_BARE = 1,
> > +      RISCV_IOMMU_DDTP_MODE_1LVL = 2,
> > +      RISCV_IOMMU_DDTP_MODE_2LVL = 3,
> > +      RISCV_IOMMU_DDTP_MODE_3LVL = 4,
> > +      RISCV_IOMMU_DDTP_MODE_MAX = 4
> > +};
> > +
> > +
> > +/* 5.6 Command Queue Base (64bits) */
> > +#define RISCV_IOMMU_REG_CQB             0x0018
> > +#define RISCV_IOMMU_CQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> > +#define RISCV_IOMMU_CQB_PPN             RISCV_IOMMU_PPN_FIELD
> > +
> > +/* 5.7 Command Queue head (32bits) */
> > +#define RISCV_IOMMU_REG_CQH             0x0020
> > +#define RISCV_IOMMU_CQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +/* 5.8 Command Queue tail (32bits) */
> > +#define RISCV_IOMMU_REG_CQT             0x0024
> > +#define RISCV_IOMMU_CQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +
> > +/* 5.9 Fault Queue Base (64bits) */
> > +#define RISCV_IOMMU_REG_FQB             0x0028
> > +#define RISCV_IOMMU_FQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> > +#define RISCV_IOMMU_FQB_PPN             RISCV_IOMMU_PPN_FIELD
> > +
> > +/* 5.10 Fault Queue Head (32bits) */
> > +#define RISCV_IOMMU_REG_FQH             0x0030
> > +#define RISCV_IOMMU_FQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +/* 5.11 Fault Queue tail (32bits) */
> > +#define RISCV_IOMMU_REG_FQT             0x0034
> > +#define RISCV_IOMMU_FQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +
> > +/* 5.12 Page Request Queue base (64bits) */
> > +#define RISCV_IOMMU_REG_PQB             0x0038
> > +#define RISCV_IOMMU_PQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
> > +#define RISCV_IOMMU_PQB_PPN             RISCV_IOMMU_PPN_FIELD
> > +
> > +/* 5.13 Page Request Queue head (32bits) */
> > +#define RISCV_IOMMU_REG_PQH             0x0040
> > +#define RISCV_IOMMU_PQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +/* 5.14 Page Request Queue tail (32bits) */
> > +#define RISCV_IOMMU_REG_PQT             0x0044
> > +#define RISCV_IOMMU_PQT_INDEX_MASK      RISCV_IOMMU_QUEUE_INDEX_FIELD
> > +
> > +/* 5.15 Command Queue CSR (32bits) */
> > +#define RISCV_IOMMU_REG_CQCSR           0x0048
> > +#define RISCV_IOMMU_CQCSR_CQEN          RISCV_IOMMU_QUEUE_ENABLE
> > +#define RISCV_IOMMU_CQCSR_CIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> > +#define RISCV_IOMMU_CQCSR_CQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> > +#define RISCV_IOMMU_CQCSR_CMD_TO        BIT(9)
> > +#define RISCV_IOMMU_CQCSR_CMD_ILL       BIT(10)
> > +#define RISCV_IOMMU_CQCSR_FENCE_W_IP    BIT(11)
> > +#define RISCV_IOMMU_CQCSR_CQON          RISCV_IOMMU_QUEUE_ACTIVE
> > +#define RISCV_IOMMU_CQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> > +
> > +
> > +/* 5.16 Fault Queue CSR (32bits) */
> > +#define RISCV_IOMMU_REG_FQCSR           0x004C
> > +#define RISCV_IOMMU_FQCSR_FQEN          RISCV_IOMMU_QUEUE_ENABLE
> > +#define RISCV_IOMMU_FQCSR_FIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> > +#define RISCV_IOMMU_FQCSR_FQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> > +#define RISCV_IOMMU_FQCSR_FQOF          RISCV_IOMMU_QUEUE_OVERFLOW
> > +#define RISCV_IOMMU_FQCSR_FQON          RISCV_IOMMU_QUEUE_ACTIVE
> > +#define RISCV_IOMMU_FQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> > +
> > +
> > +/* 5.17 Page Request Queue CSR (32bits) */
> > +#define RISCV_IOMMU_REG_PQCSR           0x0050
> > +#define RISCV_IOMMU_PQCSR_PQEN          RISCV_IOMMU_QUEUE_ENABLE
> > +#define RISCV_IOMMU_PQCSR_PIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
> > +#define RISCV_IOMMU_PQCSR_PQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
> > +#define RISCV_IOMMU_PQCSR_PQOF          RISCV_IOMMU_QUEUE_OVERFLOW
> > +#define RISCV_IOMMU_PQCSR_PQON          RISCV_IOMMU_QUEUE_ACTIVE
> > +#define RISCV_IOMMU_PQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
> > +
> > +
> > +/* 5.18 Interrupt Pending Status (32bits) */
> > +#define RISCV_IOMMU_REG_IPSR            0x0054
> > +
> > +#define RISCV_IOMMU_INTR_CQ             0
> > +#define RISCV_IOMMU_INTR_FQ             1
> > +#define RISCV_IOMMU_INTR_PM             2
> > +#define RISCV_IOMMU_INTR_PQ             3
> > +#define RISCV_IOMMU_INTR_COUNT          4
> > +
> > +#define RISCV_IOMMU_IPSR_CIP            BIT(RISCV_IOMMU_INTR_CQ)
> > +#define RISCV_IOMMU_IPSR_FIP            BIT(RISCV_IOMMU_INTR_FQ)
> > +#define RISCV_IOMMU_IPSR_PMIP           BIT(RISCV_IOMMU_INTR_PM)
> > +#define RISCV_IOMMU_IPSR_PIP            BIT(RISCV_IOMMU_INTR_PQ)
> > +
> > +#define RISCV_IOMMU_IOCOUNT_NUM         31
> > +
> > +/* 5.19 Performance monitoring counter overflow status (32bits) */
> > +#define RISCV_IOMMU_REG_IOCOUNTOVF      0x0058
> > +#define RISCV_IOMMU_IOCOUNTOVF_CY       BIT(0)
> > +#define RISCV_IOMMU_IOCOUNTOVF_HPM      GENMASK(31, 1)
> > +
> > +/* 5.20 Performance monitoring counter inhibits (32bits) */
> > +#define RISCV_IOMMU_REG_IOCOUNTINH      0x005C
> > +#define RISCV_IOMMU_IOCOUNTINH_CY       BIT(0)
> > +#define RISCV_IOMMU_IOCOUNTINH_HPM      GENMASK(31, 1)
> > +
> > +/* 5.21 Performance monitoring cycles counter (64bits) */
> > +#define RISCV_IOMMU_REG_IOHPMCYCLES     0x0060
> > +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0)
> > +#define RISCV_IOMMU_IOHPMCYCLES_OVF     BIT_ULL(63)
> > +
> > +/* 5.22 Performance monitoring event counters (31 * 64bits) */
> > +#define RISCV_IOMMU_REG_IOHPMCTR_BASE   0x0068
> > +#define RISCV_IOMMU_REG_IOHPMCTR(_n)    \
> > +    (RISCV_IOMMU_REG_IOHPMCTR_BASE + (_n * 0x8))
> > +
> > +/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> > +#define RISCV_IOMMU_REG_IOHPMEVT_BASE   0x0160
> > +#define RISCV_IOMMU_REG_IOHPMEVT(_n)    \
> > +    (RISCV_IOMMU_REG_IOHPMEVT_BASE + (_n * 0x8))
> > +#define RISCV_IOMMU_IOHPMEVT_EVENT_ID   GENMASK_ULL(14, 0)
> > +#define RISCV_IOMMU_IOHPMEVT_DMASK      BIT_ULL(15)
> > +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID  GENMASK_ULL(35, 16)
> > +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID  GENMASK_ULL(59, 36)
> > +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV    BIT_ULL(60)
> > +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV    BIT_ULL(61)
> > +#define RISCV_IOMMU_IOHPMEVT_IDT        BIT_ULL(62)
> > +#define RISCV_IOMMU_IOHPMEVT_OF         BIT_ULL(63)
> > +
> > +/**
> > + * enum RISCV_IOMMU_HPMEVENT_id - Performance-monitoring event identifier
> > + *
> > + * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
> > + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> > + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> > + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> > + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> > + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> > + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> > + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: S/VS-Stage page table walks
> > + * @RISCV_IOMMU_HPMEVENT_G_WALKS: G-Stage page table walks
> > + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> > + */
> > +enum RISCV_IOMMU_HPMEVENT_id {
> > +    RISCV_IOMMU_HPMEVENT_INVALID    = 0,
> > +    RISCV_IOMMU_HPMEVENT_URQ        = 1,
> > +    RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> > +    RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> > +    RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> > +    RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> > +    RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> > +    RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> > +    RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> > +    RISCV_IOMMU_HPMEVENT_MAX        = 9
> > +};
> > +
> > +/* 5.24 Translation request IOVA (64bits) */
> > +#define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
> > +#define RISCV_IOMMU_TR_REQ_IOVA_VPN     GENMASK_ULL(63, 12)
> > +
> > +/* 5.25 Translation request control (64bits) */
> > +#define RISCV_IOMMU_REG_TR_REQ_CTL      0x0260
> > +#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY  BIT_ULL(0)
> > +#define RISCV_IOMMU_TR_REQ_CTL_PRIV     BIT_ULL(1)
> > +#define RISCV_IOMMU_TR_REQ_CTL_EXE      BIT_ULL(2)
> > +#define RISCV_IOMMU_TR_REQ_CTL_NW       BIT_ULL(3)
> > +#define RISCV_IOMMU_TR_REQ_CTL_PID      GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_TR_REQ_CTL_PV       BIT_ULL(32)
> > +#define RISCV_IOMMU_TR_REQ_CTL_DID      GENMASK_ULL(63, 40)
> > +
> > +/* 5.26 Translation request response (64bits) */
> > +#define RISCV_IOMMU_REG_TR_RESPONSE     0x0268
> > +#define RISCV_IOMMU_TR_RESPONSE_FAULT   BIT_ULL(0)
> > +#define RISCV_IOMMU_TR_RESPONSE_PBMT    GENMASK_ULL(8, 7)
> > +#define RISCV_IOMMU_TR_RESPONSE_SZ      BIT_ULL(9)
> > +#define RISCV_IOMMU_TR_RESPONSE_PPN     RISCV_IOMMU_PPN_FIELD
> > +
> > +
> > +/* 5.27 Interrupt cause to vector (64bits) */
> > +#define RISCV_IOMMU_REG_IVEC            0x02F8
> > +#define RISCV_IOMMU_IVEC_CIV            GENMASK_ULL(3, 0)
> > +#define RISCV_IOMMU_IVEC_FIV            GENMASK_ULL(7, 4)
> > +#define RISCV_IOMMU_IVEC_PMIV           GENMASK_ULL(11, 8)
> > +#define RISCV_IOMMU_IVEC_PIV            GENMASK_ULL(15, 12)
> > +
> > +
> > +/* 5.28 MSI Configuration table (32 * 64bits) */
> > +#define RISCV_IOMMU_REG_MSI_CONFIG      0x0300
> > +#define RISCV_IOMMU_REG_MSI_ADDR(_n)    \
> > +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10))
> > +#define RISCV_IOMMU_MSI_ADDR            GENMASK_ULL(55, 2)
> > +#define RISCV_IOMMU_REG_MSI_DATA(_n)    \
> > +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x08)
> > +#define RISCV_IOMMU_MSI_DATA            GENMASK_ULL(31, 0)
> > +#define RISCV_IOMMU_REG_MSI_VEC_CTL(_n) \
> > +    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x0C)
> > +#define RISCV_IOMMU_MSI_VEC_CTL_M      BIT_ULL(0)
> > +
> > +
> > +#define RISCV_IOMMU_REG_SIZE           0x1000
> > +
> > +/*
> > + * Chapter 2: Data structures
> > + */
> > +
> > +/*
> > + * Device Directory Table macros for non-leaf nodes
> > + */
> > +#define RISCV_IOMMU_DDTE_VALID          BIT_ULL(0)
> > +#define RISCV_IOMMU_DDTE_PPN            RISCV_IOMMU_PPN_FIELD
> > +
> > +/**
> > + * struct riscv_iommu_dc - Device Context
> > + * @tc: Translation Control
> > + * @iohgatp: I/O Hypervisor guest address translation and protection
> > + *           (Second stage context)
> > + * @ta: Translation Attributes
> > + * @fsc: First stage context
> > + * @msiptpt: MSI page table pointer
> > + * @msi_addr_mask: MSI address mask
> > + * @msi_addr_pattern: MSI address pattern
> > + *
> > + * This structure is used for leaf nodes on the Device Directory Table,
> > + * in case RISCV_IOMMU_CAP_MSI_FLAT is not set, the bottom 4 fields are
> > + * not present and are skipped with pointer arithmetic to avoid
> > + * casting, check out riscv_iommu_get_dc().
> > + * See section 2.1 for more details
> > + */
> > +struct riscv_iommu_dc {
> > +      uint64_t tc;
> > +      uint64_t iohgatp;
> > +      uint64_t ta;
> > +      uint64_t fsc;
> > +      uint64_t msiptp;
> > +      uint64_t msi_addr_mask;
> > +      uint64_t msi_addr_pattern;
> > +      uint64_t _reserved;
> > +};
> > +
> > +/* Translation control fields */
> > +#define RISCV_IOMMU_DC_TC_V             BIT_ULL(0)
> > +#define RISCV_IOMMU_DC_TC_EN_ATS        BIT_ULL(1)
> > +#define RISCV_IOMMU_DC_TC_EN_PRI        BIT_ULL(2)
> > +#define RISCV_IOMMU_DC_TC_T2GPA         BIT_ULL(3)
> > +#define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
> > +#define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
> > +#define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> > +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> > +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
> > +#define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
> > +#define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
> > +#define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> > +
> > +/* Second-stage (aka G-stage) context fields */
> > +#define RISCV_IOMMU_DC_IOHGATP_PPN      RISCV_IOMMU_ATP_PPN_FIELD
> > +#define RISCV_IOMMU_DC_IOHGATP_GSCID    GENMASK_ULL(59, 44)
> > +#define RISCV_IOMMU_DC_IOHGATP_MODE     RISCV_IOMMU_ATP_MODE_FIELD
> > +
> > +/**
> > + * enum riscv_iommu_dc_iohgatp_modes - Guest address
> > + * translation/protection modes
> > + *
> > + * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> > + *      No translation/protection
> > + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> > + *      Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1
> > + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> > + *      Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0
> > + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> > + *      Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0
> > + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> > + *      Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0
> > + */
> > +enum riscv_iommu_dc_iohgatp_modes {
> > +      RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0,
> > +      RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8,
> > +      RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8,
> > +      RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9,
> > +      RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10
> > +};
> > +
> > +/* Translation attributes fields */
> > +#define RISCV_IOMMU_DC_TA_PSCID         GENMASK_ULL(31, 12)
> > +
> > +/* First-stage context fields */
> > +#define RISCV_IOMMU_DC_FSC_PPN          RISCV_IOMMU_ATP_PPN_FIELD
> > +#define RISCV_IOMMU_DC_FSC_MODE         RISCV_IOMMU_ATP_MODE_FIELD
> > +
> > +/**
> > + * enum riscv_iommu_dc_fsc_atp_modes - First stage address
> > + * translation/protection modes
> > + *
> > + * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection
> > + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1
> > + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0
> > + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0
> > + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0
> > + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids
> > + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids
> > + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids
> > + *
> > + * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise.
> > + * IOSATP controls the first stage address translation (same as the satp
> > + * register on the RISC-V MMU), and PDTP holds the process directory table,
> > + * used to select a first stage page table based on a process id (for devices
> > + * that support multiple process ids).
> > + */
> > +enum riscv_iommu_dc_fsc_atp_modes {
> > +      RISCV_IOMMU_DC_FSC_MODE_BARE = 0,
> > +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8,
> > +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8,
> > +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9,
> > +      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10,
> > +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1,
> > +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2,
> > +      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3
> > +};
> > +
> > +/* MSI page table pointer */
> > +#define RISCV_IOMMU_DC_MSIPTP_PPN       RISCV_IOMMU_ATP_PPN_FIELD
> > +#define RISCV_IOMMU_DC_MSIPTP_MODE      RISCV_IOMMU_ATP_MODE_FIELD
> > +#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF  0
> > +#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1
> > +
> > +/* MSI address mask */
> > +#define RISCV_IOMMU_DC_MSI_ADDR_MASK    GENMASK_ULL(51, 0)
> > +
> > +/* MSI address pattern */
> > +#define RISCV_IOMMU_DC_MSI_PATTERN      GENMASK_ULL(51, 0)
> > +
> > +
> > +/**
> > + * struct riscv_iommu_pc - Process Context
> > + * @ta: Translation Attributes
> > + * @fsc: First stage context
> > + *
> > + * This structure is used for leaf nodes on the Process Directory Table
> > + * See section 2.3 for more details
> > + */
> > +struct riscv_iommu_pc {
> > +      uint64_t ta;
> > +      uint64_t fsc;
> > +};
> > +
> > +/* Translation attributes fields */
> > +#define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> > +#define RISCV_IOMMU_PC_TA_ENS           BIT_ULL(1)
> > +#define RISCV_IOMMU_PC_TA_SUM           BIT_ULL(2)
> > +#define RISCV_IOMMU_PC_TA_PSCID         GENMASK_ULL(31, 12)
> > +
> > +/* First stage context fields */
> > +#define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> > +#define RISCV_IOMMU_PC_FSC_MODE         GENMASK_ULL(63, 60)
> > +
> > +
> > +/*
> > + * Chapter 3: In-memory queue interface
> > + */
> > +
> > +/**
> > + * struct riscv_iommu_cmd - Generic I/O MMU command structure
> > + * @dword0: Includes the opcode and the function identifier
> > + * @dword1: Opcode specific data
> > + *
> > + * The commands are interpreted as two 64bit fields, where the first
> > + * 7bits of the first field are the opcode which also defines the
> > + * command's format, followed by a 3bit field that specifies the
> > + * function invoked by that command, and the rest is opcode-specific.
> > + * This is a generic struct which will be populated differently
> > + * according to each command. For more infos on the commands and
> > + * the command queue check section 3.1.
> > + */
> > +struct riscv_iommu_command {
> > +      uint64_t dword0;
> > +      uint64_t dword1;
> > +};
> > +
> > +/* Fields on dword0, common for all commands */
> > +#define RISCV_IOMMU_CMD_OPCODE          GENMASK_ULL(6, 0)
> > +#define RISCV_IOMMU_CMD_FUNC            GENMASK_ULL(9, 7)
> > +
> > +/* 3.1.1 I/O MMU Page-table cache invalidation */
> > +/* Fields on dword0 */
> > +#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE         1
> > +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA       0
> > +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA      1
> > +#define RISCV_IOMMU_CMD_IOTINVAL_AV     BIT_ULL(10)
> > +#define RISCV_IOMMU_CMD_IOTINVAL_PSCID  GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_CMD_IOTINVAL_PSCV   BIT_ULL(32)
> > +#define RISCV_IOMMU_CMD_IOTINVAL_GV     BIT_ULL(33)
> > +#define RISCV_IOMMU_CMD_IOTINVAL_GSCID  GENMASK_ULL(59, 44)
> > +/* dword1 is the address, 4K-alligned and shifted to the right by two bits. */
> > +
> > +/* 3.1.2 I/O MMU Command Queue Fences */
> > +/* Fields on dword0 */
> > +#define RISCV_IOMMU_CMD_IOFENCE_OPCODE          2
> > +#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C          0
> > +#define RISCV_IOMMU_CMD_IOFENCE_AV      BIT_ULL(10)
> > +#define RISCV_IOMMU_CMD_IOFENCE_WSI     BIT_ULL(11)
> > +#define RISCV_IOMMU_CMD_IOFENCE_PR      BIT_ULL(12)
> > +#define RISCV_IOMMU_CMD_IOFENCE_PW      BIT_ULL(13)
> > +#define RISCV_IOMMU_CMD_IOFENCE_DATA    GENMASK_ULL(63, 32)
> > +/* dword1 is the address, word-size alligned and shifted to the right by two bits. */
> > +
> > +/* 3.1.3 I/O MMU Directory cache invalidation */
> > +/* Fields on dword0 */
> > +#define RISCV_IOMMU_CMD_IODIR_OPCODE            3
> > +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT    0
> > +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT    1
> > +#define RISCV_IOMMU_CMD_IODIR_PID       GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_CMD_IODIR_DV        BIT_ULL(33)
> > +#define RISCV_IOMMU_CMD_IODIR_DID       GENMASK_ULL(63, 40)
> > +/* dword1 is reserved for standard use */
> > +
> > +/* 3.1.4 I/O MMU PCIe ATS */
> > +/* Fields on dword0 */
> > +#define RISCV_IOMMU_CMD_ATS_OPCODE              4
> > +#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL          0
> > +#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR           1
> > +#define RISCV_IOMMU_CMD_ATS_PID         GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_CMD_ATS_PV          BIT_ULL(32)
> > +#define RISCV_IOMMU_CMD_ATS_DSV         BIT_ULL(33)
> > +#define RISCV_IOMMU_CMD_ATS_RID         GENMASK_ULL(55, 40)
> > +#define RISCV_IOMMU_CMD_ATS_DSEG        GENMASK_ULL(63, 56)
> > +/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */
> > +
> > +/* ATS.INVAL payload*/
> > +#define RISCV_IOMMU_CMD_ATS_INVAL_G     BIT_ULL(0)
> > +/* Bits 1 - 10 are zeroed */
> > +#define RISCV_IOMMU_CMD_ATS_INVAL_S     BIT_ULL(11)
> > +#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12)
> > +
> > +/* ATS.PRGR payload */
> > +/* Bits 0 - 31 are zeroed */
> > +#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX      GENMASK_ULL(40, 32)
> > +/* Bits 41 - 43 are zeroed */
> > +#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE      GENMASK_ULL(47, 44)
> > +#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID         GENMASK_ULL(63, 48)
> > +
> > +
> > +/**
> > + * struct riscv_iommu_fq_record - Fault/Event Queue Record
> > + * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc
> > + * @_reserved: Low 32bits for custom use, high 32bits for standard use
> > + * @iotval: Transaction-type/cause specific format
> > + * @iotval2: Cause specific format
> > + *
> > + * The fault/event queue reports events and failures raised when
> > + * processing transactions. Each record is a 32byte structure where
> > + * the first dword has a fixed format for providing generic infos
> > + * regarding the fault/event, and two more dwords are there for
> > + * fault/event-specific information. For more details see section
> > + * 3.2.
> > + */
> > +struct riscv_iommu_fq_record {
> > +      uint64_t hdr;
> > +      uint64_t _reserved;
> > +      uint64_t iotval;
> > +      uint64_t iotval2;
> > +};
> > +
> > +/* Fields on header */
> > +#define RISCV_IOMMU_FQ_HDR_CAUSE        GENMASK_ULL(11, 0)
> > +#define RISCV_IOMMU_FQ_HDR_PID          GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_FQ_HDR_PV           BIT_ULL(32)
> > +#define RISCV_IOMMU_FQ_HDR_PRIV         BIT_ULL(33)
> > +#define RISCV_IOMMU_FQ_HDR_TTYPE        GENMASK_ULL(39, 34)
> > +#define RISCV_IOMMU_FQ_HDR_DID          GENMASK_ULL(63, 40)
> > +
> > +/**
> > + * enum riscv_iommu_fq_causes - Fault/event cause values
> > + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned
> > + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault
> > + * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned
> > + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault
> > + * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed
> > + * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid
> > + * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured
> > + * @RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED: Transaction type disallowed
> > + * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid
> > + * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured
> > + * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid
> > + * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured
> > + * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption
> > + * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption
> > + * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption
> > + * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption
> > + * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error
> > + * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault
> > + * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption
> > + *
> > + * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for
> > + * standard use, and 2048 - 4095 for custom use.
> > + */
> > +enum riscv_iommu_fq_causes {
> > +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT           = 1,
> > +      RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED   = 4,
> > +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT             = 5,
> > +      RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED   = 6,
> > +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT             = 7,
> > +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S         = 12,
> > +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S           = 13,
> > +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S           = 15,
> > +      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS        = 20,
> > +      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS          = 21,
> > +      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS          = 23,
> > +      RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED         = 256,
> > +      RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT       = 257,
> > +      RISCV_IOMMU_FQ_CAUSE_DDT_INVALID          = 258,
> > +      RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED    = 259,
> > +      RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED        = 260,
> > +      RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT       = 261,
> > +      RISCV_IOMMU_FQ_CAUSE_MSI_INVALID          = 262,
> > +      RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED    = 263,
> > +      RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT           = 264,
> > +      RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT       = 265,
> > +      RISCV_IOMMU_FQ_CAUSE_PDT_INVALID          = 266,
> > +      RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED    = 267,
> > +      RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED        = 268,
> > +      RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED        = 269,
> > +      RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED     = 270,
> > +      RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED      = 271,
> > +      RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR    = 272,
> > +      RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT         = 273,
> > +      RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED         = 274
> > +};
> > +
> > +/**
> > + * enum riscv_iommu_fq_ttypes: Fault/event transaction types
> > + * @RISCV_IOMMU_FQ_TTYPE_NONE: None. Fault not caused by an inbound transaction.
> > + * @RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH: Instruction fetch from untranslated address
> > + * @RISCV_IOMMU_FQ_TTYPE_UADDR_RD: Read from untranslated address
> > + * @RISCV_IOMMU_FQ_TTYPE_UADDR_WR: Write/AMO to untranslated address
> > + * @RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH: Instruction fetch from translated address
> > + * @RISCV_IOMMU_FQ_TTYPE_TADDR_RD: Read from translated address
> > + * @RISCV_IOMMU_FQ_TTYPE_TADDR_WR: Write/AMO to translated address
> > + * @RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ: PCIe ATS translation request
> > + * @RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ: PCIe message request
> > + *
> > + * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for
> > + * standard use and 31 - 63 for custom use.
> > + */
> > +enum riscv_iommu_fq_ttypes {
> > +      RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> > +      RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH = 1,
> > +      RISCV_IOMMU_FQ_TTYPE_UADDR_RD = 2,
> > +      RISCV_IOMMU_FQ_TTYPE_UADDR_WR = 3,
> > +      RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH = 5,
> > +      RISCV_IOMMU_FQ_TTYPE_TADDR_RD = 6,
> > +      RISCV_IOMMU_FQ_TTYPE_TADDR_WR = 7,
> > +      RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ = 8,
> > +      RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ = 9,
> > +};
> > +
> > +
> > +/**
> > + * struct riscv_iommu_pq_record - PCIe Page Request record
> > + * @hdr: Header, includes PID, DID etc
> > + * @payload: Holds the page address, request group and permission bits
> > + *
> > + * For more infos on the PCIe Page Request queue see chapter 3.3.
> > + */
> > +struct riscv_iommu_pq_record {
> > +      uint64_t hdr;
> > +      uint64_t payload;
> > +};
> > +
> > +/* Header fields */
> > +#define RISCV_IOMMU_PREQ_HDR_PID        GENMASK_ULL(31, 12)
> > +#define RISCV_IOMMU_PREQ_HDR_PV         BIT_ULL(32)
> > +#define RISCV_IOMMU_PREQ_HDR_PRIV       BIT_ULL(33)
> > +#define RISCV_IOMMU_PREQ_HDR_EXEC       BIT_ULL(34)
> > +#define RISCV_IOMMU_PREQ_HDR_DID        GENMASK_ULL(63, 40)
> > +
> > +/* Payload fields */
> > +#define RISCV_IOMMU_PREQ_PAYLOAD_R      BIT_ULL(0)
> > +#define RISCV_IOMMU_PREQ_PAYLOAD_W      BIT_ULL(1)
> > +#define RISCV_IOMMU_PREQ_PAYLOAD_L      BIT_ULL(2)
> > +#define RISCV_IOMMU_PREQ_PAYLOAD_M      GENMASK_ULL(2, 0)
> > +#define RISCV_IOMMU_PREQ_PRG_INDEX      GENMASK_ULL(11, 3)
> > +#define RISCV_IOMMU_PREQ_UADDR          GENMASK_ULL(63, 12)
> > +
> > +
> > +/**
> > + * struct riscv_iommu_msi_pte - MSI Page Table Entry
> > + * @pte: MSI PTE
> > + * @mrif_info: Memory-resident interrupt file info
> > + *
> > + * The MSI Page Table is used for virtualizing MSIs, so that when
> > + * a device sends an MSI to a guest, the IOMMU can reroute it
> > + * by translating the MSI address, either to a guest interrupt file
> > + * or a memory resident interrupt file (MRIF). Note that this page table
> > + * is an array of MSI PTEs, not a multi-level pt, each entry
> > + * is a leaf entry. For more infos check out the the AIA spec, chapter 9.5.
> > + *
> > + * Also in basic mode the mrif_info field is ignored by the IOMMU and can
> > + * be used by software, any other reserved fields on pte must be zeroed-out
> > + * by software.
> > + */
> > +struct riscv_iommu_msi_pte {
> > +      uint64_t pte;
> > +      uint64_t mrif_info;
> > +};
> > +
> > +/* Fields on pte */
> > +#define RISCV_IOMMU_MSI_PTE_V           BIT_ULL(0)
> > +#define RISCV_IOMMU_MSI_PTE_M           GENMASK_ULL(2, 1)
> > +
> > +#define RISCV_IOMMU_MSI_PTE_M_MRIF      1
> > +#define RISCV_IOMMU_MSI_PTE_M_BASIC     3
> > +
> > +/* When M == 1 (MRIF mode) */
> > +#define RISCV_IOMMU_MSI_PTE_MRIF_ADDR   GENMASK_ULL(53, 7)
> > +/* When M == 3 (basic mode) */
> > +#define RISCV_IOMMU_MSI_PTE_PPN         RISCV_IOMMU_PPN_FIELD
> > +#define RISCV_IOMMU_MSI_PTE_C           BIT_ULL(63)
> > +
> > +/* Fields on mrif_info */
> > +#define RISCV_IOMMU_MSI_MRIF_NID        GENMASK_ULL(9, 0)
> > +#define RISCV_IOMMU_MSI_MRIF_NPPN       RISCV_IOMMU_PPN_FIELD
> > +#define RISCV_IOMMU_MSI_MRIF_NID_MSB    BIT_ULL(60)
> > +
> > +
> > +#endif /* _RISCV_IOMMU_BITS_H_ */
> > diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c
> > new file mode 100644
> > index 0000000000..e205f806d6
> > --- /dev/null
> > +++ b/hw/riscv/riscv-iommu-pci.c
> > @@ -0,0 +1,181 @@
> > +/*
> > + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> > + *
> > + * Copyright (C) 2022-2023 Rivos Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "hw/pci/msi.h"
> > +#include "hw/pci/msix.h"
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/qdev-properties.h"
> > +#include "hw/riscv/riscv_hart.h"
> > +#include "migration/vmstate.h"
> > +#include "qapi/error.h"
> > +#include "qemu/error-report.h"
> > +#include "qemu/host-utils.h"
> > +#include "qom/object.h"
> > +
> > +#include "cpu_bits.h"
> > +#include "riscv-iommu.h"
> > +#include "riscv-iommu-bits.h"
> > +
> > +#ifndef PCI_VENDOR_ID_RIVOS
> > +#define PCI_VENDOR_ID_RIVOS           0x1efd
> > +#endif
> > +
> > +#ifndef PCI_DEVICE_ID_RIVOS_IOMMU
> > +#define PCI_DEVICE_ID_RIVOS_IOMMU     0xedf1
> > +#endif
>
> The file is the RISC-V IOMMU, but don't these IDs say Rivos IOMMU?
>

This can be changed if there is a more appropriate poll of VIDs for QEMU models.
As we had to pick an VID/DID, the simples was to donate one from Rivos VID pool.

> > +
> > +/* RISC-V IOMMU PCI Device Emulation */
> > +
> > +typedef struct RISCVIOMMUStatePci {
> > +    PCIDevice        pci;     /* Parent PCIe device state */
> > +    MemoryRegion     bar0;    /* PCI BAR (including MSI-x config) */
> > +    RISCVIOMMUState  iommu;   /* common IOMMU state */
> > +} RISCVIOMMUStatePci;
> > +
> > +/* interrupt delivery callback */
> > +static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector)
> > +{
> > +    RISCVIOMMUStatePci *s = container_of(iommu, RISCVIOMMUStatePci, iommu);
> > +
> > +    if (msix_enabled(&(s->pci))) {
> > +        msix_notify(&(s->pci), vector);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_pci_realize(PCIDevice *dev, Error **errp)
> > +{
> > +    RISCVIOMMUStatePci *s = DO_UPCAST(RISCVIOMMUStatePci, pci, dev);
> > +    RISCVIOMMUState *iommu = &s->iommu;
> > +    uint64_t cap = iommu->cap;
> > +    Error *err = NULL;
> > +
> > +    /* Set device id for trace / debug */
> > +    DEVICE(iommu)->id = g_strdup_printf("%02x:%02x.%01x",
> > +        pci_dev_bus_num(dev), PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
> > +
> > +    /* Support MSI only */
> > +    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_MSI);
> > +    qdev_prop_set_uint64(DEVICE(dev), "capabilities", cap);
> > +
> > +    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
> > +        return;
> > +    }
> > +
> > +    memory_region_init(&s->bar0, OBJECT(s), "riscv-iommu-bar0",
> > +        QEMU_ALIGN_UP(memory_region_size(&iommu->regs_mr), TARGET_PAGE_SIZE));
> > +    memory_region_add_subregion(&s->bar0, 0, &iommu->regs_mr);
> > +
> > +    pcie_endpoint_cap_init(dev, 0);
> > +
> > +    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
> > +                     PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0);
> > +
> > +    int ret = msix_init(dev, RISCV_IOMMU_INTR_COUNT,
> > +                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG,
> > +                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG + 256, 0, &err);
> > +
> > +    if (ret == -ENOTSUP) {
> > +        /*
> > +         * MSI-x is not supported by the platform.
> > +         * Driver should use timer/polling based notification handlers.
> > +         */
> > +        warn_report_err(err);
> > +    } else if (ret < 0) {
> > +        error_propagate(errp, err);
> > +        return;
> > +    } else {
> > +        /* mark all allocated MSIx vectors as used. */
> > +        msix_vector_use(dev, RISCV_IOMMU_INTR_CQ);
> > +        msix_vector_use(dev, RISCV_IOMMU_INTR_FQ);
> > +        msix_vector_use(dev, RISCV_IOMMU_INTR_PM);
> > +        msix_vector_use(dev, RISCV_IOMMU_INTR_PQ);
> > +        iommu->notify = riscv_iommu_pci_notify;
> > +    }
> > +
> > +    PCIBus *bus = pci_device_root_bus(dev);
> > +    if (!bus) {
> > +        error_setg(errp, "can't find PCIe root port for %02x:%02x.%x",
> > +            pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn),
> > +            PCI_FUNC(dev->devfn));
> > +        return;
> > +    }
> > +
> > +    riscv_iommu_pci_setup_iommu(iommu, bus, errp);
> > +}
> > +
> > +static void riscv_iommu_pci_exit(PCIDevice *pci_dev)
> > +{
> > +    pci_setup_iommu(pci_device_root_bus(pci_dev), NULL, NULL);
> > +}
> > +
> > +static const VMStateDescription riscv_iommu_vmstate = {
> > +    .name = "riscv-iommu",
> > +    .unmigratable = 1
> > +};
> > +
> > +static void riscv_iommu_pci_init(Object *obj)
> > +{
> > +    RISCVIOMMUStatePci *s = RISCV_IOMMU_PCI(obj);
> > +    RISCVIOMMUState *iommu = &s->iommu;
> > +
> > +    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
> > +    qdev_alias_all_properties(DEVICE(iommu), obj);
> > +}
> > +
> > +static Property riscv_iommu_pci_properties[] = {
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void riscv_iommu_pci_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> > +
> > +    k->realize = riscv_iommu_pci_realize;
> > +    k->exit = riscv_iommu_pci_exit;
> > +    k->vendor_id = PCI_VENDOR_ID_RIVOS;
> > +    k->device_id = PCI_DEVICE_ID_RIVOS_IOMMU;
> > +    k->revision = 0;
> > +    k->class_id = 0x0806;
> > +    dc->desc = "RISCV-IOMMU DMA Remapping device";
> > +    dc->vmsd = &riscv_iommu_vmstate;
> > +    dc->hotpluggable = false;
> > +    dc->user_creatable = true;
> > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > +    device_class_set_props(dc, riscv_iommu_pci_properties);
> > +}
> > +
> > +static const TypeInfo riscv_iommu_pci = {
> > +    .name = TYPE_RISCV_IOMMU_PCI,
> > +    .parent = TYPE_PCI_DEVICE,
> > +    .class_init = riscv_iommu_pci_class_init,
> > +    .instance_init = riscv_iommu_pci_init,
> > +    .instance_size = sizeof(RISCVIOMMUStatePci),
> > +    .interfaces = (InterfaceInfo[]) {
> > +        { INTERFACE_PCIE_DEVICE },
> > +        { },
> > +    },
> > +};
> > +
> > +static void riscv_iommu_register_pci_types(void)
> > +{
> > +    type_register_static(&riscv_iommu_pci);
> > +}
>
> The PCIe device should be a seperate patch
>

ack.


> > +
> > +type_init(riscv_iommu_register_pci_types);
> > diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c
> > new file mode 100644
> > index 0000000000..7148588b59
> > --- /dev/null
> > +++ b/hw/riscv/riscv-iommu-sys.c
> > @@ -0,0 +1,123 @@
> > +/*
> > + * QEMU emulation of an RISC-V IOMMU (Ziommu) - Platform Device
> > + *
> > + * Copyright (C) 2022-2023 Rivos Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/irq.h"
> > +#include "hw/qdev-properties.h"
> > +#include "hw/sysbus.h"
> > +#include "qapi/error.h"
> > +#include "qapi/error.h"
> > +#include "qemu/error-report.h"
> > +#include "qemu/host-utils.h"
> > +#include "qemu/module.h"
> > +#include "qemu/osdep.h"
> > +#include "qom/object.h"
> > +
> > +#include "cpu_bits.h"
> > +#include "riscv-iommu.h"
> > +#include "riscv-iommu-bits.h"
> > +
> > +/* RISC-V IOMMU System Platform Device Emulation */
> > +
> > +struct RISCVIOMMUStateSys {
> > +    SysBusDevice     parent;
> > +    uint64_t         addr;
> > +    qemu_irq         irqs[4];
> > +    RISCVIOMMUState  iommu;
> > +};
> > +
> > +/* interrupt delivery callback */
> > +static void riscv_iommu_sys_notify(RISCVIOMMUState *iommu, unsigned vector)
> > +{
> > +    RISCVIOMMUStateSys *s = container_of(iommu, RISCVIOMMUStateSys, iommu);
> > +
> > +    if (vector < RISCV_IOMMU_INTR_COUNT && s->irqs[vector]) {
> > +        qemu_irq_pulse(s->irqs[vector]);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_sys_realize(DeviceState *dev, Error **errp)
> > +{
> > +    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(dev);
> > +    RISCVIOMMUState *iommu = &s->iommu;
> > +    PCIBus *pci_bus;
> > +    uint64_t cap = iommu->cap;
> > +    int i;
> > +
> > +    /* Support WSI only */
> > +    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_WSI);
> > +    qdev_prop_set_uint64(dev, "capabilities", cap);
> > +
> > +    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
> > +        return;
> > +    }
> > +
> > +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &iommu->regs_mr);
> > +    if (s->addr) {
> > +        sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, s->addr);
> > +    }
> > +
> > +    for (i = 0; i < RISCV_IOMMU_INTR_COUNT; i++) {
> > +        sysbus_init_irq(&s->parent, &s->irqs[i]);
> > +    }
> > +
> > +    iommu->notify = riscv_iommu_sys_notify;
> > +
> > +    pci_bus = (PCIBus *) object_resolve_path_type("", TYPE_PCI_BUS, NULL);
> > +    if (pci_bus) {
> > +        riscv_iommu_pci_setup_iommu(iommu, pci_bus, errp);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_sys_init(Object *obj)
> > +{
> > +    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(obj);
> > +    RISCVIOMMUState *iommu = &s->iommu;
> > +
> > +    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
> > +    qdev_alias_all_properties(DEVICE(iommu), obj);
> > +}
> > +
> > +static Property riscv_iommu_sys_properties[] = {
> > +    DEFINE_PROP_UINT64("addr", RISCVIOMMUStateSys, addr, 0),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void riscv_iommu_sys_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +    dc->realize = riscv_iommu_sys_realize;
> > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > +    device_class_set_props(dc, riscv_iommu_sys_properties);
> > +}
> > +
> > +static const TypeInfo riscv_iommu_sys = {
> > +    .name          = TYPE_RISCV_IOMMU_SYS,
> > +    .parent        = TYPE_SYS_BUS_DEVICE,
> > +    .class_init    = riscv_iommu_sys_class_init,
> > +    .instance_init = riscv_iommu_sys_init,
> > +    .instance_size = sizeof(RISCVIOMMUStateSys),
> > +};
> > +
> > +static void riscv_iommu_register_sys(void)
> > +{
> > +    type_register_static(&riscv_iommu_sys);
> > +}
>
> Same here
>

ack

> > +
> > +type_init(riscv_iommu_register_sys)
> > diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> > new file mode 100644
> > index 0000000000..fd271b2988
> > --- /dev/null
> > +++ b/hw/riscv/riscv-iommu.c
> > @@ -0,0 +1,2539 @@
> > +/*
> > + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> > + *
> > + * Copyright (C) 2021-2023, Rivos Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qom/object.h"
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/pci/pci_device.h"
> > +#include "hw/qdev-properties.h"
> > +#include "hw/riscv/riscv_hart.h"
> > +#include "migration/vmstate.h"
> > +#include "qapi/error.h"
> > +#include "qemu/timer.h"
> > +
> > +#include "cpu_bits.h"
> > +#include "riscv-iommu.h"
> > +#include "riscv-iommu-bits.h"
> > +#include "trace.h"
> > +
> > +#define LIMIT_CACHE_CTX               (1U << 7)
> > +#define LIMIT_CACHE_IOT               (1U << 20)
> > +
> > +/* Physical page number coversions */
> > +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> > +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> > +
> > +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> > +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> > +
> > +/* Device assigned I/O address space */
> > +struct RISCVIOMMUSpace {
> > +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> > +    AddressSpace iova_as;       /* IOVA address space for attached device */
> > +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> > +    uint32_t devid;             /* Requester identifier, AKA device_id */
> > +    bool notifier;              /* IOMMU unmap notifier enabled */
> > +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> > +};
> > +
> > +/* Device translation context state. */
> > +struct RISCVIOMMUContext {
> > +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> > +    uint64_t pasid:20;          /* Process Address Space ID */
> > +    uint64_t __rfu:20;          /* reserved */
> > +    uint64_t tc;                /* Translation Control */
> > +    uint64_t ta;                /* Translation Attributes */
> > +    uint64_t satp;              /* S-Stage address translation and protection */
> > +    uint64_t gatp;              /* G-Stage address translation and protection */
> > +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> > +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> > +    uint64_t msiptp;            /* MSI redirection page table pointer */
> > +};
> > +
> > +/* Address translation cache entry */
> > +struct RISCVIOMMUEntry {
> > +    uint64_t iova:44;           /* IOVA Page Number */
> > +    uint64_t pscid:20;          /* Process Soft-Context identifier */
> > +    uint64_t phys:44;           /* Physical Page Number */
> > +    uint64_t gscid:16;          /* Guest Soft-Context identifier */
> > +    uint64_t perm:2;            /* IOMMU_RW flags */
> > +    uint64_t __rfu:2;
> > +};
> > +
> > +/* IOMMU index for transactions without PASID specified. */
> > +#define RISCV_IOMMU_NOPASID 0
> > +
> > +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> > +{
> > +    const uint32_t ipsr =
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> > +    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
> > +    if (s->notify && !(ipsr & (1 << vec))) {
> > +        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_fault(RISCVIOMMUState *s, struct riscv_iommu_fq_record *ev)
> > +{
> > +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> > +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> > +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> > +    uint32_t next = (tail + 1) & s->fq_mask;
> > +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> > +
> > +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> > +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> > +
> > +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> > +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> > +        return;
> > +    }
> > +
> > +    if (head == next) {
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQOF, 0);
> > +    } else {
> > +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> > +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> > +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQMF, 0);
> > +        } else {
> > +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> > +        }
> > +    }
> > +
> > +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> > +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_pri(RISCVIOMMUState *s,
> > +    struct riscv_iommu_pq_record *pr)
> > +{
> > +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> > +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> > +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> > +    uint32_t next = (tail + 1) & s->pq_mask;
> > +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> > +
> > +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> > +                          PCI_FUNC(devid), pr->payload);
> > +
> > +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> > +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> > +        return;
> > +    }
> > +
> > +    if (head == next) {
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQOF, 0);
> > +    } else {
> > +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> > +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> > +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQMF, 0);
> > +        } else {
> > +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> > +        }
> > +    }
> > +
> > +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> > +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> > +    }
> > +}
> > +
> > +static void __hpm_incr_ctr(RISCVIOMMUState *s, uint32_t ctr_idx)
> > +{
> > +    const uint32_t off = ctr_idx << 3;
> > +    uint64_t cntr_val;
> > +
> > +    qemu_spin_lock(&s->regs_lock);
> > +    cntr_val = ldq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off]);
> > +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off], cntr_val + 1);
> > +    qemu_spin_unlock(&s->regs_lock);
> > +
> > +    /* Handle the overflow scenario. */
> > +    if (cntr_val == UINT64_MAX) {
> > +        /*
> > +         * Generate interrupt only if OF bit is clear. +1 to offset the cycle
> > +         * register OF bit.
> > +         */
> > +        const uint32_t ovf =
> > +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, BIT(ctr_idx + 1), 0);
> > +        if (!get_field(ovf, BIT(ctr_idx + 1))) {
> > +            riscv_iommu_reg_mod64(s,
> > +                                  RISCV_IOMMU_REG_IOHPMEVT_BASE + off,
> > +                                  RISCV_IOMMU_IOHPMEVT_OF,
> > +                                  0);
> > +            riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
> > +        }
> > +    }
> > +}
> > +
> > +static void riscv_iommu_hpm_incr_ctr(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> > +    unsigned event_id)
> > +{
> > +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> > +    uint32_t did_gscid;
> > +    uint32_t pid_pscid;
> > +    uint32_t ctr_idx;
> > +    gpointer value;
> > +    uint32_t ctrs;
> > +    uint64_t evt;
> > +
> > +    if (!(s->cap & RISCV_IOMMU_CAP_HPM)) {
> > +        return;
> > +    }
> > +
> > +    pthread_rwlock_rdlock(&s->ht_lock);
> > +    value = g_hash_table_lookup(s->hpm_event_ctr_map,
> > +                                GUINT_TO_POINTER(event_id));
> > +    if (value == NULL) {
> > +        pthread_rwlock_unlock(&s->ht_lock);
> > +        return;
> > +    }
> > +
> > +    for (ctrs = GPOINTER_TO_UINT(value); ctrs != 0; ctrs &= ctrs - 1) {
> > +        ctr_idx = ctz32(ctrs);
> > +        if (get_field(inhibit, BIT(ctr_idx + 1))) {
> > +            continue;
> > +        }
> > +
> > +        evt = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMEVT_BASE + (ctr_idx << 3));
> > +
> > +        /*
> > +         * It's quite possible that event ID has been changed in counter
> > +         * but hashtable hasn't been updated yet. We don't want to increment
> > +         * counter for the old event ID.
> > +         */
> > +        if (event_id != get_field(evt, RISCV_IOMMU_IOHPMEVT_EVENT_ID)) {
> > +            continue;
> > +        }
> > +
> > +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_IDT)) {
> > +            did_gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
> > +            pid_pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
> > +        } else {
> > +            did_gscid = ctx->devid;
> > +            pid_pscid = ctx->pasid;
> > +        }
> > +
> > +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_PV_PSCV)) {
> > +            /*
> > +             * If the transaction does not have a valid process_id, counter
> > +             * increments if device_id matches DID_GSCID. If the transaction has
> > +             * a valid process_id, counter increments if device_id matches
> > +             * DID_GSCID and process_id matches PID_PSCID. See IOMMU
> > +             * Specification, Chapter 5.23. Performance-monitoring event
> > +             * selector.
> > +             */
> > +            if (ctx->pasid &&
> > +                get_field(evt, RISCV_IOMMU_IOHPMEVT_PID_PSCID) != pid_pscid) {
> > +                continue;
> > +            }
> > +        }
> > +
> > +        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DV_GSCV)) {
> > +            uint32_t mask = ~0;
> > +
> > +            if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DMASK)) {
> > +                /*
> > +                 * 1001 1011   mask = GSCID
> > +                 * 0000 0111   mask = mask ^ (mask + 1)
> > +                 * 1111 1000   mask = ~mask;
> > +                 */
> > +                mask = get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> > +                mask = mask ^ (mask + 1);
> > +                mask = ~mask;
> > +            }
> > +
> > +            if ((get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID) & mask) !=
> > +                (did_gscid & mask)) {
> > +                continue;
> > +            }
> > +        }
> > +
> > +        __hpm_incr_ctr(s, ctr_idx);
> > +    }
> > +
> > +    pthread_rwlock_unlock(&s->ht_lock);
> > +}
> > +
> > +/* Portable implementation of pext_u64, bit-mask extraction. */
> > +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> > +{
> > +    uint64_t ret = 0;
> > +    uint64_t rot = 1;
> > +
> > +    while (ext) {
> > +        if (ext & 1) {
> > +            if (val & 1) {
> > +                ret |= rot;
> > +            }
> > +            rot <<= 1;
> > +        }
> > +        val >>= 1;
> > +        ext >>= 1;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +/* Check if GPA matches MSI/MRIF pattern. */
> > +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> > +    dma_addr_t gpa)
> > +{
> > +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> > +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> > +        return false; /* Invalid MSI/MRIF mode */
> > +    }
> > +
> > +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> > +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> > +    }
> > +
> > +    return true;
> > +}
> > +
> > +/*
> > + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> > + *
> > + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> > + * Both implementation can be merged into single helper function in future.
> > + * Keeping them separate for now, as error reporting and flow specifics are
> > + * sufficiently different for separate implementation.
> > + *
> > + * @s        : IOMMU Device State
> > + * @ctx      : Translation context for device id and process address space id.
> > + * @iotlb    : translation data: physical address and access mode.
> > + * @gpa      : provided IOVA is a guest physical address, use G-Stage only.
> > + * @return   : success or fault cause code.
> > + */
> > +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> > +    IOMMUTLBEntry *iotlb, bool gpa)
> > +{
> > +    dma_addr_t addr, base;
> > +    uint64_t satp, gatp, pte;
> > +    bool en_s, en_g;
> > +    struct {
> > +        unsigned char step;
> > +        unsigned char levels;
> > +        unsigned char ptidxbits;
> > +        unsigned char ptesize;
> > +    } sc[2];
> > +    /* Translation stage phase */
> > +    enum {
> > +        S_STAGE = 0,
> > +        G_STAGE = 1,
> > +    } pass;
> > +
> > +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> > +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> > +
> > +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
> > +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> > +
> > +    /* Early check for MSI address match when IOVA == GPA */
> > +    if (!en_s && (iotlb->perm & IOMMU_WO) &&
> > +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> > +        iotlb->target_as = &s->trap_as;
> > +        iotlb->translated_addr = iotlb->iova;
> > +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> > +        return 0;
> > +    }
> > +
> > +    /* Exit early for pass-through mode. */
> > +    if (!(en_s || en_g)) {
> > +        iotlb->translated_addr = iotlb->iova;
> > +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> > +        /* Allow R/W in pass-through mode */
> > +        iotlb->perm = IOMMU_RW;
> > +        return 0;
> > +    }
> > +
> > +    /* S/G translation parameters. */
> > +    for (pass = 0; pass < 2; pass++) {
> > +        sc[pass].step = 0;
> > +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> > +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> > +            /* 32bit mode for GXL/SXL == 1 */
> > +            switch (pass ? gatp : satp) {
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> > +                sc[pass].levels    = 0;
> > +                sc[pass].ptidxbits = 0;
> > +                sc[pass].ptesize   = 0;
> > +                break;
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> > +                if (!(s->cap &
> > +                    (pass ? RISCV_IOMMU_CAP_G_SV32 : RISCV_IOMMU_CAP_S_SV32))) {
> > +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +                }
> > +                sc[pass].levels    = 2;
> > +                sc[pass].ptidxbits = 10;
> > +                sc[pass].ptesize   = 4;
> > +                break;
> > +            default:
> > +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +            }
> > +        } else {
> > +            /* 64bit mode for GXL/SXL == 0 */
> > +            switch (pass ? gatp : satp) {
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> > +                sc[pass].levels    = 0;
> > +                sc[pass].ptidxbits = 0;
> > +                sc[pass].ptesize   = 0;
> > +                break;
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> > +                if (!(s->cap &
> > +                    (pass ? RISCV_IOMMU_CAP_G_SV39 : RISCV_IOMMU_CAP_S_SV39))) {
> > +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +                }
> > +                sc[pass].levels    = 3;
> > +                sc[pass].ptidxbits = 9;
> > +                sc[pass].ptesize   = 8;
> > +                break;
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> > +                if (!(s->cap &
> > +                    (pass ? RISCV_IOMMU_CAP_G_SV48 : RISCV_IOMMU_CAP_S_SV48))) {
> > +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +                }
> > +                sc[pass].levels    = 4;
> > +                sc[pass].ptidxbits = 9;
> > +                sc[pass].ptesize   = 8;
> > +                break;
> > +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> > +                if (!(s->cap &
> > +                    (pass ? RISCV_IOMMU_CAP_G_SV57 : RISCV_IOMMU_CAP_S_SV57))) {
> > +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +                }
> > +                sc[pass].levels    = 5;
> > +                sc[pass].ptidxbits = 9;
> > +                sc[pass].ptesize   = 8;
> > +                break;
> > +            default:
> > +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +            }
> > +        }
> > +    };
> > +
> > +    /* S/G stages translation tables root pointers */
> > +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> > +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> > +    addr = (en_s && en_g) ? satp : iotlb->iova;
> > +    base = en_g ? gatp : satp;
> > +    pass = en_g ? G_STAGE : S_STAGE;
> > +
> > +    do {
> > +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> > +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> > +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> > +                                 (sc[pass].levels - 1 - sc[pass].step);
> > +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> > +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> > +        const bool ade =
> > +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> > +
> > +        /* Address range check before first level lookup */
> > +        if (!sc[pass].step) {
> > +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> > +            if ((addr & va_mask) != addr) {
> > +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> > +            }
> > +        }
> > +
> > +        /* Read page table entry */
> > +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> > +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> > +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> > +        }
> > +
> > +        if (pass == S_STAGE) {
> > +            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_S_VS_WALKS);
> > +        } else {
> > +            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_G_WALKS);
> > +        }
> > +
> > +        if (sc[pass].ptesize == 4) {
> > +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> > +        } else {
> > +            pte = le64_to_cpu(pte);
> > +        }
> > +
> > +        sc[pass].step++;
> > +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> > +
> > +        if (!(pte & PTE_V)) {
> > +            break;                /* Invalid PTE */
> > +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> > +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> > +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> > +            break;                /* Reserved leaf PTE flags: PTE_W */
> > +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> > +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> > +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> > +            break;                /* Misaligned PPN */
> > +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> > +            break;                /* Read access check failed */
> > +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> > +            break;                /* Write access check failed */
> > +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> > +            break;                /* Access bit not set */
> > +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> > +            break;                /* Dirty bit not set */
> > +        } else {
> > +            /* Leaf PTE, translation completed. */
> > +            sc[pass].step = sc[pass].levels;
> > +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> > +            /* Update address mask based on smallest translation granularity */
> > +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> > +            /* Continue with S-Stage translation? */
> > +            if (pass && sc[0].step != sc[0].levels) {
> > +                pass = S_STAGE;
> > +                addr = iotlb->iova;
> > +                continue;
> > +            }
> > +            /* Translation phase completed (GPA or SPA) */
> > +            iotlb->translated_addr = base;
> > +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> > +                                                         : IOMMU_RO;
> > +
> > +            /* Check MSI GPA address match */
> > +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> > +                riscv_iommu_msi_check(s, ctx, base)) {
> > +                /* Trap MSI writes and return GPA address. */
> > +                iotlb->target_as = &s->trap_as;
> > +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> > +                return 0;
> > +            }
> > +
> > +            /* Continue with G-Stage translation? */
> > +            if (!pass && en_g) {
> > +                pass = G_STAGE;
> > +                addr = base;
> > +                base = gatp;
> > +                sc[pass].step = 0;
> > +                continue;
> > +            }
> > +
> > +            return 0;
> > +        }
> > +
> > +        if (sc[pass].step == sc[pass].levels) {
> > +            break; /* Can't find leaf PTE */
> > +        }
> > +
> > +        /* Continue with G-Stage translation? */
> > +        if (!pass && en_g) {
> > +            pass = G_STAGE;
> > +            addr = base;
> > +            base = gatp;
> > +            sc[pass].step = 0;
> > +        }
> > +    } while (1);
> > +
> > +    return (iotlb->perm & IOMMU_WO) ?
> > +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> > +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> > +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> > +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> > +}
> > +
> > +/* Redirect MSI write for given GPA. */
> > +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> > +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> > +    unsigned size, MemTxAttrs attrs)
> > +{
> > +    MemTxResult res;
> > +    dma_addr_t addr;
> > +    uint64_t intn;
> > +    uint32_t n190;
> > +    uint64_t pte[2];
> > +
> > +    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* Interrupt File Number */
> > +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> > +    if (intn >= 256) {
> > +        /* Interrupt file number out of range */
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* fetch MSI PTE */
> > +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> > +    addr = addr | (intn * sizeof(pte));
> > +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> > +            MEMTXATTRS_UNSPECIFIED);
> > +    if (res != MEMTX_OK) {
> > +        return res;
> > +    }
> > +
> > +    le64_to_cpus(&pte[0]);
> > +    le64_to_cpus(&pte[1]);
> > +
> > +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> > +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> > +        /* MSI Pass-through mode */
> > +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> > +        addr = addr | (gpa & TARGET_PAGE_MASK);
> > +
> > +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> > +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> > +                              gpa, addr);
> > +
> > +        return dma_memory_write(s->target_as, addr, &data, size, attrs);
> > +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> > +        /* MRIF mode, continue. */
> > +        break;
> > +    default:
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /*
> > +     * Report an error for interrupt identities exceeding the maximum allowed
> > +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> > +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> > +     */
> > +    if ((data > 2047) || (gpa & 3)) {
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* MSI MRIF mode, non atomic pending bit update */
> > +
> > +    /* MRIF pending bit address */
> > +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> > +    addr = addr | ((data & 0x7c0) >> 3);
> > +
> > +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> > +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> > +                          gpa, addr);
> > +
> > +    /* MRIF pending bit mask */
> > +    data = 1ULL << (data & 0x03f);
> > +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> > +    if (res != MEMTX_OK) {
> > +        return res;
> > +    }
> > +    intn = intn | data;
> > +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> > +    if (res != MEMTX_OK) {
> > +        return res;
> > +    }
> > +
> > +    /* Get MRIF enable bits */
> > +    addr = addr + sizeof(intn);
> > +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> > +    if (res != MEMTX_OK) {
> > +        return res;
> > +    }
> > +    if (!(intn & data)) {
> > +        /* notification disabled, MRIF update completed. */
> > +        return MEMTX_OK;
> > +    }
> > +
> > +    /* Send notification message */
> > +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> > +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> > +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> > +
> > +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> > +    if (res != MEMTX_OK) {
> > +        return res;
> > +    }
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +/*
> > + * Device Context format.
> > + *
> > + * @s         : IOMMU Device State
> > + * @return    : 0: extended (64 bytes) | 1: base (32 bytes)
> > + */
> > +static int riscv_iommu_dc_is_base(RISCVIOMMUState *s)
> > +{
> > +    return !(s->cap & RISCV_IOMMU_CAP_MSI_FLAT);
> > +}
> > +
> > +/*
> > + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> > + *
> > + * @s         : IOMMU Device State
> > + * @ctx       : Device Translation Context with devid and pasid set.
> > + * @return    : success or fault code.
> > + */
> > +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> > +{
> > +    const uint64_t ddtp = s->ddtp;
> > +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> > +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> > +    struct riscv_iommu_dc dc;
> > +    const int dc_fmt = riscv_iommu_dc_is_base(s);
> > +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> > +    unsigned depth;
> > +    uint64_t de;
> > +
> > +    switch (mode) {
> > +    case RISCV_IOMMU_DDTP_MODE_OFF:
> > +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> > +
> > +    case RISCV_IOMMU_DDTP_MODE_BARE:
> > +        /* mock up pass-through translation context */
> > +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> > +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> > +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> > +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> > +        ctx->tc = RISCV_IOMMU_DC_TC_EN_ATS | RISCV_IOMMU_DC_TC_V;
> > +        ctx->ta = 0;
> > +        ctx->msiptp = 0;
> > +        return 0;
> > +
> > +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> > +        depth = 0;
> > +        break;
> > +
> > +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> > +        depth = 1;
> > +        break;
> > +
> > +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> > +        depth = 2;
> > +        break;
> > +
> > +    default:
> > +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> > +    }
> > +
> > +    /*
> > +     * Check supported device id width (in bits).
> > +     * See IOMMU Specification, Chapter 6. Software guidelines.
> > +     * - if extended device-context format is used:
> > +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> > +     * - if base device-context format is used:
> > +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> > +     */
> > +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> > +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> > +    }
> > +
> > +    /* Device directory tree walk */
> > +    for (; depth-- > 0; ) {
> > +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
> > +
> > +        /*
> > +         * Select device id index bits based on device directory tree level
> > +         * and device context format.
> > +         * See IOMMU Specification, Chapter 2. Data Structures.
> > +         * - if extended device-context format is used:
> > +         *   device index: [23:15][14:6][5:0]
> > +         * - if base device-context format is used:
> > +         *   device index: [23:16][15:7][6:0]
> > +         */
> > +        const int split = depth * 9 + 6 + dc_fmt;
> > +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> > +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> > +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> > +        }
> > +        le64_to_cpus(&de);
> > +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> > +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* invalid directory entry */
> > +        }
> > +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> > +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* reserved bits set. */
> > +        }
> > +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> > +    }
> > +
> > +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
> > +
> > +    /* index into device context entry page */
> > +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> > +
> > +    memset(&dc, 0, sizeof(dc));
> > +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> > +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> > +    }
> > +
> > +    /* Set translation context. */
> > +    ctx->tc = le64_to_cpu(dc.tc);
> > +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> > +    ctx->satp = le64_to_cpu(dc.fsc);
> > +    ctx->ta = le64_to_cpu(dc.ta);
> > +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> > +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> > +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> > +
> > +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> > +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> > +    }
> > +
> > +    /* FSC field checks */
> > +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> > +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> > +
> > +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> > +        /* No S-Stage translation, done. */
> > +        return 0;
> > +    }
> > +
> > +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> > +        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
> > +            /* PASID is disabled */
> > +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> > +        }
> > +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> > +            /* Invalid translation mode */
> > +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> > +        }
> > +        return 0;
> > +    }
> > +
> > +    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
> > +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> > +            /* No default PASID enabled, set BARE mode */
> > +            ctx->satp = 0ULL;
> > +            return 0;
> > +        } else {
> > +            /* Use default PASID #0 */
> > +            ctx->pasid = 0;
> > +        }
> > +    }
> > +
> > +    /* FSC.TC.PDTV enabled */
> > +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> > +        /* Invalid PDTP.MODE */
> > +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> > +    }
> > +
> > +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> > +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
> > +
> > +        /*
> > +         * Select process id index bits based on process directory tree
> > +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> > +         */
> > +        const int split = depth * 9 + 8;
> > +        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
> > +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> > +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> > +        }
> > +        le64_to_cpus(&de);
> > +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> > +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> > +        }
> > +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> > +    }
> > +
> > +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
> > +
> > +    /* Leaf entry in PDT */
> > +    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
> > +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> > +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> > +    }
> > +
> > +    /* Use FSC and TA from process directory entry. */
> > +    ctx->ta = le64_to_cpu(dc.ta);
> > +    ctx->satp = le64_to_cpu(dc.fsc);
> > +
> > +    return 0;
> > +}
> > +
> > +/* Translation Context cache support */
> > +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> > +{
> > +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> > +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> > +    return c1->devid == c2->devid && c1->pasid == c2->pasid;
> > +}
> > +
> > +static guint __ctx_hash(gconstpointer v)
> > +{
> > +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> > +    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
> > +    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
> > +}
> > +
> > +static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> > +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> > +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> > +        ctx->devid == arg->devid &&
> > +        ctx->pasid == arg->pasid) {
> > +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> > +    }
> > +}
> > +
> > +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> > +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> > +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> > +        ctx->devid == arg->devid) {
> > +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> > +    }
> > +}
> > +
> > +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> > +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> > +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> > +    }
> > +}
> > +
> > +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> > +    uint32_t devid, uint32_t pasid)
> > +{
> > +    GHashTable *ctx_cache;
> > +    RISCVIOMMUContext key = {
> > +        .devid = devid,
> > +        .pasid = pasid,
> > +    };
> > +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> > +    g_hash_table_foreach(ctx_cache, func, &key);
> > +    g_hash_table_unref(ctx_cache);
> > +}
> > +
> > +/* Find or allocate translation context for a given {device_id, process_id} */
> > +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> > +    unsigned devid, unsigned pasid, void **ref)
> > +{
> > +    GHashTable *ctx_cache;
> > +    RISCVIOMMUContext *ctx;
> > +    RISCVIOMMUContext key = {
> > +        .devid = devid,
> > +        .pasid = pasid,
> > +    };
> > +
> > +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> > +    ctx = g_hash_table_lookup(ctx_cache, &key);
> > +
> > +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> > +        *ref = ctx_cache;
> > +        return ctx;
> > +    }
> > +
> > +    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
> > +        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> > +                                          g_free, NULL);
> > +        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> > +    }
> > +
> > +    ctx = g_new0(RISCVIOMMUContext, 1);
> > +    ctx->devid = devid;
> > +    ctx->pasid = pasid;
> > +
> > +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> > +    if (!fault) {
> > +        g_hash_table_add(ctx_cache, ctx);
> > +        *ref = ctx_cache;
> > +        return ctx;
> > +    }
> > +
> > +    g_hash_table_unref(ctx_cache);
> > +    *ref = NULL;
> > +
> > +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
> > +        struct riscv_iommu_fq_record ev = { 0 };
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
> > +            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
> > +        riscv_iommu_fault(s, &ev);
> > +    }
> > +
> > +    g_free(ctx);
> > +    return NULL;
> > +}
> > +
> > +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> > +{
> > +    if (ref) {
> > +        g_hash_table_unref((GHashTable *)ref);
> > +    }
> > +}
> > +
> > +/* Find or allocate address space for a given device */
> > +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> > +{
> > +    RISCVIOMMUSpace *as;
> > +
> > +    /* FIXME: PCIe bus remapping for attached endpoints. */
> > +    devid |= s->bus << 8;
> > +
> > +    qemu_mutex_lock(&s->core_lock);
> > +    QLIST_FOREACH(as, &s->spaces, list) {
> > +        if (as->devid == devid) {
> > +            break;
> > +        }
> > +    }
> > +    qemu_mutex_unlock(&s->core_lock);
> > +
> > +    if (as == NULL) {
> > +        char name[64];
> > +        as = g_new0(RISCVIOMMUSpace, 1);
> > +
> > +        as->iommu = s;
> > +        as->devid = devid;
> > +
> > +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> > +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> > +
> > +        /* IOVA address space, untranslated addresses */
> > +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> > +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> > +            OBJECT(as), name, UINT64_MAX);
> > +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
> > +            TYPE_RISCV_IOMMU_PCI);
> > +
> > +        qemu_mutex_lock(&s->core_lock);
> > +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> > +        qemu_mutex_unlock(&s->core_lock);
> > +
> > +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> > +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> > +    }
> > +    return &as->iova_as;
> > +}
> > +
> > +/* Translation Object cache support */
> > +static gboolean __iot_equal(gconstpointer v1, gconstpointer v2)
> > +{
> > +    RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
> > +    RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
> > +    return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
> > +           t1->iova == t2->iova;
> > +}
> > +
> > +static guint __iot_hash(gconstpointer v)
> > +{
> > +    RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
> > +    return (guint)t->iova;
> > +}
> > +
> > +/* GV: 1 PSCV: 1 AV: 1 */
> > +static void __iot_inval_pscid_iova(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> > +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> > +    if (iot->gscid == arg->gscid &&
> > +        iot->pscid == arg->pscid &&
> > +        iot->iova == arg->iova) {
> > +        iot->perm = 0;
> > +    }
> > +}
> > +
> > +/* GV: 1 PSCV: 1 AV: 0 */
> > +static void __iot_inval_pscid(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> > +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> > +    if (iot->gscid == arg->gscid &&
> > +        iot->pscid == arg->pscid) {
> > +        iot->perm = 0;
> > +    }
> > +}
> > +
> > +/* GV: 1 GVMA: 1 */
> > +static void __iot_inval_gscid_gpa(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> > +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> > +    if (iot->gscid == arg->gscid) {
> > +        /* simplified cache, no GPA matching */
> > +        iot->perm = 0;
> > +    }
> > +}
> > +
> > +/* GV: 1 GVMA: 0 */
> > +static void __iot_inval_gscid(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> > +    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
> > +    if (iot->gscid == arg->gscid) {
> > +        iot->perm = 0;
> > +    }
> > +}
> > +
> > +/* GV: 0 */
> > +static void __iot_inval_all(gpointer key, gpointer value, gpointer data)
> > +{
> > +    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
> > +    iot->perm = 0;
> > +}
> > +
> > +/* caller should keep ref-count for iot_cache object */
> > +static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
> > +    GHashTable *iot_cache, hwaddr iova)
> > +{
> > +    RISCVIOMMUEntry key = {
> > +        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
> > +        .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
> > +        .iova  = PPN_DOWN(iova),
> > +    };
> > +    return g_hash_table_lookup(iot_cache, &key);
> > +}
> > +
> > +/* caller should keep ref-count for iot_cache object */
> > +static void riscv_iommu_iot_update(RISCVIOMMUState *s,
> > +    GHashTable *iot_cache, RISCVIOMMUEntry *iot)
> > +{
> > +    if (!s->iot_limit) {
> > +        return;
> > +    }
> > +
> > +    if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
> > +        iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
> > +                                          g_free, NULL);
> > +        g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
> > +    }
> > +    g_hash_table_add(iot_cache, iot);
> > +}
> > +
> > +static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
> > +    uint32_t gscid, uint32_t pscid, hwaddr iova)
> > +{
> > +    GHashTable *iot_cache;
> > +    RISCVIOMMUEntry key = {
> > +        .gscid = gscid,
> > +        .pscid = pscid,
> > +        .iova  = PPN_DOWN(iova),
> > +    };
> > +
> > +    iot_cache = g_hash_table_ref(s->iot_cache);
> > +    g_hash_table_foreach(iot_cache, func, &key);
> > +    g_hash_table_unref(iot_cache);
> > +}
> > +
> > +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> > +    IOMMUTLBEntry *iotlb, bool enable_cache)
> > +{
> > +    RISCVIOMMUEntry *iot;
> > +    IOMMUAccessFlags perm;
> > +    bool enable_faults;
> > +    bool enable_pasid;
> > +    bool enable_pri;
> > +    GHashTable *iot_cache;
> > +    int fault;
> > +
> > +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_URQ);
> > +
> > +    iot_cache = g_hash_table_ref(s->iot_cache);
> > +
> > +    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
> > +    /*
> > +     * TC[32] is reserved for custom extensions, used here to temporarily
> > +     * enable automatic page-request generation for ATS queries.
> > +     */
> > +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> > +    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> > +
> > +    /* Check for ATS request. */
> > +    if (iotlb->perm == IOMMU_NONE) {
> > +        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_ATS_RQ);
> > +        /* Check if ATS is disabled. */
> > +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
> > +            enable_pri = false;
> > +            fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> > +            goto done;
> > +        }
> > +        trace_riscv_iommu_ats(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> > +                PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid), iotlb->iova);
> > +    }
> > +
> > +    iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
> > +    perm = iot ? iot->perm : IOMMU_NONE;
> > +    if (perm != IOMMU_NONE) {
> > +        iotlb->translated_addr = PPN_PHYS(iot->phys);
> > +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> > +        iotlb->perm = perm;
> > +        fault = 0;
> > +        goto done;
> > +    }
> > +
> > +    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_TLB_MISS);
> > +
> > +    /* Translate using device directory / page table information. */
> > +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
> > +
> > +    if (!fault && iotlb->target_as == &s->trap_as) {
> > +        /* Do not cache trapped MSI translations */
> > +        goto done;
> > +    }
> > +
> > +    if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
> > +        iot = g_new0(RISCVIOMMUEntry, 1);
> > +        iot->iova = PPN_DOWN(iotlb->iova);
> > +        iot->phys = PPN_DOWN(iotlb->translated_addr);
> > +        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
> > +        iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
> > +        iot->perm = iotlb->perm;
> > +        riscv_iommu_iot_update(s, iot_cache, iot);
> > +    }
> > +
> > +done:
> > +    g_hash_table_unref(iot_cache);
> > +
> > +    if (enable_pri && fault) {
> > +        struct riscv_iommu_pq_record pr = {0};
> > +        if (enable_pasid) {
> > +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> > +                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
> > +        }
> > +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> > +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) | RISCV_IOMMU_PREQ_PAYLOAD_M;
> > +        riscv_iommu_pri(s, &pr);
> > +        return fault;
> > +    }
> > +
> > +    if (enable_faults && fault) {
> > +        struct riscv_iommu_fq_record ev;
> > +        const unsigned ttype =
> > +            (iotlb->perm & IOMMU_RW) ? RISCV_IOMMU_FQ_TTYPE_UADDR_WR :
> > +            ((iotlb->perm & IOMMU_RO) ? RISCV_IOMMU_FQ_TTYPE_UADDR_RD :
> > +            RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ);
> > +        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
> > +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> > +        ev.iotval    = iotlb->iova;
> > +        ev.iotval2   = iotlb->translated_addr;
> > +        ev._reserved = 0;
> > +        riscv_iommu_fault(s, &ev);
> > +        return fault;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +/* IOMMU Command Interface */
> > +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> > +    uint64_t addr, uint32_t data)
> > +{
> > +    /*
> > +     * ATS processing in this implementation of the IOMMU is synchronous,
> > +     * no need to wait for completions here.
> > +     */
> > +    if (!notify) {
> > +        return MEMTX_OK;
> > +    }
> > +
> > +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> > +        MEMTXATTRS_UNSPECIFIED);
> > +}
> > +
> > +static void riscv_iommu_ats(RISCVIOMMUState *s,
> > +    struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
> > +    IOMMUAccessFlags perm,
> > +    void (*trace_fn)(const char *id))
> > +{
> > +    RISCVIOMMUSpace *as = NULL;
> > +    IOMMUNotifier *n;
> > +    IOMMUTLBEvent event;
> > +    uint32_t pasid;
> > +    uint32_t devid;
> > +    const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
> > +
> > +    if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
> > +        /* Use device segment and requester id */
> > +        devid = get_field(cmd->dword0,
> > +            RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
> > +    } else {
> > +        devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
> > +    }
> > +
> > +    pasid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
> > +
> > +    qemu_mutex_lock(&s->core_lock);
> > +    QLIST_FOREACH(as, &s->spaces, list) {
> > +        if (as->devid == devid) {
> > +            break;
> > +        }
> > +    }
> > +    qemu_mutex_unlock(&s->core_lock);
> > +
> > +    if (!as || !as->notifier) {
> > +        return;
> > +    }
> > +
> > +    event.type = flag;
> > +    event.entry.perm = perm;
> > +    event.entry.target_as = s->target_as;
> > +
> > +    IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
> > +        if (!pv || n->iommu_idx == pasid) {
> > +            event.entry.iova = n->start;
> > +            event.entry.addr_mask = n->end - n->start;
> > +            trace_fn(as->iova_mr.parent_obj.name);
> > +            memory_region_notify_iommu_one(n, &event);
> > +        }
> > +    }
> > +}
> > +
> > +static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
> > +    struct riscv_iommu_command *cmd)
> > +{
> > +    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
> > +                           trace_riscv_iommu_ats_inval);
> > +}
> > +
> > +static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
> > +    struct riscv_iommu_command *cmd)
> > +{
> > +    unsigned resp_code = get_field(cmd->dword1, RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
> > +    /* Using the access flag to carry response code information */
> > +    IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
> > +    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
> > +                           trace_riscv_iommu_ats_prgr);
> > +}
> > +
> > +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> > +{
> > +    uint64_t old_ddtp = s->ddtp;
> > +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> > +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> > +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> > +    bool ok = false;
> > +
> > +    /*
> > +     * Check for allowed DDTP.MODE transitions:
> > +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> > +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> > +     */
> > +    if (new_mode == old_mode ||
> > +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> > +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> > +        ok = true;
> > +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> > +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> > +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> > +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> > +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> > +    }
> > +
> > +    if (ok) {
> > +        /* clear reserved and busy bits, report back sanitized version */
> > +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> > +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> > +    } else {
> > +        new_ddtp = old_ddtp;
> > +    }
> > +    s->ddtp = new_ddtp;
> > +
> > +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> > +}
> > +
> > +/* Command function and opcode field. */
> > +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> > +
> > +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> > +{
> > +    struct riscv_iommu_command cmd;
> > +    MemTxResult res;
> > +    dma_addr_t addr;
> > +    uint32_t tail, head, ctrl;
> > +    GHFunc func;
> > +
> > +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> > +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> > +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> > +
> > +    /* Check for pending error or queue processing disabled */
> > +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> > +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> > +        return;
> > +    }
> > +
> > +    while (tail != head) {
> > +        addr = s->cq_addr  + head * sizeof(cmd);
> > +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> > +                              MEMTXATTRS_UNSPECIFIED);
> > +
> > +        if (res != MEMTX_OK) {
> > +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, RISCV_IOMMU_CQCSR_CQMF, 0);
> > +            goto fault;
> > +        }
> > +
> > +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> > +
> > +        switch (get_field(cmd.dword0, RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC)) {
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> > +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> > +            res = riscv_iommu_iofence(s, cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV,
> > +                cmd.dword1, get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> > +
> > +            if (res != MEMTX_OK) {
> > +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> > +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> > +                goto fault;
> > +            }
> > +            break;
> > +
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> > +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> > +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> > +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> > +                goto cmd_ill;
> > +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
> > +                /* invalidate all cache mappings */
> > +                func = __iot_inval_all;
> > +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
> > +                /* invalidate cache matching GSCID */
> > +                func = __iot_inval_gscid;
> > +            } else {
> > +                /* invalidate cache matching GSCID and ADDR (GPA) */
> > +                func = __iot_inval_gscid_gpa;
> > +            }
> > +            riscv_iommu_iot_inval(s, func,
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
> > +                cmd.dword1 & TARGET_PAGE_MASK);
> > +            break;
> > +
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> > +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> > +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
> > +                /* invalidate all cache mappings, simplified model */
> > +                func = __iot_inval_all;
> > +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
> > +                /* invalidate cache matching GSCID, simplified model */
> > +                func = __iot_inval_gscid;
> > +            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
> > +                /* invalidate cache matching GSCID and PSCID */
> > +                func = __iot_inval_pscid;
> > +            } else {
> > +                /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
> > +                func = __iot_inval_pscid_iova;
> > +            }
> > +            riscv_iommu_iot_inval(s, func,
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
> > +                cmd.dword1 & TARGET_PAGE_MASK);
> > +            break;
> > +
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> > +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> > +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> > +                /* invalidate all device context cache mappings */
> > +                func = __ctx_inval_all;
> > +            } else {
> > +                /* invalidate all device context matching DID */
> > +                func = __ctx_inval_devid;
> > +            }
> > +            riscv_iommu_ctx_inval(s, func,
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> > +            break;
> > +
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> > +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> > +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> > +                /* illegal command arguments IODIR_PDT & DV == 0 */
> > +                goto cmd_ill;
> > +            } else {
> > +                func = __ctx_inval_devid_pasid;
> > +            }
> > +            riscv_iommu_ctx_inval(s, func,
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> > +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> > +            break;
> > +
> > +        /* ATS commands */
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
> > +                             RISCV_IOMMU_CMD_ATS_OPCODE):
> > +            riscv_iommu_ats_inval(s, &cmd);
> > +            break;
> > +
> > +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
> > +                             RISCV_IOMMU_CMD_ATS_OPCODE):
> > +            riscv_iommu_ats_prgr(s, &cmd);
> > +            break;
> > +
> > +        default:
> > +        cmd_ill:
> > +            /* Invalid instruction, do not advance instruction index. */
> > +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> > +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> > +            goto fault;
> > +        }
> > +
> > +        /* Advance and update head pointer after command completes. */
> > +        head = (head + 1) & s->cq_mask;
> > +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> > +    }
> > +    return;
> > +
> > +fault:
> > +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> > +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> > +{
> > +    uint64_t base;
> > +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> > +    uint32_t ctrl_clr;
> > +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> > +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> > +
> > +    if (enable && !active) {
> > +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> > +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> > +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> > +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> > +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> > +            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
> > +    } else if (!enable && active) {
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> > +    } else {
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> > +    }
> > +
> > +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> > +}
> > +
> > +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> > +{
> > +    uint64_t base;
> > +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> > +    uint32_t ctrl_clr;
> > +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> > +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> > +
> > +    if (enable && !active) {
> > +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> > +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> > +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> > +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> > +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> > +            RISCV_IOMMU_FQCSR_FQOF;
> > +    } else if (!enable && active) {
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> > +    } else {
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> > +    }
> > +
> > +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> > +}
> > +
> > +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> > +{
> > +    uint64_t base;
> > +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> > +    uint32_t ctrl_clr;
> > +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> > +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> > +
> > +    if (enable && !active) {
> > +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> > +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> > +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> > +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> > +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> > +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> > +            RISCV_IOMMU_PQCSR_PQOF;
> > +    } else if (!enable && active) {
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> > +    } else {
> > +        ctrl_set = 0;
> > +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> > +    }
> > +
> > +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> > +}
> > +
> > +static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
> > +{
> > +    uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
> > +    uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
> > +    unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
> > +    unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
> > +    RISCVIOMMUContext *ctx;
> > +    void *ref;
> > +
> > +    if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
> > +        return;
> > +    }
> > +
> > +    ctx = riscv_iommu_ctx(s, devid, pid, &ref);
> > +    if (ctx == NULL) {
> > +        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
> > +            RISCV_IOMMU_TR_RESPONSE_FAULT | (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
> > +    } else {
> > +        IOMMUTLBEntry iotlb = {
> > +            .iova = iova,
> > +            .perm = IOMMU_NONE,
> > +            .addr_mask = ~0,
> > +            .target_as = NULL,
> > +        };
> > +        int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
> > +        if (fault) {
> > +            iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
> > +        } else {
> > +            iova = ((iotlb.translated_addr & ~iotlb.addr_mask) >> 2) &
> > +                RISCV_IOMMU_TR_RESPONSE_PPN;
> > +        }
> > +        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
> > +    }
> > +
> > +    riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
> > +        RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
> > +    riscv_iommu_ctx_put(s, ref);
> > +}
> > +
> > +/* Core IOMMU execution activation */
> > +enum {
> > +    RISCV_IOMMU_EXEC_DDTP,
> > +    RISCV_IOMMU_EXEC_CQCSR,
> > +    RISCV_IOMMU_EXEC_CQT,
> > +    RISCV_IOMMU_EXEC_FQCSR,
> > +    RISCV_IOMMU_EXEC_FQH,
> > +    RISCV_IOMMU_EXEC_PQCSR,
> > +    RISCV_IOMMU_EXEC_PQH,
> > +    RISCV_IOMMU_EXEC_TR_REQUEST,
> > +    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
> > +    RISCV_IOMMU_EXEC_EXIT,
> > +};
> > +
> > +static void *riscv_iommu_core_proc(void* arg)
> > +{
> > +    RISCVIOMMUState *s = arg;
> > +    unsigned exec = 0;
> > +    unsigned mask = 0;
> > +
> > +    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
> > +        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
> > +        switch (exec & mask) {
> > +        case BIT(RISCV_IOMMU_EXEC_DDTP):
> > +            riscv_iommu_process_ddtp(s);
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_CQCSR):
> > +            riscv_iommu_process_cq_control(s);
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_CQT):
> > +            riscv_iommu_process_cq_tail(s);
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_FQCSR):
> > +            riscv_iommu_process_fq_control(s);
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_FQH):
> > +            /* NOP */
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_PQCSR):
> > +            riscv_iommu_process_pq_control(s);
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_PQH):
> > +            /* NOP */
> > +            break;
> > +        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
> > +            riscv_iommu_process_dbg(s);
> > +            break;
> > +        }
> > +        exec &= ~mask;
> > +        if (!exec) {
> > +            qemu_mutex_lock(&s->core_lock);
> > +            exec = s->core_exec;
> > +            while (!exec) {
> > +                qemu_cond_wait(&s->core_cond, &s->core_lock);
> > +                exec = s->core_exec;
> > +            }
> > +            s->core_exec = 0;
> > +            qemu_mutex_unlock(&s->core_lock);
> > +        }
> > +    };
> > +
> > +    return NULL;
> > +}
> > +
> > +/* For now we assume IOMMU HPM frequency to be 1GHz so 1-cycle is of 1-ns. */
> > +static inline uint64_t __get_cycles(void)
> > +{
> > +    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
> > +}
> > +
> > +static void __hpm_setup_timer(RISCVIOMMUState *s, uint64_t value)
> > +{
> > +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> > +    uint64_t overflow_at, overflow_ns;
> > +
> > +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> > +        return;
> > +    }
> > +
> > +    /*
> > +     * We are using INT64_MAX here instead to UINT64_MAX because cycle counter
> > +     * has 63-bit precision and INT64_MAX is the maximum it can store.
> > +     */
> > +    if (value) {
> > +        overflow_ns = INT64_MAX - value + 1;
> > +    } else {
> > +        overflow_ns = INT64_MAX;
> > +    }
> > +
> > +    overflow_at = (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + overflow_ns;
> > +
> > +    if (overflow_at > INT64_MAX) {
> > +        s->irq_overflow_left = overflow_at - INT64_MAX;
> > +        overflow_at = INT64_MAX;
> > +    }
> > +
> > +    timer_mod_anticipate_ns(s->hpm_timer, overflow_at);
> > +}
> > +
> > +/* Updates the internal cycle counter state when iocntinh:CY is changed. */
> > +static void riscv_iommu_process_iocntinh_cy(RISCVIOMMUState *s,
> > +                                            bool prev_cy_inh)
> > +{
> > +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> > +
> > +    /* We only need to process CY bit toggle. */
> > +    if (!(inhibit ^ prev_cy_inh)) {
> > +        return;
> > +    }
> > +
> > +    if (!(inhibit & RISCV_IOMMU_IOCOUNTINH_CY)) {
> > +        /*
> > +         * Cycle counter is enabled. Just start the timer again and update the
> > +         * clock snapshot value to point to the current time to make sure
> > +         * iohpmcycles read is correct.
> > +         */
> > +        s->hpmcycle_prev = __get_cycles();
> > +        __hpm_setup_timer(s, s->hpmcycle_val);
> > +    } else {
> > +        /*
> > +         * Cycle counter is disabled. Stop the timer and update the cycle
> > +         * counter to record the current value which is last programmed
> > +         * value + the cycles passed so far.
> > +         */
> > +        s->hpmcycle_val = s->hpmcycle_val + (__get_cycles() - s->hpmcycle_prev);
> > +        timer_del(s->hpm_timer);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_process_hpmcycle_write(RISCVIOMMUState *s)
> > +{
> > +    const uint64_t val = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
> > +    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> > +
> > +    /*
> > +     * Clear OF bit in IOCNTOVF if it's being cleared in IOHPMCYCLES register.
> > +     */
> > +    if (get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY) &&
> > +        !get_field(val, RISCV_IOMMU_IOHPMCYCLES_OVF)) {
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0,
> > +            RISCV_IOMMU_IOCOUNTOVF_CY);
> > +    }
> > +
> > +    s->hpmcycle_val = val & ~RISCV_IOMMU_IOHPMCYCLES_OVF;
> > +    s->hpmcycle_prev = __get_cycles();
> > +    __hpm_setup_timer(s, s->hpmcycle_val);
> > +}
> > +
> > +static inline bool __check_valid_event_id(unsigned event_id)
> > +{
> > +    return event_id > RISCV_IOMMU_HPMEVENT_INVALID &&
> > +           event_id < RISCV_IOMMU_HPMEVENT_MAX;
> > +}
> > +
> > +static gboolean __hpm_event_equal(gpointer key, gpointer value, gpointer udata)
> > +{
> > +    uint32_t *pair = udata;
> > +
> > +    if (GPOINTER_TO_UINT(value) & (1 << pair[0])) {
> > +        pair[1] = GPOINTER_TO_UINT(key);
> > +        return true;
> > +    }
> > +
> > +    return false;
> > +}
> > +
> > +/* Caller must check ctr_idx against hpm_ctrs to see if its supported or not. */
> > +static void __update_event_map(RISCVIOMMUState *s, uint64_t value,
> > +    uint32_t ctr_idx)
> > +{
> > +    unsigned event_id = get_field(value, RISCV_IOMMU_IOHPMEVT_EVENT_ID);
> > +    uint32_t pair[2] = { ctr_idx, RISCV_IOMMU_HPMEVENT_INVALID };
> > +    uint32_t new_value = 1 << ctr_idx;
> > +    gpointer data;
> > +
> > +    /* If EventID field is RISCV_IOMMU_HPMEVENT_INVALID remove the current mapping. */
> > +    if (event_id == RISCV_IOMMU_HPMEVENT_INVALID) {
> > +        data = g_hash_table_find(s->hpm_event_ctr_map, __hpm_event_equal, pair);
> > +
> > +        new_value = GPOINTER_TO_UINT(data) & ~(new_value);
> > +        pthread_rwlock_wrlock(&s->ht_lock);
> > +        if (new_value != 0) {
> > +            g_hash_table_replace(s->hpm_event_ctr_map,
> > +                                 GUINT_TO_POINTER(pair[1]),
> > +                                 GUINT_TO_POINTER(new_value));
> > +        } else {
> > +            g_hash_table_remove(s->hpm_event_ctr_map,
> > +                                GUINT_TO_POINTER(pair[1]));
> > +        }
> > +        pthread_rwlock_unlock(&s->ht_lock);
> > +
> > +        return;
> > +    }
> > +
> > +    /* Update the counter mask if the event is already enabled. */
> > +    if (g_hash_table_lookup_extended(s->hpm_event_ctr_map,
> > +                                     GUINT_TO_POINTER(event_id),
> > +                                     NULL,
> > +                                     &data)) {
> > +        new_value |= GPOINTER_TO_UINT(data);
> > +    }
> > +
> > +    pthread_rwlock_wrlock(&s->ht_lock);
> > +    g_hash_table_insert(s->hpm_event_ctr_map,
> > +                        GUINT_TO_POINTER(event_id),
> > +                        GUINT_TO_POINTER(new_value));
> > +    pthread_rwlock_unlock(&s->ht_lock);
> > +}
> > +
> > +static void riscv_iommu_process_hpmevt_write(RISCVIOMMUState *s,
> > +                                             uint32_t evt_reg)
> > +{
> > +    const uint32_t ctr_idx = (evt_reg - RISCV_IOMMU_REG_IOHPMEVT_BASE) >> 3;
> > +    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> > +    uint64_t val = riscv_iommu_reg_get64(s, evt_reg);
> > +
> > +    if (ctr_idx >= s->hpm_cntrs) {
> > +        return;
> > +    }
> > +
> > +    /* Clear OF bit in IOCNTOVF if it's being cleared in IOHPMEVT register. */
> > +    if (get_field(ovf, BIT(ctr_idx + 1)) && !get_field(val, RISCV_IOMMU_IOHPMEVT_OF)) {
> > +        /* +1 to offset CYCLE register OF bit. */
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0, BIT(ctr_idx + 1));
> > +    }
> > +
> > +    if (!__check_valid_event_id(get_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID))) {
> > +        /* Reset EventID (WARL) field to invalid. */
> > +        val = set_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID,
> > +            RISCV_IOMMU_HPMEVENT_INVALID);
> > +        riscv_iommu_reg_set64(s, evt_reg, val);
> > +    }
> > +
> > +    __update_event_map(s, val, ctr_idx);
> > +}
> > +
> > +static void riscv_iommu_process_hpm_writes(RISCVIOMMUState *s,
> > +                                           uint32_t regb,
> > +                                           bool prev_cy_inh)
> > +{
> > +    switch (regb) {
> > +    case RISCV_IOMMU_REG_IOCOUNTINH:
> > +        riscv_iommu_process_iocntinh_cy(s, prev_cy_inh);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_IOHPMCYCLES:
> > +    case RISCV_IOMMU_REG_IOHPMCYCLES + 4:
> > +        riscv_iommu_process_hpmcycle_write(s);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_IOHPMEVT_BASE ...
> > +        RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4:
> > +        riscv_iommu_process_hpmevt_write(s, regb & ~7);
> > +        break;
> > +    }
> > +}
> > +
> > +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> > +    uint64_t data, unsigned size, MemTxAttrs attrs)
> > +{
> > +    RISCVIOMMUState *s = opaque;
> > +    uint32_t regb = addr & ~3;
> > +    bool cy_inh = false;
> > +    uint32_t busy = 0;
> > +    uint32_t exec = 0;
> > +
> > +    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
> > +        /* Unsupported MMIO alignment or access size */
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> > +        /* Unsupported MMIO access location. */
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* Track actionable MMIO write. */
> > +    switch (regb) {
> > +    case RISCV_IOMMU_REG_DDTP:
> > +    case RISCV_IOMMU_REG_DDTP + 4:
> > +        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
> > +        regb = RISCV_IOMMU_REG_DDTP;
> > +        busy = RISCV_IOMMU_DDTP_BUSY;
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_CQT:
> > +        exec = BIT(RISCV_IOMMU_EXEC_CQT);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_CQCSR:
> > +        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
> > +        busy = RISCV_IOMMU_CQCSR_BUSY;
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_FQH:
> > +        exec = BIT(RISCV_IOMMU_EXEC_FQH);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_FQCSR:
> > +        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
> > +        busy = RISCV_IOMMU_FQCSR_BUSY;
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_PQH:
> > +        exec = BIT(RISCV_IOMMU_EXEC_PQH);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_PQCSR:
> > +        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
> > +        busy = RISCV_IOMMU_PQCSR_BUSY;
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_IOCOUNTINH:
> > +        if (addr != RISCV_IOMMU_REG_IOCOUNTINH) {
> > +            break;
> > +        }
> > +
> > +        /* Store previous value of CY bit. */
> > +        cy_inh = !!(riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH) &
> > +            RISCV_IOMMU_IOCOUNTINH_CY);
> > +        break;
> > +
> > +    case RISCV_IOMMU_REG_TR_REQ_CTL:
> > +        exec = BIT(RISCV_IOMMU_EXEC_TR_REQUEST);
> > +        regb = RISCV_IOMMU_REG_TR_REQ_CTL;
> > +        busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
> > +        break;
> > +    }
> > +
> > +    /*
> > +     * Registers update might be not synchronized with core logic.
> > +     * If system software updates register when relevant BUSY bit is set
> > +     * IOMMU behavior of additional writes to the register is UNSPECIFIED
> > +     */
> > +
> > +    qemu_spin_lock(&s->regs_lock);
> > +    if (size == 1) {
> > +        uint8_t ro = s->regs_ro[addr];
> > +        uint8_t wc = s->regs_wc[addr];
> > +        uint8_t rw = s->regs_rw[addr];
> > +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> > +    } else if (size == 2) {
> > +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> > +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> > +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> > +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> > +    } else if (size == 4) {
> > +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> > +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> > +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> > +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> > +    } else if (size == 8) {
> > +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> > +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> > +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> > +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> > +    }
> > +
> > +    /* Busy flag update, MSB 4-byte register. */
> > +    if (busy) {
> > +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> > +        stl_le_p(&s->regs_rw[regb], rw | busy);
> > +    }
> > +    qemu_spin_unlock(&s->regs_lock);
> > +
> > +    /* Process HPM writes and update any internal state if needed. */
> > +    if (regb >= RISCV_IOMMU_REG_IOCOUNTOVF &&
> > +        regb <= (RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4)) {
> > +        riscv_iommu_process_hpm_writes(s, regb, cy_inh);
> > +    }
> > +
> > +    /* Wake up core processing thread. */
> > +    if (exec) {
> > +        qemu_mutex_lock(&s->core_lock);
> > +        s->core_exec |= exec;
> > +        qemu_cond_signal(&s->core_cond);
> > +        qemu_mutex_unlock(&s->core_lock);
> > +    }
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static uint64_t riscv_iommu_hpmcycle_read(RISCVIOMMUState *s)
> > +{
> > +    const uint64_t cycle = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
> > +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> > +    const uint64_t ctr_prev = s->hpmcycle_prev;
> > +    const uint64_t ctr_val = s->hpmcycle_val;
> > +
> > +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> > +        /*
> > +         * Counter should not increment if inhibit bit is set. We can't really
> > +         * stop the QEMU_CLOCK_VIRTUAL, so we just return the last updated
> > +         * counter value to indicate that counter was not incremented.
> > +         */
> > +        return (ctr_val & RISCV_IOMMU_IOHPMCYCLES_COUNTER) |
> > +               (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
> > +    }
> > +
> > +    return (ctr_val + __get_cycles() - ctr_prev) |
> > +        (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
> > +}
> > +
> > +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> > +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> > +{
> > +    RISCVIOMMUState *s = opaque;
> > +    uint64_t val = -1;
> > +    uint8_t *ptr;
> > +
> > +    if ((addr & (size - 1)) != 0) {
> > +        /* Unsupported MMIO alignment. */
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* Compute cycle register value. */
> > +    if ((addr & ~7) == RISCV_IOMMU_REG_IOHPMCYCLES) {
> > +        val = riscv_iommu_hpmcycle_read(s);
> > +        ptr = (uint8_t *)&val + (addr & 7);
> > +    } else if ((addr & ~3) == RISCV_IOMMU_REG_IOCOUNTOVF) {
> > +        /*
> > +         * Software can read RISCV_IOMMU_REG_IOCOUNTOVF before timer callback completes.
> > +         * In which case CY_OF bit in RISCV_IOMMU_IOHPMCYCLES_OVF would be 0. Here we
> > +         * take the CY_OF bit state from RISCV_IOMMU_REG_IOHPMCYCLES register as it's
> > +         * not dependent over the timer callback and is computed from cycle
> > +         * overflow.
> > +         */
> > +        val = ldq_le_p(&s->regs_rw[addr]);
> > +        val |= (riscv_iommu_hpmcycle_read(s) & RISCV_IOMMU_IOHPMCYCLES_OVF)
> > +                   ? RISCV_IOMMU_IOCOUNTOVF_CY
> > +                   : 0;
> > +        ptr = (uint8_t *)&val + (addr & 3);
> > +    } else {
> > +        ptr = &s->regs_rw[addr];
> > +    }
> > +
> > +    if (size == 1) {
> > +        val = (uint64_t)*ptr;
> > +    } else if (size == 2) {
> > +        val = lduw_le_p(ptr);
> > +    } else if (size == 4) {
> > +        val = ldl_le_p(ptr);
> > +    } else if (size == 8) {
> > +        val = ldq_le_p(ptr);
> > +    } else {
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    *data = val;
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> > +    .read_with_attrs = riscv_iommu_mmio_read,
> > +    .write_with_attrs = riscv_iommu_mmio_write,
> > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > +    .impl = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +        .unaligned = false,
> > +    },
> > +    .valid = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +    }
> > +};
> > +
> > +/*
> > + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> > + * memory region as untranslated address, for additional MSI/MRIF interception
> > + * by IOMMU interrupt remapping implementation.
> > + * Note: Device emulation code generating an MSI is expected to provide a valid
> > + * memory transaction attributes with requested_id set.
> > + */
> > +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> > +    uint64_t data, unsigned size, MemTxAttrs attrs)
> > +{
> > +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> > +    RISCVIOMMUContext *ctx;
> > +    MemTxResult res;
> > +    void *ref;
> > +    uint32_t devid = attrs.requester_id;
> > +
> > +    if (attrs.unspecified) {
> > +        return MEMTX_ACCESS_ERROR;
> > +    }
> > +
> > +    /* FIXME: PCIe bus remapping for attached endpoints. */
> > +    devid |= s->bus << 8;
> > +
> > +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> > +    if (ctx == NULL) {
> > +        res = MEMTX_ACCESS_ERROR;
> > +    } else {
> > +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> > +    }
> > +    riscv_iommu_ctx_put(s, ref);
> > +    return res;
> > +}
> > +
> > +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> > +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> > +{
> > +    return MEMTX_ACCESS_ERROR;
> > +}
> > +
> > +static const MemoryRegionOps riscv_iommu_trap_ops = {
> > +    .read_with_attrs = riscv_iommu_trap_read,
> > +    .write_with_attrs = riscv_iommu_trap_write,
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > +    .impl = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +        .unaligned = true,
> > +    },
> > +    .valid = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +    }
> > +};
> > +
> > +/* Timer callback for cycle counter overflow. */
> > +static void riscv_iommu_hpm_timer_cb(void *priv)
> > +{
> > +    RISCVIOMMUState *s = priv;
> > +    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
> > +    uint32_t ovf;
> > +
> > +    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
> > +        return;
> > +    }
> > +
> > +    if (s->irq_overflow_left > 0) {
> > +        uint64_t irq_trigger_at =
> > +            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->irq_overflow_left;
> > +        timer_mod_anticipate_ns(s->hpm_timer, irq_trigger_at);
> > +        s->irq_overflow_left = 0;
> > +        return;
> > +    }
> > +
> > +    ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
> > +    if (!get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY)) {
> > +        /*
> > +         * We don't need to set hpmcycle_val to zero and update hpmcycle_prev to
> > +         * current clock value. The way we calculate iohpmcycs will overflow
> > +         * and return the correct value. This avoids the need to synchronize
> > +         * timer callback and write callback.
> > +         */
> > +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF,
> > +            RISCV_IOMMU_IOCOUNTOVF_CY, 0);
> > +        riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_IOHPMCYCLES,
> > +            RISCV_IOMMU_IOHPMCYCLES_OVF, 0);
> > +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
> > +    }
> > +}
> > +
> > +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> > +{
> > +    const uint64_t cap_implemented =
> > +        RISCV_IOMMU_CAP_MSI_FLAT |
> > +        RISCV_IOMMU_CAP_MSI_MRIF |
> > +        RISCV_IOMMU_CAP_ATS |
> > +        RISCV_IOMMU_CAP_S_SV32 |
> > +        RISCV_IOMMU_CAP_S_SV39 |
> > +        RISCV_IOMMU_CAP_S_SV48 |
> > +        RISCV_IOMMU_CAP_S_SV57 |
> > +        RISCV_IOMMU_CAP_G_SV32 |
> > +        RISCV_IOMMU_CAP_G_SV39 |
> > +        RISCV_IOMMU_CAP_G_SV48 |
> > +        RISCV_IOMMU_CAP_G_SV57 |
> > +        RISCV_IOMMU_CAP_MSI_FLAT |
> > +        RISCV_IOMMU_CAP_MSI_MRIF |
> > +        RISCV_IOMMU_CAP_ATS |
> > +        RISCV_IOMMU_CAP_IGS |
> > +        RISCV_IOMMU_CAP_HPM |
> > +        RISCV_IOMMU_CAP_DBG |
> > +        RISCV_IOMMU_CAP_PD8 |
> > +        RISCV_IOMMU_CAP_PD17 |
> > +        RISCV_IOMMU_CAP_PD20;
> > +
> > +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> > +
> > +    s->cap &= cap_implemented;
> > +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_VERSION, s->version);
> > +
> > +    if (s->hpm_cntrs > RISCV_IOMMU_IOCOUNT_NUM) {
> > +        /* Clip number of HPM counters to maximum supported (31). */
> > +        s->hpm_cntrs = RISCV_IOMMU_IOCOUNT_NUM;
> > +    } else if (s->hpm_cntrs == 0) {
> > +        /* Disable hardware performance monitor interface */
> > +        s->cap |= RISCV_IOMMU_CAP_HPM;
> > +    }
> > +
> > +    /* Verify supported IGS */
> > +    switch (get_field(s->cap, RISCV_IOMMU_CAP_IGS)) {
> > +    case RISCV_IOMMU_CAP_IGS_MSI:
> > +    case RISCV_IOMMU_CAP_IGS_WSI:
> > +        break;
> > +    default:
> > +        error_setg(errp, "can't support requested IGS mode: cap: %" PRIx64,
> > +            s->cap);
> > +        return;
> > +    }
> > +
> > +    /* Report QEMU target physical address space limits */
> > +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS, TARGET_PHYS_ADDR_SPACE_BITS);
> > +
> > +    /* Adjust reported PD capabilities */
> > +    if (s->pasid_bits < 20) {
> > +        s->cap &= ~RISCV_IOMMU_CAP_PD20;
> > +    } else if (s->pasid_bits < 17) {
> > +        s->cap &= ~RISCV_IOMMU_CAP_PD17;
> > +    } else if (s->pasid_bits < 8) {
> > +        s->cap &= ~RISCV_IOMMU_CAP_PD8;
> > +    }
> > +
> > +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> > +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> > +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> > +
> > +    /* register storage */
> > +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> > +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> > +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> > +
> > +     /* Mark all registers read-only */
> > +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> > +
> > +    /*
> > +     * Register complete MMIO space, including MSI/PBA registers.
> > +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> > +     * managed directly by the PCIDevice implementation.
> > +     */
> > +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> > +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> > +
> > +    /* Set power-on register state */
> > +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> > +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
> > +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> > +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> > +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> > +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> > +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> > +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> > +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> > +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> > +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> > +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> > +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> > +        RISCV_IOMMU_CQCSR_BUSY);
> > +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> > +        RISCV_IOMMU_FQCSR_FQOF);
> > +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> > +        RISCV_IOMMU_FQCSR_BUSY);
> > +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> > +        RISCV_IOMMU_PQCSR_PQOF);
> > +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> > +        RISCV_IOMMU_PQCSR_BUSY);
> > +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> > +    /* If HPM registers are enabled. */
> > +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> > +        /* +1 for cycle counter bit. */
> > +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOCOUNTINH], ~((2 << s->hpm_cntrs) - 1));
> > +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCYCLES], 0);
> > +        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCTR_BASE], 0x00, s->hpm_cntrs * 8);
> > +        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMEVT_BASE], 0x00, s->hpm_cntrs * 8);
> > +    }
> > +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
> > +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> > +    /* If debug registers enabled. */
> > +    if (s->cap & RISCV_IOMMU_CAP_DBG) {
> > +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
> > +        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
> > +            RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
> > +    }
> > +
> > +    /* Memory region for downstream access, if specified. */
> > +    if (s->target_mr) {
> > +        s->target_as = g_new0(AddressSpace, 1);
> > +        address_space_init(s->target_as, s->target_mr,
> > +            "riscv-iommu-downstream");
> > +    } else {
> > +        /* Fallback to global system memory. */
> > +        s->target_as = &address_space_memory;
> > +    }
> > +
> > +    /* Memory region for untranslated MRIF/MSI writes */
> > +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> > +            "riscv-iommu-trap", ~0ULL);
> > +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> > +
> > +    /* Device translation context cache */
> > +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> > +                                         g_free, NULL);
> > +    s->iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
> > +                                         g_free, NULL);
> > +
> > +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> > +        s->hpm_event_ctr_map = g_hash_table_new(g_direct_hash, g_direct_equal);
> > +        pthread_rwlock_init(&s->ht_lock, NULL);
> > +        s->hpm_timer =
> > +            timer_new_ns(QEMU_CLOCK_VIRTUAL, riscv_iommu_hpm_timer_cb, s);
> > +    }
> > +
> > +    s->iommus.le_next = NULL;
> > +    s->iommus.le_prev = NULL;
> > +    QLIST_INIT(&s->spaces);
> > +    qemu_cond_init(&s->core_cond);
> > +    qemu_mutex_init(&s->core_lock);
> > +    qemu_spin_init(&s->regs_lock);
> > +    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
> > +        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
> > +}
> > +
> > +static void riscv_iommu_unrealize(DeviceState *dev)
> > +{
> > +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> > +
> > +    qemu_mutex_lock(&s->core_lock);
> > +    /* cancel pending operations and stop */
> > +    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
> > +    qemu_cond_signal(&s->core_cond);
> > +    qemu_mutex_unlock(&s->core_lock);
> > +    qemu_thread_join(&s->core_proc);
> > +    qemu_cond_destroy(&s->core_cond);
> > +    qemu_mutex_destroy(&s->core_lock);
> > +    if (s->cap & RISCV_IOMMU_CAP_HPM) {
> > +        timer_free(s->hpm_timer);
> > +        pthread_rwlock_destroy(&s->ht_lock);
> > +        g_hash_table_unref(s->hpm_event_ctr_map);
> > +    }
> > +    g_hash_table_unref(s->iot_cache);
> > +    g_hash_table_unref(s->ctx_cache);
> > +}
> > +
> > +static Property riscv_iommu_properties[] = {
> > +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> > +        RISCV_IOMMU_SPEC_DOT_VER),
> > +    DEFINE_PROP_UINT64("capabilities", RISCVIOMMUState, cap, ~0ULL),
> > +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> > +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> > +    DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
> > +        LIMIT_CACHE_IOT),
> > +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> > +        TYPE_MEMORY_REGION, MemoryRegion *),
> > +    DEFINE_PROP_UINT8("hpm-counters", RISCVIOMMUState, hpm_cntrs,
> > +        RISCV_IOMMU_IOCOUNT_NUM),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +
> > +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> > +    dc->user_creatable = false;
> > +    dc->realize = riscv_iommu_realize;
> > +    dc->unrealize = riscv_iommu_unrealize;
> > +    device_class_set_props(dc, riscv_iommu_properties);
> > +}
> > +
> > +static const TypeInfo riscv_iommu_info = {
> > +    .name = TYPE_RISCV_IOMMU,
> > +    .parent = TYPE_DEVICE,
> > +    .instance_size = sizeof(RISCVIOMMUState),
> > +    .class_init = riscv_iommu_class_init,
> > +};
> > +
> > +static const char *IOMMU_FLAG_STR[] = {
> > +    "NA",
> > +    "RO",
> > +    "WR",
> > +    "RW",
> > +};
> > +
> > +/* RISC-V IOMMU Memory Region - Address Translation Space */
> > +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> > +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> > +    IOMMUAccessFlags flag, int iommu_idx)
> > +{
> > +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> > +    RISCVIOMMUContext *ctx;
> > +    void *ref;
> > +    IOMMUTLBEntry iotlb = {
> > +        .iova = addr,
> > +        .target_as = as->iommu->target_as,
> > +        .addr_mask = ~0ULL,
> > +        .perm = flag,
> > +    };
> > +
> > +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> > +    if (ctx == NULL) {
> > +        /* Translation disabled or invalid. */
> > +        iotlb.addr_mask = 0;
> > +        iotlb.perm = IOMMU_NONE;
> > +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
> > +        /* Translation disabled or fault reported. */
> > +        iotlb.addr_mask = 0;
> > +        iotlb.perm = IOMMU_NONE;
> > +    }
> > +
> > +    /* Trace all dma translations with original access flags. */
> > +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> > +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> > +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> > +                          iotlb.translated_addr);
> > +
> > +    riscv_iommu_ctx_put(as->iommu, ref);
> > +
> > +    return iotlb;
> > +}
> > +
> > +static int riscv_iommu_memory_region_notify(
> > +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> > +    IOMMUNotifierFlag new, Error **errp)
> > +{
> > +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> > +
> > +    if (old == IOMMU_NOTIFIER_NONE) {
> > +        as->notifier = true;
> > +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> > +    } else if (new == IOMMU_NOTIFIER_NONE) {
> > +        as->notifier = false;
> > +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static inline bool pci_is_iommu(PCIDevice *pdev)
> > +{
> > +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> > +}
> > +
> > +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> > +{
> > +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> > +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> > +    AddressSpace *as = NULL;
> > +
> > +    if (pdev && pci_is_iommu(pdev)) {
> > +        return s->target_as;
> > +    }
> > +
> > +    /* Find first registered IOMMU device */
> > +    while (s->iommus.le_prev) {
> > +        s = *(s->iommus.le_prev);
> > +    }
> > +
> > +    /* Find first matching IOMMU */
> > +    while (s != NULL && as == NULL) {
> > +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> > +        s = s->iommus.le_next;
> > +    }
> > +
> > +    return as ? as : &address_space_memory;
> > +}
> > +
> > +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> > +    Error **errp)
> > +{
> > +    if (bus->iommu_fn == riscv_iommu_find_as) {
> > +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> > +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> > +        QLIST_INSERT_AFTER(last, iommu, iommus);
> > +    } else if (bus->iommu_fn == NULL) {
> > +        pci_setup_iommu(bus, riscv_iommu_find_as, iommu);
> > +    } else {
> > +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> > +            pci_bus_num(bus));
> > +    }
> > +}
> > +
> > +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> > +    MemTxAttrs attrs)
> > +{
> > +    return RISCV_IOMMU_NOPASID;
> > +}
> > +
> > +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> > +{
> > +    return 1;
> > +}
> > +
> > +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> > +{
> > +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> > +
> > +    imrc->translate = riscv_iommu_memory_region_translate;
> > +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> > +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> > +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> > +}
> > +
> > +static const TypeInfo riscv_iommu_memory_region_info = {
> > +    .parent = TYPE_IOMMU_MEMORY_REGION,
> > +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> > +    .class_init = riscv_iommu_memory_region_init,
> > +};
> > +
> > +static void riscv_iommu_register_mr_types(void)
> > +{
> > +    type_register_static(&riscv_iommu_memory_region_info);
> > +    type_register_static(&riscv_iommu_info);
> > +}
> > +
> > +type_init(riscv_iommu_register_mr_types);
> > diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> > new file mode 100644
> > index 0000000000..c68e09db58
> > --- /dev/null
> > +++ b/hw/riscv/riscv-iommu.h
> > @@ -0,0 +1,152 @@
> > +/*
> > + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> > + *
> > + * Copyright (C) 2022-2023 Rivos Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef HW_RISCV_IOMMU_STATE_H
> > +#define HW_RISCV_IOMMU_STATE_H
> > +
> > +#include "qemu/osdep.h"
> > +#include "qom/object.h"
> > +
> > +#include "hw/riscv/iommu.h"
> > +
> > +struct RISCVIOMMUState {
> > +    /*< private >*/
> > +    DeviceState parent_obj;
> > +
> > +    /*< public >*/
> > +    uint32_t version;     /* Reported interface version number */
> > +    uint32_t pasid_bits;  /* process identifier width */
> > +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> > +
> > +    uint64_t cap;         /* IOMMU supported capabilities */
> > +    uint64_t fctl;        /* IOMMU enabled features */
> > +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> > +
> > +    /* IOMMU Internal State */
> > +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> > +
> > +    dma_addr_t cq_addr;   /* Command queue base physical address */
> > +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> > +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> > +
> > +    uint32_t cq_mask;     /* Command queue index bit mask */
> > +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> > +    uint32_t pq_mask;     /* Page request queue index bit mask */
> > +
> > +    /* interrupt notifier */
> > +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> > +
> > +    /* IOMMU State Machine */
> > +    QemuThread core_proc; /* Background processing thread */
> > +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> > +    QemuCond core_cond;   /* Background processing wake up signal */
> > +    unsigned core_exec;   /* Processing thread execution actions */
> > +
> > +    /* IOMMU target address space */
> > +    AddressSpace *target_as;
> > +    MemoryRegion *target_mr;
> > +
> > +    /* MSI / MRIF access trap */
> > +    AddressSpace trap_as;
> > +    MemoryRegion trap_mr;
> > +
> > +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> > +    GHashTable *iot_cache;          /* IO Translated Address Cache */
> > +    unsigned iot_limit;             /* IO Translation Cache size limit */
> > +
> > +    /* HPM cycle counter */
> > +    QEMUTimer *hpm_timer;
> > +    uint64_t hpmcycle_val;      /* Current value of cycle register */
> > +    uint64_t hpmcycle_prev;     /* Saved value of QEMU_CLOCK_VIRTUAL clock */
> > +    uint64_t irq_overflow_left; /* Value beyond INT64_MAX after overflow */
> > +
> > +    /* HPM event counters */
> > +    uint8_t hpm_cntrs;
> > +    GHashTable *hpm_event_ctr_map; /* Mapping of events to counters */
> > +    pthread_rwlock_t ht_lock;      /* Lock used for hpm_event_ctr_map updates */
> > +
> > +    /* MMIO Hardware Interface */
> > +    MemoryRegion regs_mr;
> > +    QemuSpin regs_lock;
> > +    uint8_t *regs_rw;  /* register state (user write) */
> > +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> > +    uint8_t *regs_ro;  /* read-only mask */
> > +
> > +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> > +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> > +};
> > +
> > +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> > +         Error **errp);
> > +
> > +/* private helpers */
> > +
> > +/* Register helper functions */
> > +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> > +    unsigned idx, uint32_t set, uint32_t clr)
> > +{
> > +    uint32_t val;
> > +    qemu_spin_lock(&s->regs_lock);
> > +    val = ldl_le_p(s->regs_rw + idx);
> > +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> > +    qemu_spin_unlock(&s->regs_lock);
> > +    return val;
> > +}
> > +
> > +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> > +    unsigned idx, uint32_t set)
> > +{
> > +    qemu_spin_lock(&s->regs_lock);
> > +    stl_le_p(s->regs_rw + idx, set);
> > +    qemu_spin_unlock(&s->regs_lock);
> > +}
> > +
> > +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> > +    unsigned idx)
> > +{
> > +    return ldl_le_p(s->regs_rw + idx);
> > +}
> > +
> > +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> > +    unsigned idx, uint64_t set, uint64_t clr)
> > +{
> > +    uint64_t val;
> > +    qemu_spin_lock(&s->regs_lock);
> > +    val = ldq_le_p(s->regs_rw + idx);
> > +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> > +    qemu_spin_unlock(&s->regs_lock);
> > +    return val;
> > +}
> > +
> > +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> > +    unsigned idx, uint64_t set)
> > +{
> > +    qemu_spin_lock(&s->regs_lock);
> > +    stq_le_p(s->regs_rw + idx, set);
> > +    qemu_spin_unlock(&s->regs_lock);
> > +}
> > +
> > +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> > +    unsigned idx)
> > +{
> > +    return ldq_le_p(s->regs_rw + idx);
> > +}
> > +
> > +
> > +
> > +#endif
> > diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> > new file mode 100644
> > index 0000000000..fd5e21e3d4
> > --- /dev/null
> > +++ b/hw/riscv/trace-events
> > @@ -0,0 +1,14 @@
> > +# See documentation at docs/devel/tracing.rst
> > +
> > +# riscv-iommu.c
> > +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> > +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> > +riscv_iommu_ats(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: translate request %04x:%02x.%u iova: 0x%"PRIx64
> > +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> > +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> > +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> > +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> > +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> > +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> > +riscv_iommu_ats_inval(const char *id) "%s: dev-iotlb invalidate"
> > +riscv_iommu_ats_prgr(const char *id) "%s: dev-iotlb page request group response"
> > diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> > new file mode 100644
> > index 0000000000..b88504b750
> > --- /dev/null
> > +++ b/hw/riscv/trace.h
> > @@ -0,0 +1,2 @@
> > +#include "trace/trace-hw_riscv.h"
> > +
> > diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> > new file mode 100644
> > index 0000000000..2a63a5cbf2
> > --- /dev/null
> > +++ b/include/hw/riscv/iommu.h
> > @@ -0,0 +1,40 @@
> > +/*
> > + * QEMU emulation of an RISC-V IOMMU (Ziommu)
> > + *
> > + * Copyright (C) 2022-2023 Rivos Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef HW_RISCV_IOMMU_H
> > +#define HW_RISCV_IOMMU_H
> > +
> > +#include "qemu/osdep.h"
> > +#include "qom/object.h"
> > +
> > +#define TYPE_RISCV_IOMMU "x-riscv-iommu"
> > +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> > +typedef struct RISCVIOMMUState RISCVIOMMUState;
> > +
> > +#define TYPE_RISCV_IOMMU_MEMORY_REGION "x-riscv-iommu-mr"
> > +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> > +
> > +#define TYPE_RISCV_IOMMU_PCI "x-riscv-iommu-pci"
> > +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> > +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> > +
> > +#define TYPE_RISCV_IOMMU_SYS "x-riscv-iommu-device"
> > +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStateSys, RISCV_IOMMU_SYS)
> > +typedef struct RISCVIOMMUStateSys RISCVIOMMUStateSys;
> > +
> > +#endif
> > diff --git a/meson.build b/meson.build
> > index 5fcdb37a71..693ea3447d 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -3268,6 +3268,7 @@ if have_system
> >      'hw/rdma',
> >      'hw/rdma/vmw',
> >      'hw/rtc',
> > +    'hw/riscv',
> >      'hw/s390x',
> >      'hw/scsi',
> >      'hw/sd',
> > --
> > 2.34.1
> >
> >


best,
- Tomasz
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index b6a5eb4452..617a509f1b 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@ 
+config RISCV_IOMMU
+    bool
+
 config RISCV_NUMA
     bool
 
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index 2f7ee81be3..e37c5d78e2 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -10,5 +10,6 @@  riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
 riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
+riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c', 'riscv-iommu-pci.c', 'riscv-iommu-sys.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
new file mode 100644
index 0000000000..9ce713361f
--- /dev/null
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -0,0 +1,749 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022-2023 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023 RISC-V IOMMU Task Group
+ *
+ * RISC-V Ziommu - Register Layout and Data Structures.
+ *
+ * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
+ * Published at  https://github.com/riscv-non-isa/riscv-iommu
+ *
+ */
+
+#ifndef HW_RISCV_IOMMU_BITS_H
+#define HW_RISCV_IOMMU_BITS_H
+
+/*
+ * This file is based on Linux RISC-V IOMMU file
+ * located at 'drivers/iommu/riscv/iommu-bits.h'
+ */
+
+#include "qemu/osdep.h"
+
+#define RISCV_IOMMU_SPEC_DOT_VER 0x010
+
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
+#endif
+
+/*
+ * Chapter 5: Memory Mapped register interface
+ */
+
+/* Common field positions */
+#define RISCV_IOMMU_PPN_FIELD           GENMASK_ULL(53, 10)
+#define RISCV_IOMMU_QUEUE_LOGSZ_FIELD   GENMASK_ULL(4, 0)
+#define RISCV_IOMMU_QUEUE_INDEX_FIELD   GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_QUEUE_ENABLE        BIT(0)
+#define RISCV_IOMMU_QUEUE_INTR_ENABLE   BIT(1)
+#define RISCV_IOMMU_QUEUE_MEM_FAULT     BIT(8)
+#define RISCV_IOMMU_QUEUE_OVERFLOW      BIT(9)
+#define RISCV_IOMMU_QUEUE_ACTIVE        BIT(16)
+#define RISCV_IOMMU_QUEUE_BUSY          BIT(17)
+#define RISCV_IOMMU_ATP_PPN_FIELD       GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_ATP_MODE_FIELD      GENMASK_ULL(63, 60)
+
+/* 5.3 IOMMU Capabilities (64bits) */
+#define RISCV_IOMMU_REG_CAP             0x0000
+#define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_S_SV32          BIT_ULL(8)
+#define RISCV_IOMMU_CAP_S_SV39          BIT_ULL(9)
+#define RISCV_IOMMU_CAP_S_SV48          BIT_ULL(10)
+#define RISCV_IOMMU_CAP_S_SV57          BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SVPBMT          BIT_ULL(15)
+#define RISCV_IOMMU_CAP_G_SV32          BIT_ULL(16)
+#define RISCV_IOMMU_CAP_G_SV39          BIT_ULL(17)
+#define RISCV_IOMMU_CAP_G_SV48          BIT_ULL(18)
+#define RISCV_IOMMU_CAP_G_SV57          BIT_ULL(19)
+#define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
+#define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
+#define RISCV_IOMMU_CAP_AMO             BIT_ULL(24)
+#define RISCV_IOMMU_CAP_ATS             BIT_ULL(25)
+#define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
+#define RISCV_IOMMU_CAP_END             BIT_ULL(27)
+#define RISCV_IOMMU_CAP_IGS             GENMASK_ULL(29, 28)
+#define RISCV_IOMMU_CAP_HPM             BIT_ULL(30)
+#define RISCV_IOMMU_CAP_DBG             BIT_ULL(31)
+#define RISCV_IOMMU_CAP_PAS             GENMASK_ULL(37, 32)
+#define RISCV_IOMMU_CAP_PD8             BIT_ULL(38)
+#define RISCV_IOMMU_CAP_PD17            BIT_ULL(39)
+#define RISCV_IOMMU_CAP_PD20            BIT_ULL(40)
+
+#define RISCV_IOMMU_CAP_VERSION_VER_MASK      0xF0
+#define RISCV_IOMMU_CAP_VERSION_REV_MASK      0x0F
+
+/**
+ * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings
+ * @RISCV_IOMMU_CAP_IGS_MSI: I/O MMU supports only MSI generation
+ * @RISCV_IOMMU_CAP_IGS_WSI: I/O MMU supports only Wired-Signaled interrupt
+ * @RISCV_IOMMU_CAP_IGS_BOTH: I/O MMU supports both MSI and WSI generation
+ * @RISCV_IOMMU_CAP_IGS_RSRV: Reserved for standard use
+ */
+enum riscv_iommu_igs_settings {
+      RISCV_IOMMU_CAP_IGS_MSI  = 0,
+      RISCV_IOMMU_CAP_IGS_WSI  = 1,
+      RISCV_IOMMU_CAP_IGS_BOTH = 2,
+      RISCV_IOMMU_CAP_IGS_RSRV = 3
+};
+
+
+/* 5.4 Features control register (32bits) */
+#define RISCV_IOMMU_REG_FCTL            0x0008
+#define RISCV_IOMMU_FCTL_BE             BIT(0)
+#define RISCV_IOMMU_FCTL_WSI            BIT(1)
+#define RISCV_IOMMU_FCTL_GXL            BIT(2)
+
+
+/* 5.5 Device-directory-table pointer (64bits) */
+#define RISCV_IOMMU_REG_DDTP            0x0010
+#define RISCV_IOMMU_DDTP_MODE           GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_DDTP_BUSY           BIT_ULL(4)
+#define RISCV_IOMMU_DDTP_PPN            RISCV_IOMMU_PPN_FIELD
+
+/**
+ * enum riscv_iommu_ddtp_modes - I/O MMU translation modes
+ * @RISCV_IOMMU_DDTP_MODE_OFF: No inbound transactions allowed
+ * @RISCV_IOMMU_DDTP_MODE_BARE: Pass-through mode
+ * @RISCV_IOMMU_DDTP_MODE_1LVL: One-level DDT
+ * @RISCV_IOMMU_DDTP_MODE_2LVL: Two-level DDT
+ * @RISCV_IOMMU_DDTP_MODE_3LVL: Three-level DDT
+ */
+enum riscv_iommu_ddtp_modes {
+      RISCV_IOMMU_DDTP_MODE_OFF = 0,
+      RISCV_IOMMU_DDTP_MODE_BARE = 1,
+      RISCV_IOMMU_DDTP_MODE_1LVL = 2,
+      RISCV_IOMMU_DDTP_MODE_2LVL = 3,
+      RISCV_IOMMU_DDTP_MODE_3LVL = 4,
+      RISCV_IOMMU_DDTP_MODE_MAX = 4
+};
+
+
+/* 5.6 Command Queue Base (64bits) */
+#define RISCV_IOMMU_REG_CQB             0x0018
+#define RISCV_IOMMU_CQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
+#define RISCV_IOMMU_CQB_PPN             RISCV_IOMMU_PPN_FIELD
+
+/* 5.7 Command Queue head (32bits) */
+#define RISCV_IOMMU_REG_CQH             0x0020
+#define RISCV_IOMMU_CQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.8 Command Queue tail (32bits) */
+#define RISCV_IOMMU_REG_CQT             0x0024
+#define RISCV_IOMMU_CQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+
+/* 5.9 Fault Queue Base (64bits) */
+#define RISCV_IOMMU_REG_FQB             0x0028
+#define RISCV_IOMMU_FQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
+#define RISCV_IOMMU_FQB_PPN             RISCV_IOMMU_PPN_FIELD
+
+/* 5.10 Fault Queue Head (32bits) */
+#define RISCV_IOMMU_REG_FQH             0x0030
+#define RISCV_IOMMU_FQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.11 Fault Queue tail (32bits) */
+#define RISCV_IOMMU_REG_FQT             0x0034
+#define RISCV_IOMMU_FQT_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+
+/* 5.12 Page Request Queue base (64bits) */
+#define RISCV_IOMMU_REG_PQB             0x0038
+#define RISCV_IOMMU_PQB_LOG2SZ          RISCV_IOMMU_QUEUE_LOGSZ_FIELD
+#define RISCV_IOMMU_PQB_PPN             RISCV_IOMMU_PPN_FIELD
+
+/* 5.13 Page Request Queue head (32bits) */
+#define RISCV_IOMMU_REG_PQH             0x0040
+#define RISCV_IOMMU_PQH_INDEX           RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.14 Page Request Queue tail (32bits) */
+#define RISCV_IOMMU_REG_PQT             0x0044
+#define RISCV_IOMMU_PQT_INDEX_MASK      RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.15 Command Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_CQCSR           0x0048
+#define RISCV_IOMMU_CQCSR_CQEN          RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_CQCSR_CIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_CQCSR_CQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_CQCSR_CMD_TO        BIT(9)
+#define RISCV_IOMMU_CQCSR_CMD_ILL       BIT(10)
+#define RISCV_IOMMU_CQCSR_FENCE_W_IP    BIT(11)
+#define RISCV_IOMMU_CQCSR_CQON          RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_CQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
+
+
+/* 5.16 Fault Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_FQCSR           0x004C
+#define RISCV_IOMMU_FQCSR_FQEN          RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_FQCSR_FIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_FQCSR_FQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_FQCSR_FQOF          RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_FQCSR_FQON          RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_FQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
+
+
+/* 5.17 Page Request Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_PQCSR           0x0050
+#define RISCV_IOMMU_PQCSR_PQEN          RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_PQCSR_PIE           RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_PQCSR_PQMF          RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_PQCSR_PQOF          RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_PQCSR_PQON          RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_PQCSR_BUSY          RISCV_IOMMU_QUEUE_BUSY
+
+
+/* 5.18 Interrupt Pending Status (32bits) */
+#define RISCV_IOMMU_REG_IPSR            0x0054
+
+#define RISCV_IOMMU_INTR_CQ             0
+#define RISCV_IOMMU_INTR_FQ             1
+#define RISCV_IOMMU_INTR_PM             2
+#define RISCV_IOMMU_INTR_PQ             3
+#define RISCV_IOMMU_INTR_COUNT          4
+
+#define RISCV_IOMMU_IPSR_CIP            BIT(RISCV_IOMMU_INTR_CQ)
+#define RISCV_IOMMU_IPSR_FIP            BIT(RISCV_IOMMU_INTR_FQ)
+#define RISCV_IOMMU_IPSR_PMIP           BIT(RISCV_IOMMU_INTR_PM)
+#define RISCV_IOMMU_IPSR_PIP            BIT(RISCV_IOMMU_INTR_PQ)
+
+#define RISCV_IOMMU_IOCOUNT_NUM         31
+
+/* 5.19 Performance monitoring counter overflow status (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTOVF      0x0058
+#define RISCV_IOMMU_IOCOUNTOVF_CY       BIT(0)
+#define RISCV_IOMMU_IOCOUNTOVF_HPM      GENMASK(31, 1)
+
+/* 5.20 Performance monitoring counter inhibits (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTINH      0x005C
+#define RISCV_IOMMU_IOCOUNTINH_CY       BIT(0)
+#define RISCV_IOMMU_IOCOUNTINH_HPM      GENMASK(31, 1)
+
+/* 5.21 Performance monitoring cycles counter (64bits) */
+#define RISCV_IOMMU_REG_IOHPMCYCLES     0x0060
+#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0)
+#define RISCV_IOMMU_IOHPMCYCLES_OVF     BIT_ULL(63)
+
+/* 5.22 Performance monitoring event counters (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMCTR_BASE   0x0068
+#define RISCV_IOMMU_REG_IOHPMCTR(_n)    \
+    (RISCV_IOMMU_REG_IOHPMCTR_BASE + (_n * 0x8))
+
+/* 5.23 Performance monitoring event selectors (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMEVT_BASE   0x0160
+#define RISCV_IOMMU_REG_IOHPMEVT(_n)    \
+    (RISCV_IOMMU_REG_IOHPMEVT_BASE + (_n * 0x8))
+#define RISCV_IOMMU_IOHPMEVT_EVENT_ID   GENMASK_ULL(14, 0)
+#define RISCV_IOMMU_IOHPMEVT_DMASK      BIT_ULL(15)
+#define RISCV_IOMMU_IOHPMEVT_PID_PSCID  GENMASK_ULL(35, 16)
+#define RISCV_IOMMU_IOHPMEVT_DID_GSCID  GENMASK_ULL(59, 36)
+#define RISCV_IOMMU_IOHPMEVT_PV_PSCV    BIT_ULL(60)
+#define RISCV_IOMMU_IOHPMEVT_DV_GSCV    BIT_ULL(61)
+#define RISCV_IOMMU_IOHPMEVT_IDT        BIT_ULL(62)
+#define RISCV_IOMMU_IOHPMEVT_OF         BIT_ULL(63)
+
+/**
+ * enum RISCV_IOMMU_HPMEVENT_id - Performance-monitoring event identifier
+ *
+ * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
+ * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
+ * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
+ * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
+ * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
+ * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
+ * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
+ * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: S/VS-Stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_G_WALKS: G-Stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
+ */
+enum RISCV_IOMMU_HPMEVENT_id {
+    RISCV_IOMMU_HPMEVENT_INVALID    = 0,
+    RISCV_IOMMU_HPMEVENT_URQ        = 1,
+    RISCV_IOMMU_HPMEVENT_TRQ        = 2,
+    RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
+    RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
+    RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
+    RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
+    RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
+    RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
+    RISCV_IOMMU_HPMEVENT_MAX        = 9
+};
+
+/* 5.24 Translation request IOVA (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
+#define RISCV_IOMMU_TR_REQ_IOVA_VPN     GENMASK_ULL(63, 12)
+
+/* 5.25 Translation request control (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_CTL      0x0260
+#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY  BIT_ULL(0)
+#define RISCV_IOMMU_TR_REQ_CTL_PRIV     BIT_ULL(1)
+#define RISCV_IOMMU_TR_REQ_CTL_EXE      BIT_ULL(2)
+#define RISCV_IOMMU_TR_REQ_CTL_NW       BIT_ULL(3)
+#define RISCV_IOMMU_TR_REQ_CTL_PID      GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_TR_REQ_CTL_PV       BIT_ULL(32)
+#define RISCV_IOMMU_TR_REQ_CTL_DID      GENMASK_ULL(63, 40)
+
+/* 5.26 Translation request response (64bits) */
+#define RISCV_IOMMU_REG_TR_RESPONSE     0x0268
+#define RISCV_IOMMU_TR_RESPONSE_FAULT   BIT_ULL(0)
+#define RISCV_IOMMU_TR_RESPONSE_PBMT    GENMASK_ULL(8, 7)
+#define RISCV_IOMMU_TR_RESPONSE_SZ      BIT_ULL(9)
+#define RISCV_IOMMU_TR_RESPONSE_PPN     RISCV_IOMMU_PPN_FIELD
+
+
+/* 5.27 Interrupt cause to vector (64bits) */
+#define RISCV_IOMMU_REG_IVEC            0x02F8
+#define RISCV_IOMMU_IVEC_CIV            GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_IVEC_FIV            GENMASK_ULL(7, 4)
+#define RISCV_IOMMU_IVEC_PMIV           GENMASK_ULL(11, 8)
+#define RISCV_IOMMU_IVEC_PIV            GENMASK_ULL(15, 12)
+
+
+/* 5.28 MSI Configuration table (32 * 64bits) */
+#define RISCV_IOMMU_REG_MSI_CONFIG      0x0300
+#define RISCV_IOMMU_REG_MSI_ADDR(_n)    \
+    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10))
+#define RISCV_IOMMU_MSI_ADDR            GENMASK_ULL(55, 2)
+#define RISCV_IOMMU_REG_MSI_DATA(_n)    \
+    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x08)
+#define RISCV_IOMMU_MSI_DATA            GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_REG_MSI_VEC_CTL(_n) \
+    (RISCV_IOMMU_REG_MSI_CONFIG + (_n * 0x10) + 0x0C)
+#define RISCV_IOMMU_MSI_VEC_CTL_M      BIT_ULL(0)
+
+
+#define RISCV_IOMMU_REG_SIZE           0x1000
+
+/*
+ * Chapter 2: Data structures
+ */
+
+/*
+ * Device Directory Table macros for non-leaf nodes
+ */
+#define RISCV_IOMMU_DDTE_VALID          BIT_ULL(0)
+#define RISCV_IOMMU_DDTE_PPN            RISCV_IOMMU_PPN_FIELD
+
+/**
+ * struct riscv_iommu_dc - Device Context
+ * @tc: Translation Control
+ * @iohgatp: I/O Hypervisor guest address translation and protection
+ *           (Second stage context)
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ * @msiptpt: MSI page table pointer
+ * @msi_addr_mask: MSI address mask
+ * @msi_addr_pattern: MSI address pattern
+ *
+ * This structure is used for leaf nodes on the Device Directory Table,
+ * in case RISCV_IOMMU_CAP_MSI_FLAT is not set, the bottom 4 fields are
+ * not present and are skipped with pointer arithmetic to avoid
+ * casting, check out riscv_iommu_get_dc().
+ * See section 2.1 for more details
+ */
+struct riscv_iommu_dc {
+      uint64_t tc;
+      uint64_t iohgatp;
+      uint64_t ta;
+      uint64_t fsc;
+      uint64_t msiptp;
+      uint64_t msi_addr_mask;
+      uint64_t msi_addr_pattern;
+      uint64_t _reserved;
+};
+
+/* Translation control fields */
+#define RISCV_IOMMU_DC_TC_V             BIT_ULL(0)
+#define RISCV_IOMMU_DC_TC_EN_ATS        BIT_ULL(1)
+#define RISCV_IOMMU_DC_TC_EN_PRI        BIT_ULL(2)
+#define RISCV_IOMMU_DC_TC_T2GPA         BIT_ULL(3)
+#define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
+#define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
+#define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
+#define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
+#define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
+#define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
+
+/* Second-stage (aka G-stage) context fields */
+#define RISCV_IOMMU_DC_IOHGATP_PPN      RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_IOHGATP_GSCID    GENMASK_ULL(59, 44)
+#define RISCV_IOMMU_DC_IOHGATP_MODE     RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_iohgatp_modes - Guest address
+ * translation/protection modes
+ *
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+ *      No translation/protection
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+ *      Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+ *      Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+ *      Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+ *      Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0
+ */
+enum riscv_iommu_dc_iohgatp_modes {
+      RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0,
+      RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8,
+      RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8,
+      RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9,
+      RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_DC_TA_PSCID         GENMASK_ULL(31, 12)
+
+/* First-stage context fields */
+#define RISCV_IOMMU_DC_FSC_PPN          RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_FSC_MODE         RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_fsc_atp_modes - First stage address
+ * translation/protection modes
+ *
+ * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids
+ *
+ * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise.
+ * IOSATP controls the first stage address translation (same as the satp
+ * register on the RISC-V MMU), and PDTP holds the process directory table,
+ * used to select a first stage page table based on a process id (for devices
+ * that support multiple process ids).
+ */
+enum riscv_iommu_dc_fsc_atp_modes {
+      RISCV_IOMMU_DC_FSC_MODE_BARE = 0,
+      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8,
+      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8,
+      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9,
+      RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10,
+      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1,
+      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2,
+      RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3
+};
+
+/* MSI page table pointer */
+#define RISCV_IOMMU_DC_MSIPTP_PPN       RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE      RISCV_IOMMU_ATP_MODE_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF  0
+#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1
+
+/* MSI address mask */
+#define RISCV_IOMMU_DC_MSI_ADDR_MASK    GENMASK_ULL(51, 0)
+
+/* MSI address pattern */
+#define RISCV_IOMMU_DC_MSI_PATTERN      GENMASK_ULL(51, 0)
+
+
+/**
+ * struct riscv_iommu_pc - Process Context
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ *
+ * This structure is used for leaf nodes on the Process Directory Table
+ * See section 2.3 for more details
+ */
+struct riscv_iommu_pc {
+      uint64_t ta;
+      uint64_t fsc;
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
+#define RISCV_IOMMU_PC_TA_ENS           BIT_ULL(1)
+#define RISCV_IOMMU_PC_TA_SUM           BIT_ULL(2)
+#define RISCV_IOMMU_PC_TA_PSCID         GENMASK_ULL(31, 12)
+
+/* First stage context fields */
+#define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_PC_FSC_MODE         GENMASK_ULL(63, 60)
+
+
+/*
+ * Chapter 3: In-memory queue interface
+ */
+
+/**
+ * struct riscv_iommu_cmd - Generic I/O MMU command structure
+ * @dword0: Includes the opcode and the function identifier
+ * @dword1: Opcode specific data
+ *
+ * The commands are interpreted as two 64bit fields, where the first
+ * 7bits of the first field are the opcode which also defines the
+ * command's format, followed by a 3bit field that specifies the
+ * function invoked by that command, and the rest is opcode-specific.
+ * This is a generic struct which will be populated differently
+ * according to each command. For more infos on the commands and
+ * the command queue check section 3.1.
+ */
+struct riscv_iommu_command {
+      uint64_t dword0;
+      uint64_t dword1;
+};
+
+/* Fields on dword0, common for all commands */
+#define RISCV_IOMMU_CMD_OPCODE          GENMASK_ULL(6, 0)
+#define RISCV_IOMMU_CMD_FUNC            GENMASK_ULL(9, 7)
+
+/* 3.1.1 I/O MMU Page-table cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE         1
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA       0
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA      1
+#define RISCV_IOMMU_CMD_IOTINVAL_AV     BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCID  GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCV   BIT_ULL(32)
+#define RISCV_IOMMU_CMD_IOTINVAL_GV     BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IOTINVAL_GSCID  GENMASK_ULL(59, 44)
+/* dword1 is the address, 4K-alligned and shifted to the right by two bits. */
+
+/* 3.1.2 I/O MMU Command Queue Fences */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOFENCE_OPCODE          2
+#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C          0
+#define RISCV_IOMMU_CMD_IOFENCE_AV      BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOFENCE_WSI     BIT_ULL(11)
+#define RISCV_IOMMU_CMD_IOFENCE_PR      BIT_ULL(12)
+#define RISCV_IOMMU_CMD_IOFENCE_PW      BIT_ULL(13)
+#define RISCV_IOMMU_CMD_IOFENCE_DATA    GENMASK_ULL(63, 32)
+/* dword1 is the address, word-size alligned and shifted to the right by two bits. */
+
+/* 3.1.3 I/O MMU Directory cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IODIR_OPCODE            3
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT    0
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT    1
+#define RISCV_IOMMU_CMD_IODIR_PID       GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IODIR_DV        BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IODIR_DID       GENMASK_ULL(63, 40)
+/* dword1 is reserved for standard use */
+
+/* 3.1.4 I/O MMU PCIe ATS */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_ATS_OPCODE              4
+#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL          0
+#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR           1
+#define RISCV_IOMMU_CMD_ATS_PID         GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_ATS_PV          BIT_ULL(32)
+#define RISCV_IOMMU_CMD_ATS_DSV         BIT_ULL(33)
+#define RISCV_IOMMU_CMD_ATS_RID         GENMASK_ULL(55, 40)
+#define RISCV_IOMMU_CMD_ATS_DSEG        GENMASK_ULL(63, 56)
+/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */
+
+/* ATS.INVAL payload*/
+#define RISCV_IOMMU_CMD_ATS_INVAL_G     BIT_ULL(0)
+/* Bits 1 - 10 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_INVAL_S     BIT_ULL(11)
+#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12)
+
+/* ATS.PRGR payload */
+/* Bits 0 - 31 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX      GENMASK_ULL(40, 32)
+/* Bits 41 - 43 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE      GENMASK_ULL(47, 44)
+#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID         GENMASK_ULL(63, 48)
+
+
+/**
+ * struct riscv_iommu_fq_record - Fault/Event Queue Record
+ * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc
+ * @_reserved: Low 32bits for custom use, high 32bits for standard use
+ * @iotval: Transaction-type/cause specific format
+ * @iotval2: Cause specific format
+ *
+ * The fault/event queue reports events and failures raised when
+ * processing transactions. Each record is a 32byte structure where
+ * the first dword has a fixed format for providing generic infos
+ * regarding the fault/event, and two more dwords are there for
+ * fault/event-specific information. For more details see section
+ * 3.2.
+ */
+struct riscv_iommu_fq_record {
+      uint64_t hdr;
+      uint64_t _reserved;
+      uint64_t iotval;
+      uint64_t iotval2;
+};
+
+/* Fields on header */
+#define RISCV_IOMMU_FQ_HDR_CAUSE        GENMASK_ULL(11, 0)
+#define RISCV_IOMMU_FQ_HDR_PID          GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_FQ_HDR_PV           BIT_ULL(32)
+#define RISCV_IOMMU_FQ_HDR_PRIV         BIT_ULL(33)
+#define RISCV_IOMMU_FQ_HDR_TTYPE        GENMASK_ULL(39, 34)
+#define RISCV_IOMMU_FQ_HDR_DID          GENMASK_ULL(63, 40)
+
+/**
+ * enum riscv_iommu_fq_causes - Fault/event cause values
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED: Transaction type disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption
+ *
+ * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for
+ * standard use, and 2048 - 4095 for custom use.
+ */
+enum riscv_iommu_fq_causes {
+      RISCV_IOMMU_FQ_CAUSE_INST_FAULT           = 1,
+      RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED   = 4,
+      RISCV_IOMMU_FQ_CAUSE_RD_FAULT             = 5,
+      RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED   = 6,
+      RISCV_IOMMU_FQ_CAUSE_WR_FAULT             = 7,
+      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S         = 12,
+      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S           = 13,
+      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S           = 15,
+      RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS        = 20,
+      RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS          = 21,
+      RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS          = 23,
+      RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED         = 256,
+      RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT       = 257,
+      RISCV_IOMMU_FQ_CAUSE_DDT_INVALID          = 258,
+      RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED    = 259,
+      RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED        = 260,
+      RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT       = 261,
+      RISCV_IOMMU_FQ_CAUSE_MSI_INVALID          = 262,
+      RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED    = 263,
+      RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT           = 264,
+      RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT       = 265,
+      RISCV_IOMMU_FQ_CAUSE_PDT_INVALID          = 266,
+      RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED    = 267,
+      RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED        = 268,
+      RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED        = 269,
+      RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED     = 270,
+      RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED      = 271,
+      RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR    = 272,
+      RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT         = 273,
+      RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED         = 274
+};
+
+/**
+ * enum riscv_iommu_fq_ttypes: Fault/event transaction types
+ * @RISCV_IOMMU_FQ_TTYPE_NONE: None. Fault not caused by an inbound transaction.
+ * @RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH: Instruction fetch from untranslated address
+ * @RISCV_IOMMU_FQ_TTYPE_UADDR_RD: Read from untranslated address
+ * @RISCV_IOMMU_FQ_TTYPE_UADDR_WR: Write/AMO to untranslated address
+ * @RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH: Instruction fetch from translated address
+ * @RISCV_IOMMU_FQ_TTYPE_TADDR_RD: Read from translated address
+ * @RISCV_IOMMU_FQ_TTYPE_TADDR_WR: Write/AMO to translated address
+ * @RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ: PCIe ATS translation request
+ * @RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ: PCIe message request
+ *
+ * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for
+ * standard use and 31 - 63 for custom use.
+ */
+enum riscv_iommu_fq_ttypes {
+      RISCV_IOMMU_FQ_TTYPE_NONE = 0,
+      RISCV_IOMMU_FQ_TTYPE_UADDR_INST_FETCH = 1,
+      RISCV_IOMMU_FQ_TTYPE_UADDR_RD = 2,
+      RISCV_IOMMU_FQ_TTYPE_UADDR_WR = 3,
+      RISCV_IOMMU_FQ_TTYPE_TADDR_INST_FETCH = 5,
+      RISCV_IOMMU_FQ_TTYPE_TADDR_RD = 6,
+      RISCV_IOMMU_FQ_TTYPE_TADDR_WR = 7,
+      RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ = 8,
+      RISCV_IOMMU_FW_TTYPE_PCIE_MSG_REQ = 9,
+};
+
+
+/**
+ * struct riscv_iommu_pq_record - PCIe Page Request record
+ * @hdr: Header, includes PID, DID etc
+ * @payload: Holds the page address, request group and permission bits
+ *
+ * For more infos on the PCIe Page Request queue see chapter 3.3.
+ */
+struct riscv_iommu_pq_record {
+      uint64_t hdr;
+      uint64_t payload;
+};
+
+/* Header fields */
+#define RISCV_IOMMU_PREQ_HDR_PID        GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_PREQ_HDR_PV         BIT_ULL(32)
+#define RISCV_IOMMU_PREQ_HDR_PRIV       BIT_ULL(33)
+#define RISCV_IOMMU_PREQ_HDR_EXEC       BIT_ULL(34)
+#define RISCV_IOMMU_PREQ_HDR_DID        GENMASK_ULL(63, 40)
+
+/* Payload fields */
+#define RISCV_IOMMU_PREQ_PAYLOAD_R      BIT_ULL(0)
+#define RISCV_IOMMU_PREQ_PAYLOAD_W      BIT_ULL(1)
+#define RISCV_IOMMU_PREQ_PAYLOAD_L      BIT_ULL(2)
+#define RISCV_IOMMU_PREQ_PAYLOAD_M      GENMASK_ULL(2, 0)
+#define RISCV_IOMMU_PREQ_PRG_INDEX      GENMASK_ULL(11, 3)
+#define RISCV_IOMMU_PREQ_UADDR          GENMASK_ULL(63, 12)
+
+
+/**
+ * struct riscv_iommu_msi_pte - MSI Page Table Entry
+ * @pte: MSI PTE
+ * @mrif_info: Memory-resident interrupt file info
+ *
+ * The MSI Page Table is used for virtualizing MSIs, so that when
+ * a device sends an MSI to a guest, the IOMMU can reroute it
+ * by translating the MSI address, either to a guest interrupt file
+ * or a memory resident interrupt file (MRIF). Note that this page table
+ * is an array of MSI PTEs, not a multi-level pt, each entry
+ * is a leaf entry. For more infos check out the the AIA spec, chapter 9.5.
+ *
+ * Also in basic mode the mrif_info field is ignored by the IOMMU and can
+ * be used by software, any other reserved fields on pte must be zeroed-out
+ * by software.
+ */
+struct riscv_iommu_msi_pte {
+      uint64_t pte;
+      uint64_t mrif_info;
+};
+
+/* Fields on pte */
+#define RISCV_IOMMU_MSI_PTE_V           BIT_ULL(0)
+#define RISCV_IOMMU_MSI_PTE_M           GENMASK_ULL(2, 1)
+
+#define RISCV_IOMMU_MSI_PTE_M_MRIF      1
+#define RISCV_IOMMU_MSI_PTE_M_BASIC     3
+
+/* When M == 1 (MRIF mode) */
+#define RISCV_IOMMU_MSI_PTE_MRIF_ADDR   GENMASK_ULL(53, 7)
+/* When M == 3 (basic mode) */
+#define RISCV_IOMMU_MSI_PTE_PPN         RISCV_IOMMU_PPN_FIELD
+#define RISCV_IOMMU_MSI_PTE_C           BIT_ULL(63)
+
+/* Fields on mrif_info */
+#define RISCV_IOMMU_MSI_MRIF_NID        GENMASK_ULL(9, 0)
+#define RISCV_IOMMU_MSI_MRIF_NPPN       RISCV_IOMMU_PPN_FIELD
+#define RISCV_IOMMU_MSI_MRIF_NID_MSB    BIT_ULL(60)
+
+
+#endif /* _RISCV_IOMMU_BITS_H_ */
diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c
new file mode 100644
index 0000000000..e205f806d6
--- /dev/null
+++ b/hw/riscv/riscv-iommu-pci.c
@@ -0,0 +1,181 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/host-utils.h"
+#include "qom/object.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+
+#ifndef PCI_VENDOR_ID_RIVOS
+#define PCI_VENDOR_ID_RIVOS           0x1efd
+#endif
+
+#ifndef PCI_DEVICE_ID_RIVOS_IOMMU
+#define PCI_DEVICE_ID_RIVOS_IOMMU     0xedf1
+#endif
+
+/* RISC-V IOMMU PCI Device Emulation */
+
+typedef struct RISCVIOMMUStatePci {
+    PCIDevice        pci;     /* Parent PCIe device state */
+    MemoryRegion     bar0;    /* PCI BAR (including MSI-x config) */
+    RISCVIOMMUState  iommu;   /* common IOMMU state */
+} RISCVIOMMUStatePci;
+
+/* interrupt delivery callback */
+static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector)
+{
+    RISCVIOMMUStatePci *s = container_of(iommu, RISCVIOMMUStatePci, iommu);
+
+    if (msix_enabled(&(s->pci))) {
+        msix_notify(&(s->pci), vector);
+    }
+}
+
+static void riscv_iommu_pci_realize(PCIDevice *dev, Error **errp)
+{
+    RISCVIOMMUStatePci *s = DO_UPCAST(RISCVIOMMUStatePci, pci, dev);
+    RISCVIOMMUState *iommu = &s->iommu;
+    uint64_t cap = iommu->cap;
+    Error *err = NULL;
+
+    /* Set device id for trace / debug */
+    DEVICE(iommu)->id = g_strdup_printf("%02x:%02x.%01x",
+        pci_dev_bus_num(dev), PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+
+    /* Support MSI only */
+    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_MSI);
+    qdev_prop_set_uint64(DEVICE(dev), "capabilities", cap);
+
+    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
+        return;
+    }
+
+    memory_region_init(&s->bar0, OBJECT(s), "riscv-iommu-bar0",
+        QEMU_ALIGN_UP(memory_region_size(&iommu->regs_mr), TARGET_PAGE_SIZE));
+    memory_region_add_subregion(&s->bar0, 0, &iommu->regs_mr);
+
+    pcie_endpoint_cap_init(dev, 0);
+
+    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+                     PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0);
+
+    int ret = msix_init(dev, RISCV_IOMMU_INTR_COUNT,
+                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG,
+                        &s->bar0, 0, RISCV_IOMMU_REG_MSI_CONFIG + 256, 0, &err);
+
+    if (ret == -ENOTSUP) {
+        /*
+         * MSI-x is not supported by the platform.
+         * Driver should use timer/polling based notification handlers.
+         */
+        warn_report_err(err);
+    } else if (ret < 0) {
+        error_propagate(errp, err);
+        return;
+    } else {
+        /* mark all allocated MSIx vectors as used. */
+        msix_vector_use(dev, RISCV_IOMMU_INTR_CQ);
+        msix_vector_use(dev, RISCV_IOMMU_INTR_FQ);
+        msix_vector_use(dev, RISCV_IOMMU_INTR_PM);
+        msix_vector_use(dev, RISCV_IOMMU_INTR_PQ);
+        iommu->notify = riscv_iommu_pci_notify;
+    }
+
+    PCIBus *bus = pci_device_root_bus(dev);
+    if (!bus) {
+        error_setg(errp, "can't find PCIe root port for %02x:%02x.%x",
+            pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn),
+            PCI_FUNC(dev->devfn));
+        return;
+    }
+
+    riscv_iommu_pci_setup_iommu(iommu, bus, errp);
+}
+
+static void riscv_iommu_pci_exit(PCIDevice *pci_dev)
+{
+    pci_setup_iommu(pci_device_root_bus(pci_dev), NULL, NULL);
+}
+
+static const VMStateDescription riscv_iommu_vmstate = {
+    .name = "riscv-iommu",
+    .unmigratable = 1
+};
+
+static void riscv_iommu_pci_init(Object *obj)
+{
+    RISCVIOMMUStatePci *s = RISCV_IOMMU_PCI(obj);
+    RISCVIOMMUState *iommu = &s->iommu;
+
+    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
+    qdev_alias_all_properties(DEVICE(iommu), obj);
+}
+
+static Property riscv_iommu_pci_properties[] = {
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void riscv_iommu_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    k->realize = riscv_iommu_pci_realize;
+    k->exit = riscv_iommu_pci_exit;
+    k->vendor_id = PCI_VENDOR_ID_RIVOS;
+    k->device_id = PCI_DEVICE_ID_RIVOS_IOMMU;
+    k->revision = 0;
+    k->class_id = 0x0806;
+    dc->desc = "RISCV-IOMMU DMA Remapping device";
+    dc->vmsd = &riscv_iommu_vmstate;
+    dc->hotpluggable = false;
+    dc->user_creatable = true;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    device_class_set_props(dc, riscv_iommu_pci_properties);
+}
+
+static const TypeInfo riscv_iommu_pci = {
+    .name = TYPE_RISCV_IOMMU_PCI,
+    .parent = TYPE_PCI_DEVICE,
+    .class_init = riscv_iommu_pci_class_init,
+    .instance_init = riscv_iommu_pci_init,
+    .instance_size = sizeof(RISCVIOMMUStatePci),
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { },
+    },
+};
+
+static void riscv_iommu_register_pci_types(void)
+{
+    type_register_static(&riscv_iommu_pci);
+}
+
+type_init(riscv_iommu_register_pci_types);
diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c
new file mode 100644
index 0000000000..7148588b59
--- /dev/null
+++ b/hw/riscv/riscv-iommu-sys.c
@@ -0,0 +1,123 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU (Ziommu) - Platform Device
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+#include "hw/sysbus.h"
+#include "qapi/error.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+
+/* RISC-V IOMMU System Platform Device Emulation */
+
+struct RISCVIOMMUStateSys {
+    SysBusDevice     parent;
+    uint64_t         addr;
+    qemu_irq         irqs[4];
+    RISCVIOMMUState  iommu;
+};
+
+/* interrupt delivery callback */
+static void riscv_iommu_sys_notify(RISCVIOMMUState *iommu, unsigned vector)
+{
+    RISCVIOMMUStateSys *s = container_of(iommu, RISCVIOMMUStateSys, iommu);
+
+    if (vector < RISCV_IOMMU_INTR_COUNT && s->irqs[vector]) {
+        qemu_irq_pulse(s->irqs[vector]);
+    }
+}
+
+static void riscv_iommu_sys_realize(DeviceState *dev, Error **errp)
+{
+    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(dev);
+    RISCVIOMMUState *iommu = &s->iommu;
+    PCIBus *pci_bus;
+    uint64_t cap = iommu->cap;
+    int i;
+
+    /* Support WSI only */
+    cap = set_field(cap, RISCV_IOMMU_CAP_IGS, RISCV_IOMMU_CAP_IGS_WSI);
+    qdev_prop_set_uint64(dev, "capabilities", cap);
+
+    if (!qdev_realize(DEVICE(iommu), NULL, errp)) {
+        return;
+    }
+
+    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &iommu->regs_mr);
+    if (s->addr) {
+        sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, s->addr);
+    }
+
+    for (i = 0; i < RISCV_IOMMU_INTR_COUNT; i++) {
+        sysbus_init_irq(&s->parent, &s->irqs[i]);
+    }
+
+    iommu->notify = riscv_iommu_sys_notify;
+
+    pci_bus = (PCIBus *) object_resolve_path_type("", TYPE_PCI_BUS, NULL);
+    if (pci_bus) {
+        riscv_iommu_pci_setup_iommu(iommu, pci_bus, errp);
+    }
+}
+
+static void riscv_iommu_sys_init(Object *obj)
+{
+    RISCVIOMMUStateSys *s = RISCV_IOMMU_SYS(obj);
+    RISCVIOMMUState *iommu = &s->iommu;
+
+    object_initialize_child(obj, "iommu", iommu, TYPE_RISCV_IOMMU);
+    qdev_alias_all_properties(DEVICE(iommu), obj);
+}
+
+static Property riscv_iommu_sys_properties[] = {
+    DEFINE_PROP_UINT64("addr", RISCVIOMMUStateSys, addr, 0),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void riscv_iommu_sys_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    dc->realize = riscv_iommu_sys_realize;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    device_class_set_props(dc, riscv_iommu_sys_properties);
+}
+
+static const TypeInfo riscv_iommu_sys = {
+    .name          = TYPE_RISCV_IOMMU_SYS,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .class_init    = riscv_iommu_sys_class_init,
+    .instance_init = riscv_iommu_sys_init,
+    .instance_size = sizeof(RISCVIOMMUStateSys),
+};
+
+static void riscv_iommu_register_sys(void)
+{
+    type_register_static(&riscv_iommu_sys);
+}
+
+type_init(riscv_iommu_register_sys)
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
new file mode 100644
index 0000000000..fd271b2988
--- /dev/null
+++ b/hw/riscv/riscv-iommu.c
@@ -0,0 +1,2539 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2021-2023, Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_device.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/timer.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+#include "trace.h"
+
+#define LIMIT_CACHE_CTX               (1U << 7)
+#define LIMIT_CACHE_IOT               (1U << 20)
+
+/* Physical page number coversions */
+#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
+#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
+
+typedef struct RISCVIOMMUContext RISCVIOMMUContext;
+typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
+
+/* Device assigned I/O address space */
+struct RISCVIOMMUSpace {
+    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
+    AddressSpace iova_as;       /* IOVA address space for attached device */
+    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
+    uint32_t devid;             /* Requester identifier, AKA device_id */
+    bool notifier;              /* IOMMU unmap notifier enabled */
+    QLIST_ENTRY(RISCVIOMMUSpace) list;
+};
+
+/* Device translation context state. */
+struct RISCVIOMMUContext {
+    uint64_t devid:24;          /* Requester Id, AKA device_id */
+    uint64_t pasid:20;          /* Process Address Space ID */
+    uint64_t __rfu:20;          /* reserved */
+    uint64_t tc;                /* Translation Control */
+    uint64_t ta;                /* Translation Attributes */
+    uint64_t satp;              /* S-Stage address translation and protection */
+    uint64_t gatp;              /* G-Stage address translation and protection */
+    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
+    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
+    uint64_t msiptp;            /* MSI redirection page table pointer */
+};
+
+/* Address translation cache entry */
+struct RISCVIOMMUEntry {
+    uint64_t iova:44;           /* IOVA Page Number */
+    uint64_t pscid:20;          /* Process Soft-Context identifier */
+    uint64_t phys:44;           /* Physical Page Number */
+    uint64_t gscid:16;          /* Guest Soft-Context identifier */
+    uint64_t perm:2;            /* IOMMU_RW flags */
+    uint64_t __rfu:2;
+};
+
+/* IOMMU index for transactions without PASID specified. */
+#define RISCV_IOMMU_NOPASID 0
+
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
+{
+    const uint32_t ipsr =
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
+    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
+    if (s->notify && !(ipsr & (1 << vec))) {
+        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
+    }
+}
+
+static void riscv_iommu_fault(RISCVIOMMUState *s, struct riscv_iommu_fq_record *ev)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
+    uint32_t next = (tail + 1) & s->fq_mask;
+    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
+
+    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), ev->hdr, ev->iotval);
+
+    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
+        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQOF, 0);
+    } else {
+        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
+        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, RISCV_IOMMU_FQCSR_FQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
+    }
+}
+
+static void riscv_iommu_pri(RISCVIOMMUState *s,
+    struct riscv_iommu_pq_record *pr)
+{
+    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
+    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
+    uint32_t next = (tail + 1) & s->pq_mask;
+    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
+
+    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), pr->payload);
+
+    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
+        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
+        return;
+    }
+
+    if (head == next) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQOF, 0);
+    } else {
+        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
+        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
+                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, RISCV_IOMMU_PQCSR_PQMF, 0);
+        } else {
+            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
+        }
+    }
+
+    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
+    }
+}
+
+static void __hpm_incr_ctr(RISCVIOMMUState *s, uint32_t ctr_idx)
+{
+    const uint32_t off = ctr_idx << 3;
+    uint64_t cntr_val;
+
+    qemu_spin_lock(&s->regs_lock);
+    cntr_val = ldq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off]);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_IOHPMCTR_BASE + off], cntr_val + 1);
+    qemu_spin_unlock(&s->regs_lock);
+
+    /* Handle the overflow scenario. */
+    if (cntr_val == UINT64_MAX) {
+        /*
+         * Generate interrupt only if OF bit is clear. +1 to offset the cycle
+         * register OF bit.
+         */
+        const uint32_t ovf =
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, BIT(ctr_idx + 1), 0);
+        if (!get_field(ovf, BIT(ctr_idx + 1))) {
+            riscv_iommu_reg_mod64(s,
+                                  RISCV_IOMMU_REG_IOHPMEVT_BASE + off,
+                                  RISCV_IOMMU_IOHPMEVT_OF,
+                                  0);
+            riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
+        }
+    }
+}
+
+static void riscv_iommu_hpm_incr_ctr(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    unsigned event_id)
+{
+    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
+    uint32_t did_gscid;
+    uint32_t pid_pscid;
+    uint32_t ctr_idx;
+    gpointer value;
+    uint32_t ctrs;
+    uint64_t evt;
+
+    if (!(s->cap & RISCV_IOMMU_CAP_HPM)) {
+        return;
+    }
+
+    pthread_rwlock_rdlock(&s->ht_lock);
+    value = g_hash_table_lookup(s->hpm_event_ctr_map,
+                                GUINT_TO_POINTER(event_id));
+    if (value == NULL) {
+        pthread_rwlock_unlock(&s->ht_lock);
+        return;
+    }
+
+    for (ctrs = GPOINTER_TO_UINT(value); ctrs != 0; ctrs &= ctrs - 1) {
+        ctr_idx = ctz32(ctrs);
+        if (get_field(inhibit, BIT(ctr_idx + 1))) {
+            continue;
+        }
+
+        evt = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMEVT_BASE + (ctr_idx << 3));
+
+        /*
+         * It's quite possible that event ID has been changed in counter
+         * but hashtable hasn't been updated yet. We don't want to increment
+         * counter for the old event ID.
+         */
+        if (event_id != get_field(evt, RISCV_IOMMU_IOHPMEVT_EVENT_ID)) {
+            continue;
+        }
+
+        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_IDT)) {
+            did_gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
+            pid_pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
+        } else {
+            did_gscid = ctx->devid;
+            pid_pscid = ctx->pasid;
+        }
+
+        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_PV_PSCV)) {
+            /*
+             * If the transaction does not have a valid process_id, counter
+             * increments if device_id matches DID_GSCID. If the transaction has
+             * a valid process_id, counter increments if device_id matches
+             * DID_GSCID and process_id matches PID_PSCID. See IOMMU
+             * Specification, Chapter 5.23. Performance-monitoring event
+             * selector.
+             */
+            if (ctx->pasid &&
+                get_field(evt, RISCV_IOMMU_IOHPMEVT_PID_PSCID) != pid_pscid) {
+                continue;
+            }
+        }
+
+        if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DV_GSCV)) {
+            uint32_t mask = ~0;
+
+            if (get_field(evt, RISCV_IOMMU_IOHPMEVT_DMASK)) {
+                /*
+                 * 1001 1011   mask = GSCID
+                 * 0000 0111   mask = mask ^ (mask + 1)
+                 * 1111 1000   mask = ~mask;
+                 */
+                mask = get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
+                mask = mask ^ (mask + 1);
+                mask = ~mask;
+            }
+
+            if ((get_field(evt, RISCV_IOMMU_IOHPMEVT_DID_GSCID) & mask) !=
+                (did_gscid & mask)) {
+                continue;
+            }
+        }
+
+        __hpm_incr_ctr(s, ctr_idx);
+    }
+
+    pthread_rwlock_unlock(&s->ht_lock);
+}
+
+/* Portable implementation of pext_u64, bit-mask extraction. */
+static uint64_t _pext_u64(uint64_t val, uint64_t ext)
+{
+    uint64_t ret = 0;
+    uint64_t rot = 1;
+
+    while (ext) {
+        if (ext & 1) {
+            if (val & 1) {
+                ret |= rot;
+            }
+            rot <<= 1;
+        }
+        val >>= 1;
+        ext >>= 1;
+    }
+
+    return ret;
+}
+
+/* Check if GPA matches MSI/MRIF pattern. */
+static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    dma_addr_t gpa)
+{
+    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
+        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+        return false; /* Invalid MSI/MRIF mode */
+    }
+
+    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
+        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
+    }
+
+    return true;
+}
+
+/*
+ * RISCV IOMMU Address Translation Lookup - Page Table Walk
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementation can be merged into single helper function in future.
+ * Keeping them separate for now, as error reporting and flow specifics are
+ * sufficiently different for separate implementation.
+ *
+ * @s        : IOMMU Device State
+ * @ctx      : Translation context for device id and process address space id.
+ * @iotlb    : translation data: physical address and access mode.
+ * @gpa      : provided IOVA is a guest physical address, use G-Stage only.
+ * @return   : success or fault cause code.
+ */
+static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb, bool gpa)
+{
+    dma_addr_t addr, base;
+    uint64_t satp, gatp, pte;
+    bool en_s, en_g;
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc[2];
+    /* Translation stage phase */
+    enum {
+        S_STAGE = 0,
+        G_STAGE = 1,
+    } pass;
+
+    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
+    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
+    /* Early check for MSI address match when IOVA == GPA */
+    if (!en_s && (iotlb->perm & IOMMU_WO) &&
+        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
+        iotlb->target_as = &s->trap_as;
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        return 0;
+    }
+
+    /* Exit early for pass-through mode. */
+    if (!(en_s || en_g)) {
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        /* Allow R/W in pass-through mode */
+        iotlb->perm = IOMMU_RW;
+        return 0;
+    }
+
+    /* S/G translation parameters. */
+    for (pass = 0; pass < 2; pass++) {
+        sc[pass].step = 0;
+        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
+            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
+            /* 32bit mode for GXL/SXL == 1 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+                if (!(s->cap &
+                    (pass ? RISCV_IOMMU_CAP_G_SV32 : RISCV_IOMMU_CAP_S_SV32))) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 2;
+                sc[pass].ptidxbits = 10;
+                sc[pass].ptesize   = 4;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        } else {
+            /* 64bit mode for GXL/SXL == 0 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+                if (!(s->cap &
+                    (pass ? RISCV_IOMMU_CAP_G_SV39 : RISCV_IOMMU_CAP_S_SV39))) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 3;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+                if (!(s->cap &
+                    (pass ? RISCV_IOMMU_CAP_G_SV48 : RISCV_IOMMU_CAP_S_SV48))) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 4;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+                if (!(s->cap &
+                    (pass ? RISCV_IOMMU_CAP_G_SV57 : RISCV_IOMMU_CAP_S_SV57))) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 5;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        }
+    };
+
+    /* S/G stages translation tables root pointers */
+    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
+    addr = (en_s && en_g) ? satp : iotlb->iova;
+    base = en_g ? gatp : satp;
+    pass = en_g ? G_STAGE : S_STAGE;
+
+    do {
+        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
+        const unsigned va_bits = widened + sc[pass].ptidxbits;
+        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
+                                 (sc[pass].levels - 1 - sc[pass].step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
+        const bool ade =
+            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
+
+        /* Address range check before first level lookup */
+        if (!sc[pass].step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+            }
+        }
+
+        /* Read page table entry */
+        if (dma_memory_read(s->target_as, pte_addr, &pte,
+                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
+                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
+        }
+
+        if (pass == S_STAGE) {
+            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_S_VS_WALKS);
+        } else {
+            riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_G_WALKS);
+        }
+
+        if (sc[pass].ptesize == 4) {
+            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
+        } else {
+            pte = le64_to_cpu(pte);
+        }
+
+        sc[pass].step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            break;                /* Invalid PTE */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            break;                /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            break;                /* Misaligned PPN */
+        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
+            break;                /* Read access check failed */
+        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
+            break;                /* Write access check failed */
+        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
+            break;                /* Access bit not set */
+        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
+            break;                /* Dirty bit not set */
+        } else {
+            /* Leaf PTE, translation completed. */
+            sc[pass].step = sc[pass].levels;
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            /* Update address mask based on smallest translation granularity */
+            iotlb->addr_mask &= (1ULL << va_skip) - 1;
+            /* Continue with S-Stage translation? */
+            if (pass && sc[0].step != sc[0].levels) {
+                pass = S_STAGE;
+                addr = iotlb->iova;
+                continue;
+            }
+            /* Translation phase completed (GPA or SPA) */
+            iotlb->translated_addr = base;
+            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
+                                                         : IOMMU_RO;
+
+            /* Check MSI GPA address match */
+            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
+                riscv_iommu_msi_check(s, ctx, base)) {
+                /* Trap MSI writes and return GPA address. */
+                iotlb->target_as = &s->trap_as;
+                iotlb->addr_mask = ~TARGET_PAGE_MASK;
+                return 0;
+            }
+
+            /* Continue with G-Stage translation? */
+            if (!pass && en_g) {
+                pass = G_STAGE;
+                addr = base;
+                base = gatp;
+                sc[pass].step = 0;
+                continue;
+            }
+
+            return 0;
+        }
+
+        if (sc[pass].step == sc[pass].levels) {
+            break; /* Can't find leaf PTE */
+        }
+
+        /* Continue with G-Stage translation? */
+        if (!pass && en_g) {
+            pass = G_STAGE;
+            addr = base;
+            base = gatp;
+            sc[pass].step = 0;
+        }
+    } while (1);
+
+    return (iotlb->perm & IOMMU_WO) ?
+                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
+                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
+}
+
+/* Redirect MSI write for given GPA. */
+static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
+    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
+    unsigned size, MemTxAttrs attrs)
+{
+    MemTxResult res;
+    dma_addr_t addr;
+    uint64_t intn;
+    uint32_t n190;
+    uint64_t pte[2];
+
+    if (!riscv_iommu_msi_check(s, ctx, gpa)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Interrupt File Number */
+    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
+    if (intn >= 256) {
+        /* Interrupt file number out of range */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* fetch MSI PTE */
+    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
+    addr = addr | (intn * sizeof(pte));
+    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
+            MEMTXATTRS_UNSPECIFIED);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    le64_to_cpus(&pte[0]);
+    le64_to_cpus(&pte[1]);
+
+    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
+    case RISCV_IOMMU_MSI_PTE_M_BASIC:
+        /* MSI Pass-through mode */
+        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
+        addr = addr | (gpa & TARGET_PAGE_MASK);
+
+        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                              gpa, addr);
+
+        return dma_memory_write(s->target_as, addr, &data, size, attrs);
+    case RISCV_IOMMU_MSI_PTE_M_MRIF:
+        /* MRIF mode, continue. */
+        break;
+    default:
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /*
+     * Report an error for interrupt identities exceeding the maximum allowed
+     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
+     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
+     */
+    if ((data > 2047) || (gpa & 3)) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* MSI MRIF mode, non atomic pending bit update */
+
+    /* MRIF pending bit address */
+    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
+    addr = addr | ((data & 0x7c0) >> 3);
+
+    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                          gpa, addr);
+
+    /* MRIF pending bit mask */
+    data = 1ULL << (data & 0x03f);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+    intn = intn | data;
+    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    /* Get MRIF enable bits */
+    addr = addr + sizeof(intn);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+    if (!(intn & data)) {
+        /* notification disabled, MRIF update completed. */
+        return MEMTX_OK;
+    }
+
+    /* Send notification message */
+    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
+    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
+          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
+
+    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
+    if (res != MEMTX_OK) {
+        return res;
+    }
+
+    return MEMTX_OK;
+}
+
+/*
+ * Device Context format.
+ *
+ * @s         : IOMMU Device State
+ * @return    : 0: extended (64 bytes) | 1: base (32 bytes)
+ */
+static int riscv_iommu_dc_is_base(RISCVIOMMUState *s)
+{
+    return !(s->cap & RISCV_IOMMU_CAP_MSI_FLAT);
+}
+
+/*
+ * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
+ *
+ * @s         : IOMMU Device State
+ * @ctx       : Device Translation Context with devid and pasid set.
+ * @return    : success or fault code.
+ */
+static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
+{
+    const uint64_t ddtp = s->ddtp;
+    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
+    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
+    struct riscv_iommu_dc dc;
+    const int dc_fmt = riscv_iommu_dc_is_base(s);
+    const size_t dc_len = sizeof(dc) >> dc_fmt;
+    unsigned depth;
+    uint64_t de;
+
+    switch (mode) {
+    case RISCV_IOMMU_DDTP_MODE_OFF:
+        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+
+    case RISCV_IOMMU_DDTP_MODE_BARE:
+        /* mock up pass-through translation context */
+        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
+        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_FSC_MODE_BARE);
+        ctx->tc = RISCV_IOMMU_DC_TC_EN_ATS | RISCV_IOMMU_DC_TC_V;
+        ctx->ta = 0;
+        ctx->msiptp = 0;
+        return 0;
+
+    case RISCV_IOMMU_DDTP_MODE_1LVL:
+        depth = 0;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_2LVL:
+        depth = 1;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_3LVL:
+        depth = 2;
+        break;
+
+    default:
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /*
+     * Check supported device id width (in bits).
+     * See IOMMU Specification, Chapter 6. Software guidelines.
+     * - if extended device-context format is used:
+     *   1LVL: 6, 2LVL: 15, 3LVL: 24
+     * - if base device-context format is used:
+     *   1LVL: 7, 2LVL: 16, 3LVL: 24
+     */
+    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    /* Device directory tree walk */
+    for (; depth-- > 0; ) {
+        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
+
+        /*
+         * Select device id index bits based on device directory tree level
+         * and device context format.
+         * See IOMMU Specification, Chapter 2. Data Structures.
+         * - if extended device-context format is used:
+         *   device index: [23:15][14:6][5:0]
+         * - if base device-context format is used:
+         *   device index: [23:16][15:7][6:0]
+         */
+        const int split = depth * 9 + 6 + dc_fmt;
+        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* invalid directory entry */
+        }
+        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID; /* reserved bits set. */
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+    }
+
+    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_DD_WALK);
+
+    /* index into device context entry page */
+    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+    memset(&dc, 0, sizeof(dc));
+    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+    }
+
+    /* Set translation context. */
+    ctx->tc = le64_to_cpu(dc.tc);
+    ctx->gatp = le64_to_cpu(dc.iohgatp);
+    ctx->satp = le64_to_cpu(dc.fsc);
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->msiptp = le64_to_cpu(dc.msiptp);
+    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    /* FSC field checks */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
+
+    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
+        /* No S-Stage translation, done. */
+        return 0;
+    }
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
+            /* PASID is disabled */
+            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+        }
+        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
+            /* Invalid translation mode */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        return 0;
+    }
+
+    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
+            /* No default PASID enabled, set BARE mode */
+            ctx->satp = 0ULL;
+            return 0;
+        } else {
+            /* Use default PASID #0 */
+            ctx->pasid = 0;
+        }
+    }
+
+    /* FSC.TC.PDTV enabled */
+    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
+        /* Invalid PDTP.MODE */
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
+        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
+
+        /*
+         * Select process id index bits based on process directory tree
+         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
+         */
+        const int split = depth * 9 + 8;
+        addr |= ((ctx->pasid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_PC_TA_V)) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
+    }
+
+    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_PD_WALK);
+
+    /* Leaf entry in PDT */
+    addr |= (ctx->pasid << 4) & ~TARGET_PAGE_MASK;
+    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+    }
+
+    /* Use FSC and TA from process directory entry. */
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->satp = le64_to_cpu(dc.fsc);
+
+    return 0;
+}
+
+/* Translation Context cache support */
+static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
+{
+    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
+    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
+    return c1->devid == c2->devid && c1->pasid == c2->pasid;
+}
+
+static guint __ctx_hash(gconstpointer v)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
+    /* Generate simple hash of (pasid, devid), assuming 24-bit wide devid */
+    return (guint)(ctx->devid) + ((guint)(ctx->pasid) << 24);
+}
+
+static void __ctx_inval_devid_pasid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid &&
+        ctx->pasid == arg->pasid) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
+        ctx->devid == arg->devid) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
+    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
+        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
+    }
+}
+
+static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
+    uint32_t devid, uint32_t pasid)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .pasid = pasid,
+    };
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    g_hash_table_foreach(ctx_cache, func, &key);
+    g_hash_table_unref(ctx_cache);
+}
+
+/* Find or allocate translation context for a given {device_id, process_id} */
+static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
+    unsigned devid, unsigned pasid, void **ref)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext *ctx;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .pasid = pasid,
+    };
+
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    ctx = g_hash_table_lookup(ctx_cache, &key);
+
+    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    if (g_hash_table_size(s->ctx_cache) >= LIMIT_CACHE_CTX) {
+        ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                          g_free, NULL);
+        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
+    }
+
+    ctx = g_new0(RISCVIOMMUContext, 1);
+    ctx->devid = devid;
+    ctx->pasid = pasid;
+
+    int fault = riscv_iommu_ctx_fetch(s, ctx);
+    if (!fault) {
+        g_hash_table_add(ctx_cache, ctx);
+        *ref = ctx_cache;
+        return ctx;
+    }
+
+    g_hash_table_unref(ctx_cache);
+    *ref = NULL;
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_DTF)) {
+        struct riscv_iommu_fq_record ev = { 0 };
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE,
+            RISCV_IOMMU_FQ_TTYPE_UADDR_RD);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, devid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, !!pasid);
+        riscv_iommu_fault(s, &ev);
+    }
+
+    g_free(ctx);
+    return NULL;
+}
+
+static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
+{
+    if (ref) {
+        g_hash_table_unref((GHashTable *)ref);
+    }
+}
+
+/* Find or allocate address space for a given device */
+static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
+{
+    RISCVIOMMUSpace *as;
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    if (as == NULL) {
+        char name[64];
+        as = g_new0(RISCVIOMMUSpace, 1);
+
+        as->iommu = s;
+        as->devid = devid;
+
+        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
+            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+        /* IOVA address space, untranslated addresses */
+        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
+            TYPE_RISCV_IOMMU_MEMORY_REGION,
+            OBJECT(as), name, UINT64_MAX);
+        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr),
+            TYPE_RISCV_IOMMU_PCI);
+
+        qemu_mutex_lock(&s->core_lock);
+        QLIST_INSERT_HEAD(&s->spaces, as, list);
+        qemu_mutex_unlock(&s->core_lock);
+
+        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
+                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+    }
+    return &as->iova_as;
+}
+
+/* Translation Object cache support */
+static gboolean __iot_equal(gconstpointer v1, gconstpointer v2)
+{
+    RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
+    RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
+    return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
+           t1->iova == t2->iova;
+}
+
+static guint __iot_hash(gconstpointer v)
+{
+    RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
+    return (guint)t->iova;
+}
+
+/* GV: 1 PSCV: 1 AV: 1 */
+static void __iot_inval_pscid_iova(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+    if (iot->gscid == arg->gscid &&
+        iot->pscid == arg->pscid &&
+        iot->iova == arg->iova) {
+        iot->perm = 0;
+    }
+}
+
+/* GV: 1 PSCV: 1 AV: 0 */
+static void __iot_inval_pscid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+    if (iot->gscid == arg->gscid &&
+        iot->pscid == arg->pscid) {
+        iot->perm = 0;
+    }
+}
+
+/* GV: 1 GVMA: 1 */
+static void __iot_inval_gscid_gpa(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+    if (iot->gscid == arg->gscid) {
+        /* simplified cache, no GPA matching */
+        iot->perm = 0;
+    }
+}
+
+/* GV: 1 GVMA: 0 */
+static void __iot_inval_gscid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+    RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+    if (iot->gscid == arg->gscid) {
+        iot->perm = 0;
+    }
+}
+
+/* GV: 0 */
+static void __iot_inval_all(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+    iot->perm = 0;
+}
+
+/* caller should keep ref-count for iot_cache object */
+static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
+    GHashTable *iot_cache, hwaddr iova)
+{
+    RISCVIOMMUEntry key = {
+        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
+        .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
+        .iova  = PPN_DOWN(iova),
+    };
+    return g_hash_table_lookup(iot_cache, &key);
+}
+
+/* caller should keep ref-count for iot_cache object */
+static void riscv_iommu_iot_update(RISCVIOMMUState *s,
+    GHashTable *iot_cache, RISCVIOMMUEntry *iot)
+{
+    if (!s->iot_limit) {
+        return;
+    }
+
+    if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
+        iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
+                                          g_free, NULL);
+        g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
+    }
+    g_hash_table_add(iot_cache, iot);
+}
+
+static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
+    uint32_t gscid, uint32_t pscid, hwaddr iova)
+{
+    GHashTable *iot_cache;
+    RISCVIOMMUEntry key = {
+        .gscid = gscid,
+        .pscid = pscid,
+        .iova  = PPN_DOWN(iova),
+    };
+
+    iot_cache = g_hash_table_ref(s->iot_cache);
+    g_hash_table_foreach(iot_cache, func, &key);
+    g_hash_table_unref(iot_cache);
+}
+
+static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb, bool enable_cache)
+{
+    RISCVIOMMUEntry *iot;
+    IOMMUAccessFlags perm;
+    bool enable_faults;
+    bool enable_pasid;
+    bool enable_pri;
+    GHashTable *iot_cache;
+    int fault;
+
+    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_URQ);
+
+    iot_cache = g_hash_table_ref(s->iot_cache);
+
+    enable_faults = !(ctx->tc & RISCV_IOMMU_DC_TC_DTF);
+    /*
+     * TC[32] is reserved for custom extensions, used here to temporarily
+     * enable automatic page-request generation for ATS queries.
+     */
+    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
+    enable_pasid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
+
+    /* Check for ATS request. */
+    if (iotlb->perm == IOMMU_NONE) {
+        riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_ATS_RQ);
+        /* Check if ATS is disabled. */
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
+            enable_pri = false;
+            fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+            goto done;
+        }
+        trace_riscv_iommu_ats(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid), iotlb->iova);
+    }
+
+    iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
+    perm = iot ? iot->perm : IOMMU_NONE;
+    if (perm != IOMMU_NONE) {
+        iotlb->translated_addr = PPN_PHYS(iot->phys);
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        iotlb->perm = perm;
+        fault = 0;
+        goto done;
+    }
+
+    riscv_iommu_hpm_incr_ctr(s, ctx, RISCV_IOMMU_HPMEVENT_TLB_MISS);
+
+    /* Translate using device directory / page table information. */
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
+
+    if (!fault && iotlb->target_as == &s->trap_as) {
+        /* Do not cache trapped MSI translations */
+        goto done;
+    }
+
+    if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
+        iot = g_new0(RISCVIOMMUEntry, 1);
+        iot->iova = PPN_DOWN(iotlb->iova);
+        iot->phys = PPN_DOWN(iotlb->translated_addr);
+        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
+        iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
+        iot->perm = iotlb->perm;
+        riscv_iommu_iot_update(s, iot_cache, iot);
+    }
+
+done:
+    g_hash_table_unref(iot_cache);
+
+    if (enable_pri && fault) {
+        struct riscv_iommu_pq_record pr = {0};
+        if (enable_pasid) {
+            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
+                RISCV_IOMMU_PREQ_HDR_PID, ctx->pasid);
+        }
+        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
+        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) | RISCV_IOMMU_PREQ_PAYLOAD_M;
+        riscv_iommu_pri(s, &pr);
+        return fault;
+    }
+
+    if (enable_faults && fault) {
+        struct riscv_iommu_fq_record ev;
+        const unsigned ttype =
+            (iotlb->perm & IOMMU_RW) ? RISCV_IOMMU_FQ_TTYPE_UADDR_WR :
+            ((iotlb->perm & IOMMU_RO) ? RISCV_IOMMU_FQ_TTYPE_UADDR_RD :
+            RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ);
+        ev.hdr = set_field(0, RISCV_IOMMU_FQ_HDR_CAUSE, fault);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, ttype);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, enable_pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->pasid);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
+        ev.iotval    = iotlb->iova;
+        ev.iotval2   = iotlb->translated_addr;
+        ev._reserved = 0;
+        riscv_iommu_fault(s, &ev);
+        return fault;
+    }
+
+    return 0;
+}
+
+/* IOMMU Command Interface */
+static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
+    uint64_t addr, uint32_t data)
+{
+    /*
+     * ATS processing in this implementation of the IOMMU is synchronous,
+     * no need to wait for completions here.
+     */
+    if (!notify) {
+        return MEMTX_OK;
+    }
+
+    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
+        MEMTXATTRS_UNSPECIFIED);
+}
+
+static void riscv_iommu_ats(RISCVIOMMUState *s,
+    struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
+    IOMMUAccessFlags perm,
+    void (*trace_fn)(const char *id))
+{
+    RISCVIOMMUSpace *as = NULL;
+    IOMMUNotifier *n;
+    IOMMUTLBEvent event;
+    uint32_t pasid;
+    uint32_t devid;
+    const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
+
+    if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
+        /* Use device segment and requester id */
+        devid = get_field(cmd->dword0,
+            RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
+    } else {
+        devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
+    }
+
+    pasid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    if (!as || !as->notifier) {
+        return;
+    }
+
+    event.type = flag;
+    event.entry.perm = perm;
+    event.entry.target_as = s->target_as;
+
+    IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
+        if (!pv || n->iommu_idx == pasid) {
+            event.entry.iova = n->start;
+            event.entry.addr_mask = n->end - n->start;
+            trace_fn(as->iova_mr.parent_obj.name);
+            memory_region_notify_iommu_one(n, &event);
+        }
+    }
+}
+
+static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
+    struct riscv_iommu_command *cmd)
+{
+    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
+                           trace_riscv_iommu_ats_inval);
+}
+
+static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
+    struct riscv_iommu_command *cmd)
+{
+    unsigned resp_code = get_field(cmd->dword1, RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
+    /* Using the access flag to carry response code information */
+    IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
+    return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
+                           trace_riscv_iommu_ats_prgr);
+}
+
+static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
+{
+    uint64_t old_ddtp = s->ddtp;
+    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
+    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
+    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
+    bool ok = false;
+
+    /*
+     * Check for allowed DDTP.MODE transitions:
+     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
+     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
+     */
+    if (new_mode == old_mode ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
+        ok = true;
+    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
+               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
+        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
+    }
+
+    if (ok) {
+        /* clear reserved and busy bits, report back sanitized version */
+        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
+                             RISCV_IOMMU_DDTP_MODE, new_mode);
+    } else {
+        new_ddtp = old_ddtp;
+    }
+    s->ddtp = new_ddtp;
+
+    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
+}
+
+/* Command function and opcode field. */
+#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
+
+static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
+{
+    struct riscv_iommu_command cmd;
+    MemTxResult res;
+    dma_addr_t addr;
+    uint32_t tail, head, ctrl;
+    GHFunc func;
+
+    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
+    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
+
+    /* Check for pending error or queue processing disabled */
+    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
+        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
+        return;
+    }
+
+    while (tail != head) {
+        addr = s->cq_addr  + head * sizeof(cmd);
+        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
+                              MEMTXATTRS_UNSPECIFIED);
+
+        if (res != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, RISCV_IOMMU_CQCSR_CQMF, 0);
+            goto fault;
+        }
+
+        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
+
+        switch (get_field(cmd.dword0, RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC)) {
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
+                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
+            res = riscv_iommu_iofence(s, cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV,
+                cmd.dword1, get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
+
+            if (res != MEMTX_OK) {
+                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                      RISCV_IOMMU_CQCSR_CQMF, 0);
+                goto fault;
+            }
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
+                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
+                goto cmd_ill;
+            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
+                /* invalidate all cache mappings */
+                func = __iot_inval_all;
+            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
+                /* invalidate cache matching GSCID */
+                func = __iot_inval_gscid;
+            } else {
+                /* invalidate cache matching GSCID and ADDR (GPA) */
+                func = __iot_inval_gscid_gpa;
+            }
+            riscv_iommu_iot_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
+                cmd.dword1 & TARGET_PAGE_MASK);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
+                /* invalidate all cache mappings, simplified model */
+                func = __iot_inval_all;
+            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
+                /* invalidate cache matching GSCID, simplified model */
+                func = __iot_inval_gscid;
+            } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
+                /* invalidate cache matching GSCID and PSCID */
+                func = __iot_inval_pscid;
+            } else {
+                /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
+                func = __iot_inval_pscid_iova;
+            }
+            riscv_iommu_iot_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
+                cmd.dword1 & TARGET_PAGE_MASK);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* invalidate all device context cache mappings */
+                func = __ctx_inval_all;
+            } else {
+                /* invalidate all device context matching DID */
+                func = __ctx_inval_devid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* illegal command arguments IODIR_PDT & DV == 0 */
+                goto cmd_ill;
+            } else {
+                func = __ctx_inval_devid_pasid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
+            break;
+
+        /* ATS commands */
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
+                             RISCV_IOMMU_CMD_ATS_OPCODE):
+            riscv_iommu_ats_inval(s, &cmd);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
+                             RISCV_IOMMU_CMD_ATS_OPCODE):
+            riscv_iommu_ats_prgr(s, &cmd);
+            break;
+
+        default:
+        cmd_ill:
+            /* Invalid instruction, do not advance instruction index. */
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
+            goto fault;
+        }
+
+        /* Advance and update head pointer after command completes. */
+        head = (head + 1) & s->cq_mask;
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
+    }
+    return;
+
+fault:
+    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
+    }
+}
+
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
+        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
+        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
+        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
+            RISCV_IOMMU_FQCSR_FQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
+        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
+        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
+        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
+            RISCV_IOMMU_PQCSR_PQOF;
+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
+}
+
+static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
+{
+    uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
+    uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
+    unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
+    unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
+    RISCVIOMMUContext *ctx;
+    void *ref;
+
+    if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
+        return;
+    }
+
+    ctx = riscv_iommu_ctx(s, devid, pid, &ref);
+    if (ctx == NULL) {
+        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
+            RISCV_IOMMU_TR_RESPONSE_FAULT | (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
+    } else {
+        IOMMUTLBEntry iotlb = {
+            .iova = iova,
+            .perm = IOMMU_NONE,
+            .addr_mask = ~0,
+            .target_as = NULL,
+        };
+        int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
+        if (fault) {
+            iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
+        } else {
+            iova = ((iotlb.translated_addr & ~iotlb.addr_mask) >> 2) &
+                RISCV_IOMMU_TR_RESPONSE_PPN;
+        }
+        riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
+    }
+
+    riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
+        RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
+    riscv_iommu_ctx_put(s, ref);
+}
+
+/* Core IOMMU execution activation */
+enum {
+    RISCV_IOMMU_EXEC_DDTP,
+    RISCV_IOMMU_EXEC_CQCSR,
+    RISCV_IOMMU_EXEC_CQT,
+    RISCV_IOMMU_EXEC_FQCSR,
+    RISCV_IOMMU_EXEC_FQH,
+    RISCV_IOMMU_EXEC_PQCSR,
+    RISCV_IOMMU_EXEC_PQH,
+    RISCV_IOMMU_EXEC_TR_REQUEST,
+    /* RISCV_IOMMU_EXEC_EXIT must be the last enum value */
+    RISCV_IOMMU_EXEC_EXIT,
+};
+
+static void *riscv_iommu_core_proc(void* arg)
+{
+    RISCVIOMMUState *s = arg;
+    unsigned exec = 0;
+    unsigned mask = 0;
+
+    while (!(exec & BIT(RISCV_IOMMU_EXEC_EXIT))) {
+        mask = (mask ? mask : BIT(RISCV_IOMMU_EXEC_EXIT)) >> 1;
+        switch (exec & mask) {
+        case BIT(RISCV_IOMMU_EXEC_DDTP):
+            riscv_iommu_process_ddtp(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_CQCSR):
+            riscv_iommu_process_cq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_CQT):
+            riscv_iommu_process_cq_tail(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_FQCSR):
+            riscv_iommu_process_fq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_FQH):
+            /* NOP */
+            break;
+        case BIT(RISCV_IOMMU_EXEC_PQCSR):
+            riscv_iommu_process_pq_control(s);
+            break;
+        case BIT(RISCV_IOMMU_EXEC_PQH):
+            /* NOP */
+            break;
+        case BIT(RISCV_IOMMU_EXEC_TR_REQUEST):
+            riscv_iommu_process_dbg(s);
+            break;
+        }
+        exec &= ~mask;
+        if (!exec) {
+            qemu_mutex_lock(&s->core_lock);
+            exec = s->core_exec;
+            while (!exec) {
+                qemu_cond_wait(&s->core_cond, &s->core_lock);
+                exec = s->core_exec;
+            }
+            s->core_exec = 0;
+            qemu_mutex_unlock(&s->core_lock);
+        }
+    };
+
+    return NULL;
+}
+
+/* For now we assume IOMMU HPM frequency to be 1GHz so 1-cycle is of 1-ns. */
+static inline uint64_t __get_cycles(void)
+{
+    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+}
+
+static void __hpm_setup_timer(RISCVIOMMUState *s, uint64_t value)
+{
+    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
+    uint64_t overflow_at, overflow_ns;
+
+    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
+        return;
+    }
+
+    /*
+     * We are using INT64_MAX here instead to UINT64_MAX because cycle counter
+     * has 63-bit precision and INT64_MAX is the maximum it can store.
+     */
+    if (value) {
+        overflow_ns = INT64_MAX - value + 1;
+    } else {
+        overflow_ns = INT64_MAX;
+    }
+
+    overflow_at = (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + overflow_ns;
+
+    if (overflow_at > INT64_MAX) {
+        s->irq_overflow_left = overflow_at - INT64_MAX;
+        overflow_at = INT64_MAX;
+    }
+
+    timer_mod_anticipate_ns(s->hpm_timer, overflow_at);
+}
+
+/* Updates the internal cycle counter state when iocntinh:CY is changed. */
+static void riscv_iommu_process_iocntinh_cy(RISCVIOMMUState *s,
+                                            bool prev_cy_inh)
+{
+    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
+
+    /* We only need to process CY bit toggle. */
+    if (!(inhibit ^ prev_cy_inh)) {
+        return;
+    }
+
+    if (!(inhibit & RISCV_IOMMU_IOCOUNTINH_CY)) {
+        /*
+         * Cycle counter is enabled. Just start the timer again and update the
+         * clock snapshot value to point to the current time to make sure
+         * iohpmcycles read is correct.
+         */
+        s->hpmcycle_prev = __get_cycles();
+        __hpm_setup_timer(s, s->hpmcycle_val);
+    } else {
+        /*
+         * Cycle counter is disabled. Stop the timer and update the cycle
+         * counter to record the current value which is last programmed
+         * value + the cycles passed so far.
+         */
+        s->hpmcycle_val = s->hpmcycle_val + (__get_cycles() - s->hpmcycle_prev);
+        timer_del(s->hpm_timer);
+    }
+}
+
+static void riscv_iommu_process_hpmcycle_write(RISCVIOMMUState *s)
+{
+    const uint64_t val = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
+    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
+
+    /*
+     * Clear OF bit in IOCNTOVF if it's being cleared in IOHPMCYCLES register.
+     */
+    if (get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY) &&
+        !get_field(val, RISCV_IOMMU_IOHPMCYCLES_OVF)) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0,
+            RISCV_IOMMU_IOCOUNTOVF_CY);
+    }
+
+    s->hpmcycle_val = val & ~RISCV_IOMMU_IOHPMCYCLES_OVF;
+    s->hpmcycle_prev = __get_cycles();
+    __hpm_setup_timer(s, s->hpmcycle_val);
+}
+
+static inline bool __check_valid_event_id(unsigned event_id)
+{
+    return event_id > RISCV_IOMMU_HPMEVENT_INVALID &&
+           event_id < RISCV_IOMMU_HPMEVENT_MAX;
+}
+
+static gboolean __hpm_event_equal(gpointer key, gpointer value, gpointer udata)
+{
+    uint32_t *pair = udata;
+
+    if (GPOINTER_TO_UINT(value) & (1 << pair[0])) {
+        pair[1] = GPOINTER_TO_UINT(key);
+        return true;
+    }
+
+    return false;
+}
+
+/* Caller must check ctr_idx against hpm_ctrs to see if its supported or not. */
+static void __update_event_map(RISCVIOMMUState *s, uint64_t value,
+    uint32_t ctr_idx)
+{
+    unsigned event_id = get_field(value, RISCV_IOMMU_IOHPMEVT_EVENT_ID);
+    uint32_t pair[2] = { ctr_idx, RISCV_IOMMU_HPMEVENT_INVALID };
+    uint32_t new_value = 1 << ctr_idx;
+    gpointer data;
+
+    /* If EventID field is RISCV_IOMMU_HPMEVENT_INVALID remove the current mapping. */
+    if (event_id == RISCV_IOMMU_HPMEVENT_INVALID) {
+        data = g_hash_table_find(s->hpm_event_ctr_map, __hpm_event_equal, pair);
+
+        new_value = GPOINTER_TO_UINT(data) & ~(new_value);
+        pthread_rwlock_wrlock(&s->ht_lock);
+        if (new_value != 0) {
+            g_hash_table_replace(s->hpm_event_ctr_map,
+                                 GUINT_TO_POINTER(pair[1]),
+                                 GUINT_TO_POINTER(new_value));
+        } else {
+            g_hash_table_remove(s->hpm_event_ctr_map,
+                                GUINT_TO_POINTER(pair[1]));
+        }
+        pthread_rwlock_unlock(&s->ht_lock);
+
+        return;
+    }
+
+    /* Update the counter mask if the event is already enabled. */
+    if (g_hash_table_lookup_extended(s->hpm_event_ctr_map,
+                                     GUINT_TO_POINTER(event_id),
+                                     NULL,
+                                     &data)) {
+        new_value |= GPOINTER_TO_UINT(data);
+    }
+
+    pthread_rwlock_wrlock(&s->ht_lock);
+    g_hash_table_insert(s->hpm_event_ctr_map,
+                        GUINT_TO_POINTER(event_id),
+                        GUINT_TO_POINTER(new_value));
+    pthread_rwlock_unlock(&s->ht_lock);
+}
+
+static void riscv_iommu_process_hpmevt_write(RISCVIOMMUState *s,
+                                             uint32_t evt_reg)
+{
+    const uint32_t ctr_idx = (evt_reg - RISCV_IOMMU_REG_IOHPMEVT_BASE) >> 3;
+    const uint32_t ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
+    uint64_t val = riscv_iommu_reg_get64(s, evt_reg);
+
+    if (ctr_idx >= s->hpm_cntrs) {
+        return;
+    }
+
+    /* Clear OF bit in IOCNTOVF if it's being cleared in IOHPMEVT register. */
+    if (get_field(ovf, BIT(ctr_idx + 1)) && !get_field(val, RISCV_IOMMU_IOHPMEVT_OF)) {
+        /* +1 to offset CYCLE register OF bit. */
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF, 0, BIT(ctr_idx + 1));
+    }
+
+    if (!__check_valid_event_id(get_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID))) {
+        /* Reset EventID (WARL) field to invalid. */
+        val = set_field(val, RISCV_IOMMU_IOHPMEVT_EVENT_ID,
+            RISCV_IOMMU_HPMEVENT_INVALID);
+        riscv_iommu_reg_set64(s, evt_reg, val);
+    }
+
+    __update_event_map(s, val, ctr_idx);
+}
+
+static void riscv_iommu_process_hpm_writes(RISCVIOMMUState *s,
+                                           uint32_t regb,
+                                           bool prev_cy_inh)
+{
+    switch (regb) {
+    case RISCV_IOMMU_REG_IOCOUNTINH:
+        riscv_iommu_process_iocntinh_cy(s, prev_cy_inh);
+        break;
+
+    case RISCV_IOMMU_REG_IOHPMCYCLES:
+    case RISCV_IOMMU_REG_IOHPMCYCLES + 4:
+        riscv_iommu_process_hpmcycle_write(s);
+        break;
+
+    case RISCV_IOMMU_REG_IOHPMEVT_BASE ...
+        RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4:
+        riscv_iommu_process_hpmevt_write(s, regb & ~7);
+        break;
+    }
+}
+
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint32_t regb = addr & ~3;
+    bool cy_inh = false;
+    uint32_t busy = 0;
+    uint32_t exec = 0;
+
+    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment or access size */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        /* Unsupported MMIO access location. */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Track actionable MMIO write. */
+    switch (regb) {
+    case RISCV_IOMMU_REG_DDTP:
+    case RISCV_IOMMU_REG_DDTP + 4:
+        exec = BIT(RISCV_IOMMU_EXEC_DDTP);
+        regb = RISCV_IOMMU_REG_DDTP;
+        busy = RISCV_IOMMU_DDTP_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_CQT:
+        exec = BIT(RISCV_IOMMU_EXEC_CQT);
+        break;
+
+    case RISCV_IOMMU_REG_CQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_CQCSR);
+        busy = RISCV_IOMMU_CQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_FQH:
+        exec = BIT(RISCV_IOMMU_EXEC_FQH);
+        break;
+
+    case RISCV_IOMMU_REG_FQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_FQCSR);
+        busy = RISCV_IOMMU_FQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_PQH:
+        exec = BIT(RISCV_IOMMU_EXEC_PQH);
+        break;
+
+    case RISCV_IOMMU_REG_PQCSR:
+        exec = BIT(RISCV_IOMMU_EXEC_PQCSR);
+        busy = RISCV_IOMMU_PQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_IOCOUNTINH:
+        if (addr != RISCV_IOMMU_REG_IOCOUNTINH) {
+            break;
+        }
+
+        /* Store previous value of CY bit. */
+        cy_inh = !!(riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH) &
+            RISCV_IOMMU_IOCOUNTINH_CY);
+        break;
+
+    case RISCV_IOMMU_REG_TR_REQ_CTL:
+        exec = BIT(RISCV_IOMMU_EXEC_TR_REQUEST);
+        regb = RISCV_IOMMU_REG_TR_REQ_CTL;
+        busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
+        break;
+    }
+
+    /*
+     * Registers update might be not synchronized with core logic.
+     * If system software updates register when relevant BUSY bit is set
+     * IOMMU behavior of additional writes to the register is UNSPECIFIED
+     */
+
+    qemu_spin_lock(&s->regs_lock);
+    if (size == 1) {
+        uint8_t ro = s->regs_ro[addr];
+        uint8_t wc = s->regs_wc[addr];
+        uint8_t rw = s->regs_rw[addr];
+        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
+    } else if (size == 2) {
+        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
+        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
+        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
+        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 4) {
+        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
+        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
+        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
+        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 8) {
+        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
+        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
+        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
+        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    }
+
+    /* Busy flag update, MSB 4-byte register. */
+    if (busy) {
+        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
+        stl_le_p(&s->regs_rw[regb], rw | busy);
+    }
+    qemu_spin_unlock(&s->regs_lock);
+
+    /* Process HPM writes and update any internal state if needed. */
+    if (regb >= RISCV_IOMMU_REG_IOCOUNTOVF &&
+        regb <= (RISCV_IOMMU_REG_IOHPMEVT(RISCV_IOMMU_IOCOUNT_NUM) + 4)) {
+        riscv_iommu_process_hpm_writes(s, regb, cy_inh);
+    }
+
+    /* Wake up core processing thread. */
+    if (exec) {
+        qemu_mutex_lock(&s->core_lock);
+        s->core_exec |= exec;
+        qemu_cond_signal(&s->core_cond);
+        qemu_mutex_unlock(&s->core_lock);
+    }
+
+    return MEMTX_OK;
+}
+
+static uint64_t riscv_iommu_hpmcycle_read(RISCVIOMMUState *s)
+{
+    const uint64_t cycle = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_IOHPMCYCLES);
+    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
+    const uint64_t ctr_prev = s->hpmcycle_prev;
+    const uint64_t ctr_val = s->hpmcycle_val;
+
+    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
+        /*
+         * Counter should not increment if inhibit bit is set. We can't really
+         * stop the QEMU_CLOCK_VIRTUAL, so we just return the last updated
+         * counter value to indicate that counter was not incremented.
+         */
+        return (ctr_val & RISCV_IOMMU_IOHPMCYCLES_COUNTER) |
+               (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
+    }
+
+    return (ctr_val + __get_cycles() - ctr_prev) |
+        (cycle & RISCV_IOMMU_IOHPMCYCLES_OVF);
+}
+
+static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint64_t val = -1;
+    uint8_t *ptr;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment. */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Compute cycle register value. */
+    if ((addr & ~7) == RISCV_IOMMU_REG_IOHPMCYCLES) {
+        val = riscv_iommu_hpmcycle_read(s);
+        ptr = (uint8_t *)&val + (addr & 7);
+    } else if ((addr & ~3) == RISCV_IOMMU_REG_IOCOUNTOVF) {
+        /*
+         * Software can read RISCV_IOMMU_REG_IOCOUNTOVF before timer callback completes.
+         * In which case CY_OF bit in RISCV_IOMMU_IOHPMCYCLES_OVF would be 0. Here we
+         * take the CY_OF bit state from RISCV_IOMMU_REG_IOHPMCYCLES register as it's
+         * not dependent over the timer callback and is computed from cycle
+         * overflow.
+         */
+        val = ldq_le_p(&s->regs_rw[addr]);
+        val |= (riscv_iommu_hpmcycle_read(s) & RISCV_IOMMU_IOHPMCYCLES_OVF)
+                   ? RISCV_IOMMU_IOCOUNTOVF_CY
+                   : 0;
+        ptr = (uint8_t *)&val + (addr & 3);
+    } else {
+        ptr = &s->regs_rw[addr];
+    }
+
+    if (size == 1) {
+        val = (uint64_t)*ptr;
+    } else if (size == 2) {
+        val = lduw_le_p(ptr);
+    } else if (size == 4) {
+        val = ldl_le_p(ptr);
+    } else if (size == 8) {
+        val = ldq_le_p(ptr);
+    } else {
+        return MEMTX_ERROR;
+    }
+
+    *data = val;
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps riscv_iommu_mmio_ops = {
+    .read_with_attrs = riscv_iommu_mmio_read,
+    .write_with_attrs = riscv_iommu_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    }
+};
+
+/*
+ * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
+ * memory region as untranslated address, for additional MSI/MRIF interception
+ * by IOMMU interrupt remapping implementation.
+ * Note: Device emulation code generating an MSI is expected to provide a valid
+ * memory transaction attributes with requested_id set.
+ */
+static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
+    RISCVIOMMUContext *ctx;
+    MemTxResult res;
+    void *ref;
+    uint32_t devid = attrs.requester_id;
+
+    if (attrs.unspecified) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
+    if (ctx == NULL) {
+        res = MEMTX_ACCESS_ERROR;
+    } else {
+        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
+    }
+    riscv_iommu_ctx_put(s, ref);
+    return res;
+}
+
+static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    return MEMTX_ACCESS_ERROR;
+}
+
+static const MemoryRegionOps riscv_iommu_trap_ops = {
+    .read_with_attrs = riscv_iommu_trap_read,
+    .write_with_attrs = riscv_iommu_trap_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    }
+};
+
+/* Timer callback for cycle counter overflow. */
+static void riscv_iommu_hpm_timer_cb(void *priv)
+{
+    RISCVIOMMUState *s = priv;
+    const uint32_t inhibit = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTINH);
+    uint32_t ovf;
+
+    if (get_field(inhibit, RISCV_IOMMU_IOCOUNTINH_CY)) {
+        return;
+    }
+
+    if (s->irq_overflow_left > 0) {
+        uint64_t irq_trigger_at =
+            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->irq_overflow_left;
+        timer_mod_anticipate_ns(s->hpm_timer, irq_trigger_at);
+        s->irq_overflow_left = 0;
+        return;
+    }
+
+    ovf = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IOCOUNTOVF);
+    if (!get_field(ovf, RISCV_IOMMU_IOCOUNTOVF_CY)) {
+        /*
+         * We don't need to set hpmcycle_val to zero and update hpmcycle_prev to
+         * current clock value. The way we calculate iohpmcycs will overflow
+         * and return the correct value. This avoids the need to synchronize
+         * timer callback and write callback.
+         */
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IOCOUNTOVF,
+            RISCV_IOMMU_IOCOUNTOVF_CY, 0);
+        riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_IOHPMCYCLES,
+            RISCV_IOMMU_IOHPMCYCLES_OVF, 0);
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PM);
+    }
+}
+
+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+    const uint64_t cap_implemented =
+        RISCV_IOMMU_CAP_MSI_FLAT |
+        RISCV_IOMMU_CAP_MSI_MRIF |
+        RISCV_IOMMU_CAP_ATS |
+        RISCV_IOMMU_CAP_S_SV32 |
+        RISCV_IOMMU_CAP_S_SV39 |
+        RISCV_IOMMU_CAP_S_SV48 |
+        RISCV_IOMMU_CAP_S_SV57 |
+        RISCV_IOMMU_CAP_G_SV32 |
+        RISCV_IOMMU_CAP_G_SV39 |
+        RISCV_IOMMU_CAP_G_SV48 |
+        RISCV_IOMMU_CAP_G_SV57 |
+        RISCV_IOMMU_CAP_MSI_FLAT |
+        RISCV_IOMMU_CAP_MSI_MRIF |
+        RISCV_IOMMU_CAP_ATS |
+        RISCV_IOMMU_CAP_IGS |
+        RISCV_IOMMU_CAP_HPM |
+        RISCV_IOMMU_CAP_DBG |
+        RISCV_IOMMU_CAP_PD8 |
+        RISCV_IOMMU_CAP_PD17 |
+        RISCV_IOMMU_CAP_PD20;
+
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    s->cap &= cap_implemented;
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_VERSION, s->version);
+
+    if (s->hpm_cntrs > RISCV_IOMMU_IOCOUNT_NUM) {
+        /* Clip number of HPM counters to maximum supported (31). */
+        s->hpm_cntrs = RISCV_IOMMU_IOCOUNT_NUM;
+    } else if (s->hpm_cntrs == 0) {
+        /* Disable hardware performance monitor interface */
+        s->cap |= RISCV_IOMMU_CAP_HPM;
+    }
+
+    /* Verify supported IGS */
+    switch (get_field(s->cap, RISCV_IOMMU_CAP_IGS)) {
+    case RISCV_IOMMU_CAP_IGS_MSI:
+    case RISCV_IOMMU_CAP_IGS_WSI:
+        break;
+    default:
+        error_setg(errp, "can't support requested IGS mode: cap: %" PRIx64,
+            s->cap);
+        return;
+    }
+
+    /* Report QEMU target physical address space limits */
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS, TARGET_PHYS_ADDR_SPACE_BITS);
+
+    /* Adjust reported PD capabilities */
+    if (s->pasid_bits < 20) {
+        s->cap &= ~RISCV_IOMMU_CAP_PD20;
+    } else if (s->pasid_bits < 17) {
+        s->cap &= ~RISCV_IOMMU_CAP_PD17;
+    } else if (s->pasid_bits < 8) {
+        s->cap &= ~RISCV_IOMMU_CAP_PD8;
+    }
+
+    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
+    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
+
+    /* register storage */
+    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+     /* Mark all registers read-only */
+    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+    /*
+     * Register complete MMIO space, including MSI/PBA registers.
+     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
+     * managed directly by the PCIDevice implementation.
+     */
+    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+    /* Set power-on register state */
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
+        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
+        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
+        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
+        RISCV_IOMMU_CQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
+        RISCV_IOMMU_FQCSR_FQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
+        RISCV_IOMMU_FQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
+        RISCV_IOMMU_PQCSR_PQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
+        RISCV_IOMMU_PQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
+    /* If HPM registers are enabled. */
+    if (s->cap & RISCV_IOMMU_CAP_HPM) {
+        /* +1 for cycle counter bit. */
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOCOUNTINH], ~((2 << s->hpm_cntrs) - 1));
+        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCYCLES], 0);
+        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMCTR_BASE], 0x00, s->hpm_cntrs * 8);
+        memset(&s->regs_ro[RISCV_IOMMU_REG_IOHPMEVT_BASE], 0x00, s->hpm_cntrs * 8);
+    }
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_IVEC], 0);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
+    /* If debug registers enabled. */
+    if (s->cap & RISCV_IOMMU_CAP_DBG) {
+        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
+        stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
+            RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
+    }
+
+    /* Memory region for downstream access, if specified. */
+    if (s->target_mr) {
+        s->target_as = g_new0(AddressSpace, 1);
+        address_space_init(s->target_as, s->target_mr,
+            "riscv-iommu-downstream");
+    } else {
+        /* Fallback to global system memory. */
+        s->target_as = &address_space_memory;
+    }
+
+    /* Memory region for untranslated MRIF/MSI writes */
+    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
+            "riscv-iommu-trap", ~0ULL);
+    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
+
+    /* Device translation context cache */
+    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                         g_free, NULL);
+    s->iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
+                                         g_free, NULL);
+
+    if (s->cap & RISCV_IOMMU_CAP_HPM) {
+        s->hpm_event_ctr_map = g_hash_table_new(g_direct_hash, g_direct_equal);
+        pthread_rwlock_init(&s->ht_lock, NULL);
+        s->hpm_timer =
+            timer_new_ns(QEMU_CLOCK_VIRTUAL, riscv_iommu_hpm_timer_cb, s);
+    }
+
+    s->iommus.le_next = NULL;
+    s->iommus.le_prev = NULL;
+    QLIST_INIT(&s->spaces);
+    qemu_cond_init(&s->core_cond);
+    qemu_mutex_init(&s->core_lock);
+    qemu_spin_init(&s->regs_lock);
+    qemu_thread_create(&s->core_proc, "riscv-iommu-core",
+        riscv_iommu_core_proc, s, QEMU_THREAD_JOINABLE);
+}
+
+static void riscv_iommu_unrealize(DeviceState *dev)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    qemu_mutex_lock(&s->core_lock);
+    /* cancel pending operations and stop */
+    s->core_exec = BIT(RISCV_IOMMU_EXEC_EXIT);
+    qemu_cond_signal(&s->core_cond);
+    qemu_mutex_unlock(&s->core_lock);
+    qemu_thread_join(&s->core_proc);
+    qemu_cond_destroy(&s->core_cond);
+    qemu_mutex_destroy(&s->core_lock);
+    if (s->cap & RISCV_IOMMU_CAP_HPM) {
+        timer_free(s->hpm_timer);
+        pthread_rwlock_destroy(&s->ht_lock);
+        g_hash_table_unref(s->hpm_event_ctr_map);
+    }
+    g_hash_table_unref(s->iot_cache);
+    g_hash_table_unref(s->ctx_cache);
+}
+
+static Property riscv_iommu_properties[] = {
+    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
+        RISCV_IOMMU_SPEC_DOT_VER),
+    DEFINE_PROP_UINT64("capabilities", RISCVIOMMUState, cap, ~0ULL),
+    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
+    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
+    DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
+        LIMIT_CACHE_IOT),
+    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
+        TYPE_MEMORY_REGION, MemoryRegion *),
+    DEFINE_PROP_UINT8("hpm-counters", RISCVIOMMUState, hpm_cntrs,
+        RISCV_IOMMU_IOCOUNT_NUM),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void riscv_iommu_class_init(ObjectClass *klass, void* data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
+    dc->user_creatable = false;
+    dc->realize = riscv_iommu_realize;
+    dc->unrealize = riscv_iommu_unrealize;
+    device_class_set_props(dc, riscv_iommu_properties);
+}
+
+static const TypeInfo riscv_iommu_info = {
+    .name = TYPE_RISCV_IOMMU,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(RISCVIOMMUState),
+    .class_init = riscv_iommu_class_init,
+};
+
+static const char *IOMMU_FLAG_STR[] = {
+    "NA",
+    "RO",
+    "WR",
+    "RW",
+};
+
+/* RISC-V IOMMU Memory Region - Address Translation Space */
+static IOMMUTLBEntry riscv_iommu_memory_region_translate(
+    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
+    IOMMUAccessFlags flag, int iommu_idx)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    RISCVIOMMUContext *ctx;
+    void *ref;
+    IOMMUTLBEntry iotlb = {
+        .iova = addr,
+        .target_as = as->iommu->target_as,
+        .addr_mask = ~0ULL,
+        .perm = flag,
+    };
+
+    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
+    if (ctx == NULL) {
+        /* Translation disabled or invalid. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
+        /* Translation disabled or fault reported. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    }
+
+    /* Trace all dma translations with original access flags. */
+    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
+                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
+                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
+                          iotlb.translated_addr);
+
+    riscv_iommu_ctx_put(as->iommu, ref);
+
+    return iotlb;
+}
+
+static int riscv_iommu_memory_region_notify(
+    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
+    IOMMUNotifierFlag new, Error **errp)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+
+    if (old == IOMMU_NOTIFIER_NONE) {
+        as->notifier = true;
+        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
+    } else if (new == IOMMU_NOTIFIER_NONE) {
+        as->notifier = false;
+        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
+    }
+
+    return 0;
+}
+
+static inline bool pci_is_iommu(PCIDevice *pdev)
+{
+    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
+}
+
+static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
+{
+    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    AddressSpace *as = NULL;
+
+    if (pdev && pci_is_iommu(pdev)) {
+        return s->target_as;
+    }
+
+    /* Find first registered IOMMU device */
+    while (s->iommus.le_prev) {
+        s = *(s->iommus.le_prev);
+    }
+
+    /* Find first matching IOMMU */
+    while (s != NULL && as == NULL) {
+        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
+        s = s->iommus.le_next;
+    }
+
+    return as ? as : &address_space_memory;
+}
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+    Error **errp)
+{
+    if (bus->iommu_fn == riscv_iommu_find_as) {
+        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
+        QLIST_INSERT_AFTER(last, iommu, iommus);
+    } else if (bus->iommu_fn == NULL) {
+        pci_setup_iommu(bus, riscv_iommu_find_as, iommu);
+    } else {
+        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
+            pci_bus_num(bus));
+    }
+}
+
+static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
+    MemTxAttrs attrs)
+{
+    return RISCV_IOMMU_NOPASID;
+}
+
+static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
+{
+    return 1;
+}
+
+static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    imrc->translate = riscv_iommu_memory_region_translate;
+    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
+    imrc->attrs_to_index = riscv_iommu_memory_region_index;
+    imrc->num_indexes = riscv_iommu_memory_region_index_len;
+}
+
+static const TypeInfo riscv_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
+    .class_init = riscv_iommu_memory_region_init,
+};
+
+static void riscv_iommu_register_mr_types(void)
+{
+    type_register_static(&riscv_iommu_memory_region_info);
+    type_register_static(&riscv_iommu_info);
+}
+
+type_init(riscv_iommu_register_mr_types);
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
new file mode 100644
index 0000000000..c68e09db58
--- /dev/null
+++ b/hw/riscv/riscv-iommu.h
@@ -0,0 +1,152 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_STATE_H
+#define HW_RISCV_IOMMU_STATE_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "hw/riscv/iommu.h"
+
+struct RISCVIOMMUState {
+    /*< private >*/
+    DeviceState parent_obj;
+
+    /*< public >*/
+    uint32_t version;     /* Reported interface version number */
+    uint32_t pasid_bits;  /* process identifier width */
+    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
+
+    uint64_t cap;         /* IOMMU supported capabilities */
+    uint64_t fctl;        /* IOMMU enabled features */
+    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
+
+    /* IOMMU Internal State */
+    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
+
+    dma_addr_t cq_addr;   /* Command queue base physical address */
+    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
+    dma_addr_t pq_addr;   /* Page request queue base physical address */
+
+    uint32_t cq_mask;     /* Command queue index bit mask */
+    uint32_t fq_mask;     /* Fault/event queue index bit mask */
+    uint32_t pq_mask;     /* Page request queue index bit mask */
+
+    /* interrupt notifier */
+    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
+
+    /* IOMMU State Machine */
+    QemuThread core_proc; /* Background processing thread */
+    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
+    QemuCond core_cond;   /* Background processing wake up signal */
+    unsigned core_exec;   /* Processing thread execution actions */
+
+    /* IOMMU target address space */
+    AddressSpace *target_as;
+    MemoryRegion *target_mr;
+
+    /* MSI / MRIF access trap */
+    AddressSpace trap_as;
+    MemoryRegion trap_mr;
+
+    GHashTable *ctx_cache;          /* Device translation Context Cache */
+    GHashTable *iot_cache;          /* IO Translated Address Cache */
+    unsigned iot_limit;             /* IO Translation Cache size limit */
+
+    /* HPM cycle counter */
+    QEMUTimer *hpm_timer;
+    uint64_t hpmcycle_val;      /* Current value of cycle register */
+    uint64_t hpmcycle_prev;     /* Saved value of QEMU_CLOCK_VIRTUAL clock */
+    uint64_t irq_overflow_left; /* Value beyond INT64_MAX after overflow */
+
+    /* HPM event counters */
+    uint8_t hpm_cntrs;
+    GHashTable *hpm_event_ctr_map; /* Mapping of events to counters */
+    pthread_rwlock_t ht_lock;      /* Lock used for hpm_event_ctr_map updates */
+
+    /* MMIO Hardware Interface */
+    MemoryRegion regs_mr;
+    QemuSpin regs_lock;
+    uint8_t *regs_rw;  /* register state (user write) */
+    uint8_t *regs_wc;  /* write-1-to-clear mask */
+    uint8_t *regs_ro;  /* read-only mask */
+
+    QLIST_ENTRY(RISCVIOMMUState) iommus;
+    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
+};
+
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+         Error **errp);
+
+/* private helpers */
+
+/* Register helper functions */
+static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set, uint32_t clr)
+{
+    uint32_t val;
+    qemu_spin_lock(&s->regs_lock);
+    val = ldl_le_p(s->regs_rw + idx);
+    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stl_le_p(s->regs_rw + idx, set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldl_le_p(s->regs_rw + idx);
+}
+
+static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set, uint64_t clr)
+{
+    uint64_t val;
+    qemu_spin_lock(&s->regs_lock);
+    val = ldq_le_p(s->regs_rw + idx);
+    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+    return val;
+}
+
+static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stq_le_p(s->regs_rw + idx, set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldq_le_p(s->regs_rw + idx);
+}
+
+
+
+#endif
diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
new file mode 100644
index 0000000000..fd5e21e3d4
--- /dev/null
+++ b/hw/riscv/trace-events
@@ -0,0 +1,14 @@ 
+# See documentation at docs/devel/tracing.rst
+
+# riscv-iommu.c
+riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
+riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
+riscv_iommu_ats(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: translate request %04x:%02x.%u iova: 0x%"PRIx64
+riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
+riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
+riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
+riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
+riscv_iommu_ats_inval(const char *id) "%s: dev-iotlb invalidate"
+riscv_iommu_ats_prgr(const char *id) "%s: dev-iotlb page request group response"
diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
new file mode 100644
index 0000000000..b88504b750
--- /dev/null
+++ b/hw/riscv/trace.h
@@ -0,0 +1,2 @@ 
+#include "trace/trace-hw_riscv.h"
+
diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
new file mode 100644
index 0000000000..2a63a5cbf2
--- /dev/null
+++ b/include/hw/riscv/iommu.h
@@ -0,0 +1,40 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU (Ziommu)
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_H
+#define HW_RISCV_IOMMU_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#define TYPE_RISCV_IOMMU "x-riscv-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
+typedef struct RISCVIOMMUState RISCVIOMMUState;
+
+#define TYPE_RISCV_IOMMU_MEMORY_REGION "x-riscv-iommu-mr"
+typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
+
+#define TYPE_RISCV_IOMMU_PCI "x-riscv-iommu-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
+typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
+
+#define TYPE_RISCV_IOMMU_SYS "x-riscv-iommu-device"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStateSys, RISCV_IOMMU_SYS)
+typedef struct RISCVIOMMUStateSys RISCVIOMMUStateSys;
+
+#endif
diff --git a/meson.build b/meson.build
index 5fcdb37a71..693ea3447d 100644
--- a/meson.build
+++ b/meson.build
@@ -3268,6 +3268,7 @@  if have_system
     'hw/rdma',
     'hw/rdma/vmw',
     'hw/rtc',
+    'hw/riscv',
     'hw/s390x',
     'hw/scsi',
     'hw/sd',