diff mbox series

[v2,09/15] hw/riscv/riscv-iommu: add s-stage and g-stage support

Message ID 20240307160319.675044-10-dbarboza@ventanamicro.com (mailing list archive)
State New, archived
Headers show
Series riscv: QEMU RISC-V IOMMU Support | expand

Commit Message

Daniel Henrique Barboza March 7, 2024, 4:03 p.m. UTC
From: Tomasz Jeznach <tjeznach@rivosinc.com>

Add support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage
(sv32x4, sv39x4, sv48x4, sv57x4 caps). Most of the work is done in the
riscv_iommu_spa_fetch() function that now has to consider how many
translation stages we need to walk the page table.

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 hw/riscv/riscv-iommu-bits.h |  11 ++
 hw/riscv/riscv-iommu.c      | 282 ++++++++++++++++++++++++++++++++++--
 hw/riscv/riscv-iommu.h      |   2 +
 3 files changed, 286 insertions(+), 9 deletions(-)

Comments

Frank Chang May 10, 2024, 10:36 a.m. UTC | #1
Hi Daniel,

Daniel Henrique Barboza <dbarboza@ventanamicro.com> 於 2024年3月8日 週五 上午12:09寫道:
>
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> Add support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage
> (sv32x4, sv39x4, sv48x4, sv57x4 caps). Most of the work is done in the
> riscv_iommu_spa_fetch() function that now has to consider how many
> translation stages we need to walk the page table.
>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>  hw/riscv/riscv-iommu-bits.h |  11 ++
>  hw/riscv/riscv-iommu.c      | 282 ++++++++++++++++++++++++++++++++++--
>  hw/riscv/riscv-iommu.h      |   2 +
>  3 files changed, 286 insertions(+), 9 deletions(-)
>
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index 8e80b1e52a..9d645d69ea 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -71,6 +71,14 @@ struct riscv_iommu_pq_record {
>  /* 5.3 IOMMU Capabilities (64bits) */
>  #define RISCV_IOMMU_REG_CAP             0x0000
>  #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>  #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>  #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>  #define RISCV_IOMMU_CAP_IGS             GENMASK_ULL(29, 28)
> @@ -79,6 +87,7 @@ struct riscv_iommu_pq_record {
>
>  /* 5.4 Features control register (32bits) */
>  #define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>
>  /* 5.5 Device-directory-table pointer (64bits) */
>  #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -195,6 +204,8 @@ struct riscv_iommu_dc {
>  #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>  #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>  #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>  #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>  #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
>
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> index 0b93146327..03a610fa75 100644
> --- a/hw/riscv/riscv-iommu.c
> +++ b/hw/riscv/riscv-iommu.c
> @@ -58,6 +58,8 @@ struct RISCVIOMMUContext {
>      uint64_t __rfu:20;          /* reserved */
>      uint64_t tc;                /* Translation Control */
>      uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
>      uint64_t msi_addr_mask;     /* MSI filtering - address mask */
>      uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
>      uint64_t msiptp;            /* MSI redirection page table pointer */
> @@ -194,12 +196,46 @@ static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>      return true;
>  }
>
> -/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @gpa      : provided IOVA is a guest physical address, use G-Stage only.
> + * @return   : success or fault cause code.
> + */
>  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> -    IOMMUTLBEntry *iotlb)
> +    IOMMUTLBEntry *iotlb, bool gpa)
>  {
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
>      /* Early check for MSI address match when IOVA == GPA */
> -    if (iotlb->perm & IOMMU_WO &&
> +    if (!en_s && (iotlb->perm & IOMMU_WO) &&

I'm wondering do we need to check "en_s" for MSI writes?

IOMMU spec Section 2.3.3. Process to translate addresses of MSIs says:
"Determine if the address A is an access to a virtual interrupt file
as specified in Section 2.1.3.6."

and Section 2.1.3.6 says:

"An incoming memory access made by a device is recognized as
an access to a virtual interrupt file if the destination guest physical page
matches the supplied address pattern in all bit positions that are zeros
in the supplied address mask. In detail, a memory access to
guest physical address A is recognized as an access to a virtual
interrupt file’s
memory-mapped page if:
(A >> 12) & ~msi_addr_mask = (msi_addr_pattern & ~msi_addr_mask)"

Is checking the address pattern sufficient enough to determine
the address is an MSI to a virtual interrupt file?

>          riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
>          iotlb->target_as = &s->trap_as;
>          iotlb->translated_addr = iotlb->iova;
> @@ -208,11 +244,196 @@ static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>      }
>
>      /* Exit early for pass-through mode. */
> -    iotlb->translated_addr = iotlb->iova;
> -    iotlb->addr_mask = ~TARGET_PAGE_MASK;
> -    /* Allow R/W in pass-through mode */
> -    iotlb->perm = IOMMU_RW;
> -    return 0;
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (sc[pass].ptesize == 4) {
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {

Replace 0 with S_STAGE?

> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }

May I ask under which case we will continue to walk the S-stage translation
after leaf PTE is found?

I thought the translation combinations are:
S-stage (i.e. Single-stage translation)
G-stage (i.e. G-stage only translation)
S-stage -> G-stage (i.e. Nested translation)

> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }

Will this if condition ever be executed?

For S-stage -> G-stage (i.e. Nested translation),
G-stage translation should be continued by
the S-stage Leaf PTE's if condition above?

> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
>  }
>
>  /* Redirect MSI write for given GPA. */
> @@ -351,6 +572,10 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>
>      case RISCV_IOMMU_DDTP_MODE_BARE:
>          /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
>          ctx->tc = RISCV_IOMMU_DC_TC_V;
>          ctx->ta = 0;
>          ctx->msiptp = 0;
> @@ -424,6 +649,8 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>
>      /* Set translation context. */
>      ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
>      ctx->ta = le64_to_cpu(dc.ta);
>      ctx->msiptp = le64_to_cpu(dc.msiptp);
>      ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> @@ -433,14 +660,38 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>          return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>      }
>
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
>      if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>          if (ctx->pasid != RISCV_IOMMU_NOPASID) {
>              /* PASID is disabled */
>              return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>          }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
>          return 0;
>      }
>
> +    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default PASID enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default PASID #0 */
> +            ctx->pasid = 0;

How do we differentiate between the default PASID: 0
and RISCV_IOMMU_NOPASID?

Regards,
Frank Chang

> +        }
> +    }
> +
>      /* FSC.TC.PDTV enabled */
>      if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
>          /* Invalid PDTP.MODE */
> @@ -474,6 +725,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>
>      /* Use FSC and TA from process directory entry. */
>      ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
>
>      return 0;
>  }
> @@ -710,6 +962,7 @@ static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
>      GHashTable *iot_cache, hwaddr iova)
>  {
>      RISCVIOMMUEntry key = {
> +        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
>          .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
>          .iova  = PPN_DOWN(iova),
>      };
> @@ -779,7 +1032,7 @@ static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>      }
>
>      /* Translate using device directory / page table information. */
> -    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
>
>      if (!fault && iotlb->target_as == &s->trap_as) {
>          /* Do not cache trapped MSI translations */
> @@ -790,6 +1043,7 @@ static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>          iot = g_new0(RISCVIOMMUEntry, 1);
>          iot->iova = PPN_DOWN(iotlb->iova);
>          iot->phys = PPN_DOWN(iotlb->translated_addr);
> +        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
>          iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
>          iot->perm = iotlb->perm;
>          riscv_iommu_iot_update(s, iot_cache, iot);
> @@ -1394,6 +1648,14 @@ static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>      if (s->enable_msi) {
>          s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>      }
> +    if (s->enable_s_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> +    }
> +    if (s->enable_g_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> +    }
>      /* Report QEMU target physical address space limits */
>      s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>                         TARGET_PHYS_ADDR_SPACE_BITS);
> @@ -1504,6 +1766,8 @@ static Property riscv_iommu_properties[] = {
>          LIMIT_CACHE_IOT),
>      DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
>      DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
>      DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
>          TYPE_MEMORY_REGION, MemoryRegion *),
>      DEFINE_PROP_END_OF_LIST(),
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> index eea2123686..9b33fb97ef 100644
> --- a/hw/riscv/riscv-iommu.h
> +++ b/hw/riscv/riscv-iommu.h
> @@ -38,6 +38,8 @@ struct RISCVIOMMUState {
>
>      bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>      bool enable_msi;      /* Enable MSI remapping */
> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> +    bool enable_g_stage;  /* Enable G-Stage translation */
>
>      /* IOMMU Internal State */
>      uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> --
> 2.43.2
>
>
Andrew Jones May 10, 2024, 11:14 a.m. UTC | #2
On Fri, May 10, 2024 at 06:36:51PM GMT, Frank Chang wrote:
...
> >  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> > -    IOMMUTLBEntry *iotlb)
> > +    IOMMUTLBEntry *iotlb, bool gpa)
> >  {
> > +    dma_addr_t addr, base;
> > +    uint64_t satp, gatp, pte;
> > +    bool en_s, en_g;
> > +    struct {
> > +        unsigned char step;
> > +        unsigned char levels;
> > +        unsigned char ptidxbits;
> > +        unsigned char ptesize;
> > +    } sc[2];
> > +    /* Translation stage phase */
> > +    enum {
> > +        S_STAGE = 0,
> > +        G_STAGE = 1,
> > +    } pass;
> > +
> > +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> > +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> > +
> > +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
> > +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> > +
> >      /* Early check for MSI address match when IOVA == GPA */
> > -    if (iotlb->perm & IOMMU_WO &&
> > +    if (!en_s && (iotlb->perm & IOMMU_WO) &&
> 
> I'm wondering do we need to check "en_s" for MSI writes?
> 
> IOMMU spec Section 2.3.3. Process to translate addresses of MSIs says:
> "Determine if the address A is an access to a virtual interrupt file
> as specified in Section 2.1.3.6."
> 
> and Section 2.1.3.6 says:
> 
> "An incoming memory access made by a device is recognized as
> an access to a virtual interrupt file if the destination guest physical page
> matches the supplied address pattern in all bit positions that are zeros
> in the supplied address mask. In detail, a memory access to
> guest physical address A is recognized as an access to a virtual
> interrupt file’s
> memory-mapped page if:
> (A >> 12) & ~msi_addr_mask = (msi_addr_pattern & ~msi_addr_mask)"
> 
> Is checking the address pattern sufficient enough to determine
> the address is an MSI to a virtual interrupt file?
>

I think so. In fact, I've removed that en_s check on our internal build in
order to get things working for my irqbypass work, as we can do device
assignment with VFIO with only S-stage enabled.

Thanks,
drew
Daniel Henrique Barboza May 16, 2024, 7:41 p.m. UTC | #3
On 5/10/24 08:14, Andrew Jones wrote:
> On Fri, May 10, 2024 at 06:36:51PM GMT, Frank Chang wrote:
> ...
>>>   static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>>> -    IOMMUTLBEntry *iotlb)
>>> +    IOMMUTLBEntry *iotlb, bool gpa)
>>>   {
>>> +    dma_addr_t addr, base;
>>> +    uint64_t satp, gatp, pte;
>>> +    bool en_s, en_g;
>>> +    struct {
>>> +        unsigned char step;
>>> +        unsigned char levels;
>>> +        unsigned char ptidxbits;
>>> +        unsigned char ptesize;
>>> +    } sc[2];
>>> +    /* Translation stage phase */
>>> +    enum {
>>> +        S_STAGE = 0,
>>> +        G_STAGE = 1,
>>> +    } pass;
>>> +
>>> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
>>> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
>>> +
>>> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
>>> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
>>> +
>>>       /* Early check for MSI address match when IOVA == GPA */
>>> -    if (iotlb->perm & IOMMU_WO &&
>>> +    if (!en_s && (iotlb->perm & IOMMU_WO) &&
>>
>> I'm wondering do we need to check "en_s" for MSI writes?
>>
>> IOMMU spec Section 2.3.3. Process to translate addresses of MSIs says:
>> "Determine if the address A is an access to a virtual interrupt file
>> as specified in Section 2.1.3.6."
>>
>> and Section 2.1.3.6 says:
>>
>> "An incoming memory access made by a device is recognized as
>> an access to a virtual interrupt file if the destination guest physical page
>> matches the supplied address pattern in all bit positions that are zeros
>> in the supplied address mask. In detail, a memory access to
>> guest physical address A is recognized as an access to a virtual
>> interrupt file’s
>> memory-mapped page if:
>> (A >> 12) & ~msi_addr_mask = (msi_addr_pattern & ~msi_addr_mask)"
>>
>> Is checking the address pattern sufficient enough to determine
>> the address is an MSI to a virtual interrupt file?
>>
> 
> I think so. In fact, I've removed that en_s check on our internal build in
> order to get things working for my irqbypass work, as we can do device
> assignment with VFIO with only S-stage enabled.

The following code will be fixed up here:

  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
-    IOMMUTLBEntry *iotlb, bool gpa)
+    IOMMUTLBEntry *iotlb)
  {
      dma_addr_t addr, base;
      uint64_t satp, gatp, pte;
@@ -238,11 +237,11 @@ static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
      satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
      gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
  
-    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
      en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
  
      /* Early check for MSI address match when IOVA == GPA */
-    if (!en_s && (iotlb->perm & IOMMU_WO) &&
+    if ((iotlb->perm & IOMMU_WO) &&
          riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
          iotlb->target_as = &s->trap_as;
          iotlb->translated_addr = iotlb->iova;
@@ -1203,7 +1202,7 @@ static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
      }
  
      /* Translate using device directory / page table information. */
-    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
  
      if (!fault && iotlb->target_as == &s->trap_as) {
          /* Do not cache trapped MSI translations */

'gpa' is eliminated since it was only being used as 'false' by the only
caller of riscv_iommu_spa_fetch(). The boolean was used only to calculate
en_s as "&& !gpa", so it's always 'true' and had no impact in en_s. My
understand here is that 'gpa' was a prototype of the first implementation
that got left behind and ended up not being used.

As for the MSI check, we won't skip translation if satp is bare (!en_s) because
we might be using just stage2 for a guest, thus en_s is removed from the
conditional. As Frank said, this change also complies with the spec since we don't
need to check satp to determine if the address is an MSI to a virtual interrupt
file.

And, last but not the least, this change doesn't break my KVM VFIO passthrough
test case :) I'll document more about the test case I'm using in the v3 cover
letter.


Thanks,

Daniel


> 
> Thanks,
> drew
diff mbox series

Patch

diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
index 8e80b1e52a..9d645d69ea 100644
--- a/hw/riscv/riscv-iommu-bits.h
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -71,6 +71,14 @@  struct riscv_iommu_pq_record {
 /* 5.3 IOMMU Capabilities (64bits) */
 #define RISCV_IOMMU_REG_CAP             0x0000
 #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
+#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
+#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
+#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
+#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
+#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
+#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
 #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
 #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
 #define RISCV_IOMMU_CAP_IGS             GENMASK_ULL(29, 28)
@@ -79,6 +87,7 @@  struct riscv_iommu_pq_record {
 
 /* 5.4 Features control register (32bits) */
 #define RISCV_IOMMU_REG_FCTL            0x0008
+#define RISCV_IOMMU_FCTL_GXL            BIT(2)
 
 /* 5.5 Device-directory-table pointer (64bits) */
 #define RISCV_IOMMU_REG_DDTP            0x0010
@@ -195,6 +204,8 @@  struct riscv_iommu_dc {
 #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
 #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
 #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
 #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
 #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
 
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index 0b93146327..03a610fa75 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -58,6 +58,8 @@  struct RISCVIOMMUContext {
     uint64_t __rfu:20;          /* reserved */
     uint64_t tc;                /* Translation Control */
     uint64_t ta;                /* Translation Attributes */
+    uint64_t satp;              /* S-Stage address translation and protection */
+    uint64_t gatp;              /* G-Stage address translation and protection */
     uint64_t msi_addr_mask;     /* MSI filtering - address mask */
     uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
     uint64_t msiptp;            /* MSI redirection page table pointer */
@@ -194,12 +196,46 @@  static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     return true;
 }
 
-/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
+/*
+ * RISCV IOMMU Address Translation Lookup - Page Table Walk
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementation can be merged into single helper function in future.
+ * Keeping them separate for now, as error reporting and flow specifics are
+ * sufficiently different for separate implementation.
+ *
+ * @s        : IOMMU Device State
+ * @ctx      : Translation context for device id and process address space id.
+ * @iotlb    : translation data: physical address and access mode.
+ * @gpa      : provided IOVA is a guest physical address, use G-Stage only.
+ * @return   : success or fault cause code.
+ */
 static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
-    IOMMUTLBEntry *iotlb)
+    IOMMUTLBEntry *iotlb, bool gpa)
 {
+    dma_addr_t addr, base;
+    uint64_t satp, gatp, pte;
+    bool en_s, en_g;
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc[2];
+    /* Translation stage phase */
+    enum {
+        S_STAGE = 0,
+        G_STAGE = 1,
+    } pass;
+
+    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
+    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
     /* Early check for MSI address match when IOVA == GPA */
-    if (iotlb->perm & IOMMU_WO &&
+    if (!en_s && (iotlb->perm & IOMMU_WO) &&
         riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
         iotlb->target_as = &s->trap_as;
         iotlb->translated_addr = iotlb->iova;
@@ -208,11 +244,196 @@  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     }
 
     /* Exit early for pass-through mode. */
-    iotlb->translated_addr = iotlb->iova;
-    iotlb->addr_mask = ~TARGET_PAGE_MASK;
-    /* Allow R/W in pass-through mode */
-    iotlb->perm = IOMMU_RW;
-    return 0;
+    if (!(en_s || en_g)) {
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        /* Allow R/W in pass-through mode */
+        iotlb->perm = IOMMU_RW;
+        return 0;
+    }
+
+    /* S/G translation parameters. */
+    for (pass = 0; pass < 2; pass++) {
+        uint32_t sv_mode;
+
+        sc[pass].step = 0;
+        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
+            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
+            /* 32bit mode for GXL/SXL == 1 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 2;
+                sc[pass].ptidxbits = 10;
+                sc[pass].ptesize   = 4;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        } else {
+            /* 64bit mode for GXL/SXL == 0 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 3;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 4;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 5;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        }
+    };
+
+    /* S/G stages translation tables root pointers */
+    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
+    addr = (en_s && en_g) ? satp : iotlb->iova;
+    base = en_g ? gatp : satp;
+    pass = en_g ? G_STAGE : S_STAGE;
+
+    do {
+        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
+        const unsigned va_bits = widened + sc[pass].ptidxbits;
+        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
+                                 (sc[pass].levels - 1 - sc[pass].step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
+        const bool ade =
+            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
+
+        /* Address range check before first level lookup */
+        if (!sc[pass].step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+            }
+        }
+
+        /* Read page table entry */
+        if (dma_memory_read(s->target_as, pte_addr, &pte,
+                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
+                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
+        }
+
+        if (sc[pass].ptesize == 4) {
+            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
+        } else {
+            pte = le64_to_cpu(pte);
+        }
+
+        sc[pass].step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            break;                /* Invalid PTE */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            break;                /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            break;                /* Misaligned PPN */
+        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
+            break;                /* Read access check failed */
+        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
+            break;                /* Write access check failed */
+        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
+            break;                /* Access bit not set */
+        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
+            break;                /* Dirty bit not set */
+        } else {
+            /* Leaf PTE, translation completed. */
+            sc[pass].step = sc[pass].levels;
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            /* Update address mask based on smallest translation granularity */
+            iotlb->addr_mask &= (1ULL << va_skip) - 1;
+            /* Continue with S-Stage translation? */
+            if (pass && sc[0].step != sc[0].levels) {
+                pass = S_STAGE;
+                addr = iotlb->iova;
+                continue;
+            }
+            /* Translation phase completed (GPA or SPA) */
+            iotlb->translated_addr = base;
+            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
+                                                         : IOMMU_RO;
+
+            /* Check MSI GPA address match */
+            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
+                riscv_iommu_msi_check(s, ctx, base)) {
+                /* Trap MSI writes and return GPA address. */
+                iotlb->target_as = &s->trap_as;
+                iotlb->addr_mask = ~TARGET_PAGE_MASK;
+                return 0;
+            }
+
+            /* Continue with G-Stage translation? */
+            if (!pass && en_g) {
+                pass = G_STAGE;
+                addr = base;
+                base = gatp;
+                sc[pass].step = 0;
+                continue;
+            }
+
+            return 0;
+        }
+
+        if (sc[pass].step == sc[pass].levels) {
+            break; /* Can't find leaf PTE */
+        }
+
+        /* Continue with G-Stage translation? */
+        if (!pass && en_g) {
+            pass = G_STAGE;
+            addr = base;
+            base = gatp;
+            sc[pass].step = 0;
+        }
+    } while (1);
+
+    return (iotlb->perm & IOMMU_WO) ?
+                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
+                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
 }
 
 /* Redirect MSI write for given GPA. */
@@ -351,6 +572,10 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     case RISCV_IOMMU_DDTP_MODE_BARE:
         /* mock up pass-through translation context */
+        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
+        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_FSC_MODE_BARE);
         ctx->tc = RISCV_IOMMU_DC_TC_V;
         ctx->ta = 0;
         ctx->msiptp = 0;
@@ -424,6 +649,8 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     /* Set translation context. */
     ctx->tc = le64_to_cpu(dc.tc);
+    ctx->gatp = le64_to_cpu(dc.iohgatp);
+    ctx->satp = le64_to_cpu(dc.fsc);
     ctx->ta = le64_to_cpu(dc.ta);
     ctx->msiptp = le64_to_cpu(dc.msiptp);
     ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
@@ -433,14 +660,38 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
         return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
     }
 
+    /* FSC field checks */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
+
+    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
+        /* No S-Stage translation, done. */
+        return 0;
+    }
+
     if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
         if (ctx->pasid != RISCV_IOMMU_NOPASID) {
             /* PASID is disabled */
             return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
         }
+        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
+            /* Invalid translation mode */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
         return 0;
     }
 
+    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
+            /* No default PASID enabled, set BARE mode */
+            ctx->satp = 0ULL;
+            return 0;
+        } else {
+            /* Use default PASID #0 */
+            ctx->pasid = 0;
+        }
+    }
+
     /* FSC.TC.PDTV enabled */
     if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
         /* Invalid PDTP.MODE */
@@ -474,6 +725,7 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     /* Use FSC and TA from process directory entry. */
     ctx->ta = le64_to_cpu(dc.ta);
+    ctx->satp = le64_to_cpu(dc.fsc);
 
     return 0;
 }
@@ -710,6 +962,7 @@  static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
     GHashTable *iot_cache, hwaddr iova)
 {
     RISCVIOMMUEntry key = {
+        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
         .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
         .iova  = PPN_DOWN(iova),
     };
@@ -779,7 +1032,7 @@  static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     }
 
     /* Translate using device directory / page table information. */
-    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
 
     if (!fault && iotlb->target_as == &s->trap_as) {
         /* Do not cache trapped MSI translations */
@@ -790,6 +1043,7 @@  static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
         iot = g_new0(RISCVIOMMUEntry, 1);
         iot->iova = PPN_DOWN(iotlb->iova);
         iot->phys = PPN_DOWN(iotlb->translated_addr);
+        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
         iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
         iot->perm = iotlb->perm;
         riscv_iommu_iot_update(s, iot_cache, iot);
@@ -1394,6 +1648,14 @@  static void riscv_iommu_realize(DeviceState *dev, Error **errp)
     if (s->enable_msi) {
         s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
     }
+    if (s->enable_s_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
+                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
+    }
+    if (s->enable_g_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
+                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
+    }
     /* Report QEMU target physical address space limits */
     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
                        TARGET_PHYS_ADDR_SPACE_BITS);
@@ -1504,6 +1766,8 @@  static Property riscv_iommu_properties[] = {
         LIMIT_CACHE_IOT),
     DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
     DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
+    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
+    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
     DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
         TYPE_MEMORY_REGION, MemoryRegion *),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
index eea2123686..9b33fb97ef 100644
--- a/hw/riscv/riscv-iommu.h
+++ b/hw/riscv/riscv-iommu.h
@@ -38,6 +38,8 @@  struct RISCVIOMMUState {
 
     bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
     bool enable_msi;      /* Enable MSI remapping */
+    bool enable_s_stage;  /* Enable S/VS-Stage translation */
+    bool enable_g_stage;  /* Enable G-Stage translation */
 
     /* IOMMU Internal State */
     uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */