diff mbox series

[v6,5/7] iommu/riscv: Device directory management.

Message ID e18ec8ac169ae7f76e9044c26d0768e6469bad19.1716578450.git.tjeznach@rivosinc.com (mailing list archive)
State Superseded
Headers show
Series Linux RISC-V IOMMU Support | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-5-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-5-test-2 fail .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-5-test-3 fail .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-5-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-5-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-5-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-5-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-5-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-5-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-5-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-5-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-5-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Tomasz Jeznach May 24, 2024, 7:34 p.m. UTC
Introduce device context allocation and device directory tree
management including capabilities discovery sequence, as described
in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.

Device directory mode will be auto detected using DDTP WARL property,
using highest mode supported by the driver and hardware. If none
supported can be configured, driver will fall back to global pass-through.

First level DDTP page can be located in I/O (detected using DDTP WARL)
and system memory.

Only simple identity and blocking protection domains are supported by
this implementation.

Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/iommu.c | 397 +++++++++++++++++++++++++++++++++++-
 drivers/iommu/riscv/iommu.h |   5 +
 2 files changed, 392 insertions(+), 10 deletions(-)

Comments

Zong Li May 31, 2024, 6:25 a.m. UTC | #1
On Sat, May 25, 2024 at 3:35 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> Introduce device context allocation and device directory tree
> management including capabilities discovery sequence, as described
> in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.
>
> Device directory mode will be auto detected using DDTP WARL property,
> using highest mode supported by the driver and hardware. If none
> supported can be configured, driver will fall back to global pass-through.
>
> First level DDTP page can be located in I/O (detected using DDTP WARL)
> and system memory.
>
> Only simple identity and blocking protection domains are supported by
> this implementation.
>
> Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Reviewed-by: Zong Li <zong.li@sifive.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  drivers/iommu/riscv/iommu.c | 397 +++++++++++++++++++++++++++++++++++-
>  drivers/iommu/riscv/iommu.h |   5 +
>  2 files changed, 392 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index b8e0e4b62585..9ca130505c96 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -16,15 +16,168 @@
>  #include <linux/crash_dump.h>
>  #include <linux/init.h>
>  #include <linux/iommu.h>
> +#include <linux/iopoll.h>
>  #include <linux/kernel.h>
>  #include <linux/pci.h>
>
> +#include "../iommu-pages.h"
>  #include "iommu-bits.h"
>  #include "iommu.h"
>
>  /* Timeouts in [us] */
>  #define RISCV_IOMMU_DDTP_TIMEOUT       50000
>
> +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> +#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
> +#define ppn_to_phys(pn)         (((pn) << 2) & (((1ULL << 44) - 1) << 12))
> +
> +#define dev_to_iommu(dev) \
> +       iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
> +
> +/* Device resource-managed allocations */
> +struct riscv_iommu_devres {
> +       void *addr;
> +       int order;
> +};
> +
> +static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
> +{
> +       struct riscv_iommu_devres *devres = res;
> +
> +       iommu_free_pages(devres->addr, devres->order);
> +}
> +
> +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
> +{
> +       struct riscv_iommu_devres *devres = res;
> +       struct riscv_iommu_devres *target = p;
> +
> +       return devres->addr == target->addr;
> +}
> +
> +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
> +{
> +       struct riscv_iommu_devres *devres;
> +       void *addr;
> +
> +       addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
> +                                     GFP_KERNEL_ACCOUNT, order);
> +       if (unlikely(!addr))
> +               return NULL;
> +
> +       devres = devres_alloc(riscv_iommu_devres_pages_release,
> +                             sizeof(struct riscv_iommu_devres), GFP_KERNEL);
> +
> +       if (unlikely(!devres)) {
> +               iommu_free_pages(addr, order);
> +               return NULL;
> +       }
> +
> +       devres->addr = addr;
> +       devres->order = order;
> +
> +       devres_add(iommu->dev, devres);
> +
> +       return addr;
> +}
> +
> +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
> +{
> +       struct riscv_iommu_devres devres = { .addr = addr };
> +
> +       devres_release(iommu->dev, riscv_iommu_devres_pages_release,
> +                      riscv_iommu_devres_pages_match, &devres);
> +}
> +
> +/* Lookup and initialize device context info structure. */
> +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
> +                                                unsigned int devid)
> +{
> +       const bool base_format = !(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT);
> +       unsigned int depth;
> +       unsigned long ddt, old, new;
> +       void *ptr;
> +       u8 ddi_bits[3] = { 0 };
> +       u64 *ddtp = NULL;
> +
> +       /* Make sure the mode is valid */
> +       if (iommu->ddt_mode < RISCV_IOMMU_DDTP_MODE_1LVL ||
> +           iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_3LVL)
> +               return NULL;
> +
> +       /*
> +        * Device id partitioning for base format:
> +        * DDI[0]: bits 0 - 6   (1st level) (7 bits)
> +        * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
> +        * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
> +        *
> +        * For extended format:
> +        * DDI[0]: bits 0 - 5   (1st level) (6 bits)
> +        * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
> +        * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
> +        */
> +       if (base_format) {
> +               ddi_bits[0] = 7;
> +               ddi_bits[1] = 7 + 9;
> +               ddi_bits[2] = 7 + 9 + 8;
> +       } else {
> +               ddi_bits[0] = 6;
> +               ddi_bits[1] = 6 + 9;
> +               ddi_bits[2] = 6 + 9 + 9;
> +       }
> +
> +       /* Make sure device id is within range */
> +       depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_MODE_1LVL;
> +       if (devid >= (1 << ddi_bits[depth]))
> +               return NULL;
> +
> +       /* Get to the level of the non-leaf node that holds the device context */
> +       for (ddtp = iommu->ddt_root; depth-- > 0;) {
> +               const int split = ddi_bits[depth];
> +               /*
> +                * Each non-leaf node is 64bits wide and on each level
> +                * nodes are indexed by DDI[depth].
> +                */
> +               ddtp += (devid >> split) & 0x1FF;
> +
> +               /*
> +                * Check if this node has been populated and if not
> +                * allocate a new level and populate it.
> +                */
> +               do {
> +                       ddt = READ_ONCE(*(unsigned long *)ddtp);
> +                       if (ddt & RISCV_IOMMU_DDTE_VALID) {
> +                               ddtp = __va(ppn_to_phys(ddt));
> +                               break;
> +                       }
> +
> +                       ptr = riscv_iommu_get_pages(iommu, 0);
> +                       if (!ptr)
> +                               return NULL;
> +
> +                       new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_VALID;
> +                       old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
> +
> +                       if (old == ddt) {
> +                               ddtp = (u64 *)ptr;
> +                               break;
> +                       }
> +
> +                       /* Race setting DDT detected, re-read and retry. */
> +                       riscv_iommu_free_pages(iommu, ptr);
> +               } while (1);
> +       }
> +
> +       /*
> +        * Grab the node that matches DDI[depth], note that when using base
> +        * format the device context is 4 * 64bits, and the extended format
> +        * is 8 * 64bits, hence the (3 - base_format) below.
> +        */
> +       ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
> +
> +       return (struct riscv_iommu_dc *)ddtp;
> +}
> +
>  /*
>   * This is best effort IOMMU translation shutdown flow.
>   * Disable IOMMU without waiting for hardware response.
> @@ -37,10 +190,201 @@ static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
>         riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
>  }
>
> +#define riscv_iommu_read_ddtp(iommu) ({ \
> +       u64 ddtp; \
> +       riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
> +                                 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
> +                                 RISCV_IOMMU_DDTP_TIMEOUT); \
> +       ddtp; })
> +
> +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
> +{
> +       u64 ddtp;
> +       unsigned int mode;
> +
> +       ddtp = riscv_iommu_read_ddtp(iommu);
> +       if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +               return -EBUSY;
> +
> +       /*
> +        * It is optional for the hardware to report a fixed address for device
> +        * directory root page when DDT.MODE is OFF or BARE.
> +        */
> +       mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +       if (mode == RISCV_IOMMU_DDTP_MODE_BARE ||
> +           mode == RISCV_IOMMU_DDTP_MODE_OFF) {
> +               /* Use WARL to discover hardware fixed DDT PPN */
> +               riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> +                                  FIELD_PREP(RISCV_IOMMU_DDTP_MODE, mode));
> +               ddtp = riscv_iommu_read_ddtp(iommu);
> +               if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +                       return -EBUSY;
> +
> +               iommu->ddt_phys = ppn_to_phys(ddtp);
> +               if (iommu->ddt_phys)
> +                       iommu->ddt_root = devm_ioremap(iommu->dev,
> +                                                      iommu->ddt_phys, PAGE_SIZE);
> +               if (iommu->ddt_root)
> +                       memset(iommu->ddt_root, 0, PAGE_SIZE);
> +       }
> +
> +       if (!iommu->ddt_root) {
> +               iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
> +               iommu->ddt_phys = __pa(iommu->ddt_root);
> +       }
> +
> +       if (!iommu->ddt_root)
> +               return -ENOMEM;
> +
> +       return 0;
> +}
> +
> +/*
> + * Discover supported DDT modes starting from requested value,
> + * configure DDTP register with accepted mode and root DDT address.
> + * Accepted iommu->ddt_mode is updated on success.
> + */
> +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> +                                     unsigned int ddtp_mode)
> +{
> +       struct device *dev = iommu->dev;
> +       u64 ddtp, rq_ddtp;
> +       unsigned int mode, rq_mode = ddtp_mode;
> +
> +       ddtp = riscv_iommu_read_ddtp(iommu);
> +       if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +               return -EBUSY;
> +
> +       /* Disallow state transition from xLVL to xLVL. */
> +       mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +       if (mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> +           mode != RISCV_IOMMU_DDTP_MODE_OFF &&
> +           rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> +           rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
> +               return -EINVAL;
> +
> +       do {
> +               rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
> +               if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
> +                       rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
> +
> +               riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
> +               ddtp = riscv_iommu_read_ddtp(iommu);
> +               if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
> +                       dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
> +                               rq_mode, ddtp);
> +                       return -EBUSY;
> +               }
> +
> +               /* Verify IOMMU hardware accepts new DDTP config. */
> +               mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +
> +               if (rq_mode == mode)
> +                       break;
> +
> +               /* Hardware mandatory DDTP mode has not been accepted. */
> +               if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
> +                       dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
> +                               ddtp, rq_ddtp);
> +                       return -EINVAL;
> +               }
> +
> +               /*
> +                * Mode field is WARL, an IOMMU may support a subset of
> +                * directory table levels in which case if we tried to set
> +                * an unsupported number of levels we'll readback either
> +                * a valid xLVL or off/bare. If we got off/bare, try again
> +                * with a smaller xLVL.
> +                */
> +               if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
> +                   rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
> +                       dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
> +                       rq_mode--;
> +                       continue;
> +               }
> +
> +               /*
> +                * We tried all supported modes and IOMMU hardware failed to
> +                * accept new settings, something went very wrong since off/bare
> +                * and at least one xLVL must be supported.
> +                */
> +               dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
> +                       mode, ddtp_mode);
> +               return -EINVAL;
> +       } while (1);
> +
> +       iommu->ddt_mode = mode;
> +       if (mode != ddtp_mode)
> +               dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
> +
> +       return 0;
> +}
> +
> +#define RISCV_IOMMU_FSC_BARE 0
> +
> +/*
> + * Update IODIR for the device.
> + *
> + * During the execution of riscv_iommu_probe_device(), IODIR entries are
> + * allocated for the device's identifiers.  Device context invalidation
> + * becomes necessary only if one of the updated entries was previously
> + * marked as valid, given that invalid device context entries are not
> + * cached by the IOMMU hardware.
> + * In this implementation, updating a valid device context while the
> + * device is not quiesced might be disruptive, potentially causing
> + * interim translation faults.
> + */
> +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> +                                    struct device *dev, u64 fsc, u64 ta)
> +{
> +       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +       struct riscv_iommu_dc *dc;
> +       u64 tc;
> +       int i;
> +
> +       /* Device context invalidation ignored for now. */
> +
> +       /*
> +        * For device context with DC_TC_PDTV = 0, translation attributes valid bit
> +        * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
> +        */
> +       for (i = 0; i < fwspec->num_ids; i++) {
> +               dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> +               tc = READ_ONCE(dc->tc);
> +               tc |= ta & RISCV_IOMMU_DC_TC_V;
> +
> +               WRITE_ONCE(dc->fsc, fsc);
> +               WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
> +               /* Update device context, write TC.V as the last step. */
> +               dma_wmb();
> +               WRITE_ONCE(dc->tc, tc);
> +       }

Does it make sense to invalidate the DDTE after we update the DDTE in
memory? This behavior will affect the nested IOMMU mechanism. The VMM
has to catch the event of a DDTE update from the guest and then
eventually go into the host IOMMU driver to configure the IOMMU
hardware. One way to achieve this is by catching the page fault of the
in-memory DDT page table, but it will be difficult to modify the
attribute of only a portion of the memory. Another way is through the
page fault of the MMIO region. A good candidate for the MMIO register
might be the tail pointer of the command queue because it makes sense
to invalidate the DDTE after updating a DDTE. I checked the SMMU and
DMAR implementations; they also invalidate the cache after updating
the table's entry. I was wondering if there is any chance to modify
the logic here, or if there are any ideas for this situation? Thanks.

> +}
> +
> +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
> +                                             struct device *dev)
> +{
> +       struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +       riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
> +
> +       return 0;
> +}
> +
> +static struct iommu_domain riscv_iommu_blocking_domain = {
> +       .type = IOMMU_DOMAIN_BLOCKED,
> +       .ops = &(const struct iommu_domain_ops) {
> +               .attach_dev = riscv_iommu_attach_blocking_domain,
> +       }
> +};
> +
>  static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
>                                               struct device *dev)
>  {
> -       /* Global pass-through already enabled, do nothing for now. */
> +       struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +       riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
> +
>         return 0;
>  }
>
> @@ -72,6 +416,9 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
>  {
>         struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>         struct riscv_iommu_device *iommu;
> +       struct riscv_iommu_dc *dc;
> +       u64 tc;
> +       int i;
>
>         if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
>                 return ERR_PTR(-ENODEV);
> @@ -80,12 +427,37 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
>         if (!iommu)
>                 return ERR_PTR(-ENODEV);
>
> +       /*
> +        * IOMMU hardware operating in fail-over BARE mode will provide
> +        * identity translation for all connected devices anyway...
> +        */
> +       if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_MODE_BARE)
> +               return ERR_PTR(-ENODEV);
> +
> +       /*
> +        * Allocate and pre-configure device context entries in
> +        * the device directory. Do not mark the context valid yet.
> +        */
> +       tc = 0;
> +       if (iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD)
> +               tc |= RISCV_IOMMU_DC_TC_SADE;
> +       for (i = 0; i < fwspec->num_ids; i++) {
> +               dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> +               if (!dc)
> +                       return ERR_PTR(-ENODEV);
> +               if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
> +                       dev_warn(dev, "already attached to IOMMU device directory\n");
> +               WRITE_ONCE(dc->tc, tc);
> +       }
> +
>         return &iommu->iommu;
>  }
>
>  static const struct iommu_ops riscv_iommu_ops = {
>         .of_xlate = riscv_iommu_of_xlate,
>         .identity_domain = &riscv_iommu_identity_domain,
> +       .blocked_domain = &riscv_iommu_blocking_domain,
> +       .release_domain = &riscv_iommu_blocking_domain,
>         .def_domain_type = riscv_iommu_device_domain_type,
>         .device_group = riscv_iommu_device_group,
>         .probe_device = riscv_iommu_probe_device,
> @@ -128,6 +500,7 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>  {
>         iommu_device_unregister(&iommu->iommu);
>         iommu_device_sysfs_remove(&iommu->iommu);
> +       riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
>  }
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -138,18 +511,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>         if (rc)
>                 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
>
> -       /*
> -        * Placeholder for a complete IOMMU device initialization.  For now,
> -        * only bare minimum: enable global identity mapping mode and register sysfs.
> -        */
> -       riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> -                          FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> +       rc = riscv_iommu_iodir_alloc(iommu);
> +       if (rc)
> +               return rc;
> +
> +       rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +       if (rc)
> +               return rc;
>
>         rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
>                                     dev_name(iommu->dev));
> -       if (rc)
> -               return dev_err_probe(iommu->dev, rc,
> -                                    "cannot register sysfs interface\n");
> +       if (rc) {
> +               dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
> +               goto err_iodir_off;
> +       }
>
>         rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
>         if (rc) {
> @@ -161,5 +536,7 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>
>  err_remove_sysfs:
>         iommu_device_sysfs_remove(&iommu->iommu);
> +err_iodir_off:
> +       riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
>         return rc;
>  }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 700e33dc2446..f1696926582c 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -34,6 +34,11 @@ struct riscv_iommu_device {
>         /* available interrupt numbers, MSI or WSI */
>         unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
>         unsigned int irqs_count;
> +
> +       /* device directory */
> +       unsigned int ddt_mode;
> +       dma_addr_t ddt_phys;
> +       u64 *ddt_root;
>  };
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu);
> --
> 2.34.1
>
Jason Gunthorpe June 10, 2024, 5:49 p.m. UTC | #2
On Fri, May 31, 2024 at 02:25:15PM +0800, Zong Li wrote:

> > +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> > +                                    struct device *dev, u64 fsc, u64 ta)
> > +{
> > +       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +       struct riscv_iommu_dc *dc;
> > +       u64 tc;
> > +       int i;
> > +
> > +       /* Device context invalidation ignored for now. */
> > +
> > +       /*
> > +        * For device context with DC_TC_PDTV = 0, translation attributes valid bit
> > +        * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
> > +        */
> > +       for (i = 0; i < fwspec->num_ids; i++) {
> > +               dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> > +               tc = READ_ONCE(dc->tc);
> > +               tc |= ta & RISCV_IOMMU_DC_TC_V;
> > +
> > +               WRITE_ONCE(dc->fsc, fsc);
> > +               WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
> > +               /* Update device context, write TC.V as the last step. */
> > +               dma_wmb();
> > +               WRITE_ONCE(dc->tc, tc);
> > +       }
> 
> Does it make sense to invalidate the DDTE after we update the DDTE in
> memory? This behavior will affect the nested IOMMU mechanism. The VMM
> has to catch the event of a DDTE update from the guest and then
> eventually go into the host IOMMU driver to configure the IOMMU
> hardware. 

Right, this is why I asked about negative caching.

The VMMs are a prime example of negative caching, in something like
the SMMU implementation the VMM will cache the V=0 STE until they see
an invalidation.

Driving the VMM shadowing/caching entirely off of the standard
invalidation mechanism is so much better than any other option.

IMHO you should have the RISCV spec revised to allow negative caching
in any invalidated data structure to permit the typical VMM design
driven off of shadowing triggered by invalidation commands.

Once the spec permits negative caching then the software would have to
invalidate after going V=0 -> V=1.

Jason
Tomasz Jeznach June 10, 2024, 6:48 p.m. UTC | #3
On Mon, Jun 10, 2024 at 10:49 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, May 31, 2024 at 02:25:15PM +0800, Zong Li wrote:
>
> > > +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> > > +                                    struct device *dev, u64 fsc, u64 ta)
> > > +{
> > > +       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > > +       struct riscv_iommu_dc *dc;
> > > +       u64 tc;
> > > +       int i;
> > > +
> > > +       /* Device context invalidation ignored for now. */
> > > +
> > > +       /*
> > > +        * For device context with DC_TC_PDTV = 0, translation attributes valid bit
> > > +        * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
> > > +        */
> > > +       for (i = 0; i < fwspec->num_ids; i++) {
> > > +               dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> > > +               tc = READ_ONCE(dc->tc);
> > > +               tc |= ta & RISCV_IOMMU_DC_TC_V;
> > > +
> > > +               WRITE_ONCE(dc->fsc, fsc);
> > > +               WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
> > > +               /* Update device context, write TC.V as the last step. */
> > > +               dma_wmb();
> > > +               WRITE_ONCE(dc->tc, tc);
> > > +       }
> >
> > Does it make sense to invalidate the DDTE after we update the DDTE in
> > memory? This behavior will affect the nested IOMMU mechanism. The VMM
> > has to catch the event of a DDTE update from the guest and then
> > eventually go into the host IOMMU driver to configure the IOMMU
> > hardware.
>
> Right, this is why I asked about negative caching.
>
> The VMMs are a prime example of negative caching, in something like
> the SMMU implementation the VMM will cache the V=0 STE until they see
> an invalidation.
>
> Driving the VMM shadowing/caching entirely off of the standard
> invalidation mechanism is so much better than any other option.
>
> IMHO you should have the RISCV spec revised to allow negative caching
> in any invalidated data structure to permit the typical VMM design
> driven off of shadowing triggered by invalidation commands.
>
> Once the spec permits negative caching then the software would have to
> invalidate after going V=0 -> V=1.
>
> Jason

Allowing negative cacheing by the spec (e.g. for VMM use cases) and
documenting required invalidation sequences would definitely help
here. I'm hesitating adding IODIR.INVAL that is not required by the
spec [1], but this is something that can be controlled by a
capabilities/feature bit once added to the specification or based on
VID:DID of the emulated Risc-V IOMMU.

Another option to consider for VMM is to utilize the WARL property of
DDTP, and provide fixed location of the single level DDTP, pointing to
MMIO region, where DDTE updates will result in vmm exit / fault
handler. This will likely not be as efficient as IODIR.INVAL issued
for any DDTE updates.

[1] https://github.com/riscv-non-isa/riscv-iommu/blob/main/src/iommu_data_structures.adoc#caching-in-memory-data-structures

- Tomasz
Jason Gunthorpe June 10, 2024, 10:20 p.m. UTC | #4
On Mon, Jun 10, 2024 at 11:48:23AM -0700, Tomasz Jeznach wrote:
> > Right, this is why I asked about negative caching.
> >
> > The VMMs are a prime example of negative caching, in something like
> > the SMMU implementation the VMM will cache the V=0 STE until they see
> > an invalidation.
> >
> > Driving the VMM shadowing/caching entirely off of the standard
> > invalidation mechanism is so much better than any other option.
> >
> > IMHO you should have the RISCV spec revised to allow negative caching
> > in any invalidated data structure to permit the typical VMM design
> > driven off of shadowing triggered by invalidation commands.
> >
> > Once the spec permits negative caching then the software would have to
> > invalidate after going V=0 -> V=1.
> >
> > Jason
> 
> Allowing negative cacheing by the spec (e.g. for VMM use cases) and
> documenting required invalidation sequences would definitely help
> here. 

Yes, you probably should really do that.

> I'm hesitating adding IODIR.INVAL that is not required by the
> spec [1],

If you expect to rapidly revise the spec then you should add it right
now so that all SW implementations that exist are
conforming. Otherwise you'll have compatability problems when you come
to implement nesting.

Obviously the VMM can't rely on a negative caching technique unless
the spec says it can.

> but this is something that can be controlled by a
> capabilities/feature bit once added to the specification or based on
> VID:DID of the emulated Risc-V IOMMU.

I'm not sure it really can. Once you start shipping SW people will run
it in a VM and the VMM will have to forever work without negative
caching.

My strong advice is to not expect the VMM trap random pages in guest
memory, that is a huge mess to implement and will delay your VMM side.

> Another option to consider for VMM is to utilize the WARL property of
> DDTP, and provide fixed location of the single level DDTP, pointing to
> MMIO region, where DDTE updates will result in vmm exit / fault
> handler. This will likely not be as efficient as IODIR.INVAL issued
> for any DDTE updates.

I don't know what all those things mean, but if you mean to have the
VMM supply faulting MMIO space that the VM is forced to put the DDTE
table into, then that would be better. It is still quite abnormal from
the VMM side..

My strong advice is to fix this. It is trivial to add the negative
caching language to the spec and will cause insignificant extra work
in this driver.

The gains on at least the ease of VMM implementation and architectural
similarity to other arches are well worth the effort. Otherwise I fear
it will be a struggle to get nesting support completed :(

Jason
Tomasz Jeznach June 11, 2024, 2 a.m. UTC | #5
On Mon, Jun 10, 2024 at 3:20 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Mon, Jun 10, 2024 at 11:48:23AM -0700, Tomasz Jeznach wrote:
> > > Right, this is why I asked about negative caching.
> > >
> > > The VMMs are a prime example of negative caching, in something like
> > > the SMMU implementation the VMM will cache the V=0 STE until they see
> > > an invalidation.
> > >
> > > Driving the VMM shadowing/caching entirely off of the standard
> > > invalidation mechanism is so much better than any other option.
> > >
> > > IMHO you should have the RISCV spec revised to allow negative caching
> > > in any invalidated data structure to permit the typical VMM design
> > > driven off of shadowing triggered by invalidation commands.
> > >
> > > Once the spec permits negative caching then the software would have to
> > > invalidate after going V=0 -> V=1.
> > >
> > > Jason
> >
> > Allowing negative cacheing by the spec (e.g. for VMM use cases) and
> > documenting required invalidation sequences would definitely help
> > here.
>
> Yes, you probably should really do that.
>
> > I'm hesitating adding IODIR.INVAL that is not required by the
> > spec [1],
>
> If you expect to rapidly revise the spec then you should add it right
> now so that all SW implementations that exist are
> conforming. Otherwise you'll have compatability problems when you come
> to implement nesting.
>
> Obviously the VMM can't rely on a negative caching technique unless
> the spec says it can.
>
> > but this is something that can be controlled by a
> > capabilities/feature bit once added to the specification or based on
> > VID:DID of the emulated Risc-V IOMMU.
>
> I'm not sure it really can. Once you start shipping SW people will run
> it in a VM and the VMM will have to forever work without negative
> caching.
>
> My strong advice is to not expect the VMM trap random pages in guest
> memory, that is a huge mess to implement and will delay your VMM side.
>
> > Another option to consider for VMM is to utilize the WARL property of
> > DDTP, and provide fixed location of the single level DDTP, pointing to
> > MMIO region, where DDTE updates will result in vmm exit / fault
> > handler. This will likely not be as efficient as IODIR.INVAL issued
> > for any DDTE updates.
>
> I don't know what all those things mean, but if you mean to have the
> VMM supply faulting MMIO space that the VM is forced to put the DDTE
> table into, then that would be better. It is still quite abnormal from
> the VMM side..
>
> My strong advice is to fix this. It is trivial to add the negative
> caching language to the spec and will cause insignificant extra work
> in this driver.
>
> The gains on at least the ease of VMM implementation and architectural
> similarity to other arches are well worth the effort. Otherwise I fear
> it will be a struggle to get nesting support completed :(
>
> Jason
>

Thanks for the comments, it definitely makes sense.

We will revise the wording in the RISC-V IOMMU spec to note that
certain implementations may require invalidating commands to be issued
when making DDT entries valid, and device tree / ACPI may be used to
report such requirement.

For now, I'll change the implementation to assume negative caching for
DDTE and will follow up with device tree / driver updates to make the
invalidation optional when revised specifications will be available.

Best,
- Tomasz

> --
> You received this message because you are subscribed to the Google Groups "linux" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux+unsubscribe@rivosinc.com.
> To view this discussion on the web visit https://groups.google.com/a/rivosinc.com/d/msgid/linux/20240610222051.GO791043%40ziepe.ca.
> For more options, visit https://groups.google.com/a/rivosinc.com/d/optout.
Jason Gunthorpe June 11, 2024, 12:12 p.m. UTC | #6
On Mon, Jun 10, 2024 at 07:00:34PM -0700, Tomasz Jeznach wrote:

> For now, I'll change the implementation to assume negative caching for
> DDTE and will follow up with device tree / driver updates to make the
> invalidation optional when revised specifications will be available.

Is there a reason to make it optional? It seems like it doesn't have
any performance downside to just always invalidate, attachment is not
a critical path operation.

I could see making something like negative PTE invalidation optional
as that is more performance path..

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index b8e0e4b62585..9ca130505c96 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -16,15 +16,168 @@ 
 #include <linux/crash_dump.h>
 #include <linux/init.h>
 #include <linux/iommu.h>
+#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 
+#include "../iommu-pages.h"
 #include "iommu-bits.h"
 #include "iommu.h"
 
 /* Timeouts in [us] */
 #define RISCV_IOMMU_DDTP_TIMEOUT	50000
 
+/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
+#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
+#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))
+
+#define dev_to_iommu(dev) \
+	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
+
+/* Device resource-managed allocations */
+struct riscv_iommu_devres {
+	void *addr;
+	int order;
+};
+
+static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
+{
+	struct riscv_iommu_devres *devres = res;
+
+	iommu_free_pages(devres->addr, devres->order);
+}
+
+static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
+{
+	struct riscv_iommu_devres *devres = res;
+	struct riscv_iommu_devres *target = p;
+
+	return devres->addr == target->addr;
+}
+
+static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
+{
+	struct riscv_iommu_devres *devres;
+	void *addr;
+
+	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
+				      GFP_KERNEL_ACCOUNT, order);
+	if (unlikely(!addr))
+		return NULL;
+
+	devres = devres_alloc(riscv_iommu_devres_pages_release,
+			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
+
+	if (unlikely(!devres)) {
+		iommu_free_pages(addr, order);
+		return NULL;
+	}
+
+	devres->addr = addr;
+	devres->order = order;
+
+	devres_add(iommu->dev, devres);
+
+	return addr;
+}
+
+static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
+{
+	struct riscv_iommu_devres devres = { .addr = addr };
+
+	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
+		       riscv_iommu_devres_pages_match, &devres);
+}
+
+/* Lookup and initialize device context info structure. */
+static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
+						 unsigned int devid)
+{
+	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT);
+	unsigned int depth;
+	unsigned long ddt, old, new;
+	void *ptr;
+	u8 ddi_bits[3] = { 0 };
+	u64 *ddtp = NULL;
+
+	/* Make sure the mode is valid */
+	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_MODE_1LVL ||
+	    iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_3LVL)
+		return NULL;
+
+	/*
+	 * Device id partitioning for base format:
+	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
+	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
+	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
+	 *
+	 * For extended format:
+	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
+	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
+	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
+	 */
+	if (base_format) {
+		ddi_bits[0] = 7;
+		ddi_bits[1] = 7 + 9;
+		ddi_bits[2] = 7 + 9 + 8;
+	} else {
+		ddi_bits[0] = 6;
+		ddi_bits[1] = 6 + 9;
+		ddi_bits[2] = 6 + 9 + 9;
+	}
+
+	/* Make sure device id is within range */
+	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_MODE_1LVL;
+	if (devid >= (1 << ddi_bits[depth]))
+		return NULL;
+
+	/* Get to the level of the non-leaf node that holds the device context */
+	for (ddtp = iommu->ddt_root; depth-- > 0;) {
+		const int split = ddi_bits[depth];
+		/*
+		 * Each non-leaf node is 64bits wide and on each level
+		 * nodes are indexed by DDI[depth].
+		 */
+		ddtp += (devid >> split) & 0x1FF;
+
+		/*
+		 * Check if this node has been populated and if not
+		 * allocate a new level and populate it.
+		 */
+		do {
+			ddt = READ_ONCE(*(unsigned long *)ddtp);
+			if (ddt & RISCV_IOMMU_DDTE_VALID) {
+				ddtp = __va(ppn_to_phys(ddt));
+				break;
+			}
+
+			ptr = riscv_iommu_get_pages(iommu, 0);
+			if (!ptr)
+				return NULL;
+
+			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_VALID;
+			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
+
+			if (old == ddt) {
+				ddtp = (u64 *)ptr;
+				break;
+			}
+
+			/* Race setting DDT detected, re-read and retry. */
+			riscv_iommu_free_pages(iommu, ptr);
+		} while (1);
+	}
+
+	/*
+	 * Grab the node that matches DDI[depth], note that when using base
+	 * format the device context is 4 * 64bits, and the extended format
+	 * is 8 * 64bits, hence the (3 - base_format) below.
+	 */
+	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
+
+	return (struct riscv_iommu_dc *)ddtp;
+}
+
 /*
  * This is best effort IOMMU translation shutdown flow.
  * Disable IOMMU without waiting for hardware response.
@@ -37,10 +190,201 @@  static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
 }
 
+#define riscv_iommu_read_ddtp(iommu) ({ \
+	u64 ddtp; \
+	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
+				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
+				  RISCV_IOMMU_DDTP_TIMEOUT); \
+	ddtp; })
+
+static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
+{
+	u64 ddtp;
+	unsigned int mode;
+
+	ddtp = riscv_iommu_read_ddtp(iommu);
+	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+		return -EBUSY;
+
+	/*
+	 * It is optional for the hardware to report a fixed address for device
+	 * directory root page when DDT.MODE is OFF or BARE.
+	 */
+	mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
+	if (mode == RISCV_IOMMU_DDTP_MODE_BARE ||
+	    mode == RISCV_IOMMU_DDTP_MODE_OFF) {
+		/* Use WARL to discover hardware fixed DDT PPN */
+		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+				   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, mode));
+		ddtp = riscv_iommu_read_ddtp(iommu);
+		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+			return -EBUSY;
+
+		iommu->ddt_phys = ppn_to_phys(ddtp);
+		if (iommu->ddt_phys)
+			iommu->ddt_root = devm_ioremap(iommu->dev,
+						       iommu->ddt_phys, PAGE_SIZE);
+		if (iommu->ddt_root)
+			memset(iommu->ddt_root, 0, PAGE_SIZE);
+	}
+
+	if (!iommu->ddt_root) {
+		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
+		iommu->ddt_phys = __pa(iommu->ddt_root);
+	}
+
+	if (!iommu->ddt_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Discover supported DDT modes starting from requested value,
+ * configure DDTP register with accepted mode and root DDT address.
+ * Accepted iommu->ddt_mode is updated on success.
+ */
+static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
+				      unsigned int ddtp_mode)
+{
+	struct device *dev = iommu->dev;
+	u64 ddtp, rq_ddtp;
+	unsigned int mode, rq_mode = ddtp_mode;
+
+	ddtp = riscv_iommu_read_ddtp(iommu);
+	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+		return -EBUSY;
+
+	/* Disallow state transition from xLVL to xLVL. */
+	mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
+	if (mode != RISCV_IOMMU_DDTP_MODE_BARE &&
+	    mode != RISCV_IOMMU_DDTP_MODE_OFF &&
+	    rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
+	    rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
+		return -EINVAL;
+
+	do {
+		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
+		if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
+			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
+
+		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
+		ddtp = riscv_iommu_read_ddtp(iommu);
+		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
+			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
+				rq_mode, ddtp);
+			return -EBUSY;
+		}
+
+		/* Verify IOMMU hardware accepts new DDTP config. */
+		mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
+
+		if (rq_mode == mode)
+			break;
+
+		/* Hardware mandatory DDTP mode has not been accepted. */
+		if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
+			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
+				ddtp, rq_ddtp);
+			return -EINVAL;
+		}
+
+		/*
+		 * Mode field is WARL, an IOMMU may support a subset of
+		 * directory table levels in which case if we tried to set
+		 * an unsupported number of levels we'll readback either
+		 * a valid xLVL or off/bare. If we got off/bare, try again
+		 * with a smaller xLVL.
+		 */
+		if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
+		    rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
+			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
+			rq_mode--;
+			continue;
+		}
+
+		/*
+		 * We tried all supported modes and IOMMU hardware failed to
+		 * accept new settings, something went very wrong since off/bare
+		 * and at least one xLVL must be supported.
+		 */
+		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
+			mode, ddtp_mode);
+		return -EINVAL;
+	} while (1);
+
+	iommu->ddt_mode = mode;
+	if (mode != ddtp_mode)
+		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
+
+	return 0;
+}
+
+#define RISCV_IOMMU_FSC_BARE 0
+
+/*
+ * Update IODIR for the device.
+ *
+ * During the execution of riscv_iommu_probe_device(), IODIR entries are
+ * allocated for the device's identifiers.  Device context invalidation
+ * becomes necessary only if one of the updated entries was previously
+ * marked as valid, given that invalid device context entries are not
+ * cached by the IOMMU hardware.
+ * In this implementation, updating a valid device context while the
+ * device is not quiesced might be disruptive, potentially causing
+ * interim translation faults.
+ */
+static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+				     struct device *dev, u64 fsc, u64 ta)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct riscv_iommu_dc *dc;
+	u64 tc;
+	int i;
+
+	/* Device context invalidation ignored for now. */
+
+	/*
+	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
+	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
+	 */
+	for (i = 0; i < fwspec->num_ids; i++) {
+		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+		tc = READ_ONCE(dc->tc);
+		tc |= ta & RISCV_IOMMU_DC_TC_V;
+
+		WRITE_ONCE(dc->fsc, fsc);
+		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+		/* Update device context, write TC.V as the last step. */
+		dma_wmb();
+		WRITE_ONCE(dc->tc, tc);
+	}
+}
+
+static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
+					      struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+
+	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+
+	return 0;
+}
+
+static struct iommu_domain riscv_iommu_blocking_domain = {
+	.type = IOMMU_DOMAIN_BLOCKED,
+	.ops = &(const struct iommu_domain_ops) {
+		.attach_dev = riscv_iommu_attach_blocking_domain,
+	}
+};
+
 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
 					      struct device *dev)
 {
-	/* Global pass-through already enabled, do nothing for now. */
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+
+	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+
 	return 0;
 }
 
@@ -72,6 +416,9 @@  static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct riscv_iommu_device *iommu;
+	struct riscv_iommu_dc *dc;
+	u64 tc;
+	int i;
 
 	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
 		return ERR_PTR(-ENODEV);
@@ -80,12 +427,37 @@  static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
 	if (!iommu)
 		return ERR_PTR(-ENODEV);
 
+	/*
+	 * IOMMU hardware operating in fail-over BARE mode will provide
+	 * identity translation for all connected devices anyway...
+	 */
+	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_MODE_BARE)
+		return ERR_PTR(-ENODEV);
+
+	/*
+	 * Allocate and pre-configure device context entries in
+	 * the device directory. Do not mark the context valid yet.
+	 */
+	tc = 0;
+	if (iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD)
+		tc |= RISCV_IOMMU_DC_TC_SADE;
+	for (i = 0; i < fwspec->num_ids; i++) {
+		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+		if (!dc)
+			return ERR_PTR(-ENODEV);
+		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
+			dev_warn(dev, "already attached to IOMMU device directory\n");
+		WRITE_ONCE(dc->tc, tc);
+	}
+
 	return &iommu->iommu;
 }
 
 static const struct iommu_ops riscv_iommu_ops = {
 	.of_xlate = riscv_iommu_of_xlate,
 	.identity_domain = &riscv_iommu_identity_domain,
+	.blocked_domain = &riscv_iommu_blocking_domain,
+	.release_domain = &riscv_iommu_blocking_domain,
 	.def_domain_type = riscv_iommu_device_domain_type,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
@@ -128,6 +500,7 @@  void riscv_iommu_remove(struct riscv_iommu_device *iommu)
 {
 	iommu_device_unregister(&iommu->iommu);
 	iommu_device_sysfs_remove(&iommu->iommu);
+	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
 }
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu)
@@ -138,18 +511,20 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	if (rc)
 		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
 
-	/*
-	 * Placeholder for a complete IOMMU device initialization.  For now,
-	 * only bare minimum: enable global identity mapping mode and register sysfs.
-	 */
-	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
-			   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
+	rc = riscv_iommu_iodir_alloc(iommu);
+	if (rc)
+		return rc;
+
+	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
+	if (rc)
+		return rc;
 
 	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
 				    dev_name(iommu->dev));
-	if (rc)
-		return dev_err_probe(iommu->dev, rc,
-				     "cannot register sysfs interface\n");
+	if (rc) {
+		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
+		goto err_iodir_off;
+	}
 
 	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
 	if (rc) {
@@ -161,5 +536,7 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 
 err_remove_sysfs:
 	iommu_device_sysfs_remove(&iommu->iommu);
+err_iodir_off:
+	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
 	return rc;
 }
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 700e33dc2446..f1696926582c 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -34,6 +34,11 @@  struct riscv_iommu_device {
 	/* available interrupt numbers, MSI or WSI */
 	unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
 	unsigned int irqs_count;
+
+	/* device directory */
+	unsigned int ddt_mode;
+	dma_addr_t ddt_phys;
+	u64 *ddt_root;
 };
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu);