Message ID | 6668ff51bf5b751b311ffc25946db10862dd86bd.1715708679.git.tjeznach@rivosinc.com |
---|---|
State | Superseded |
Series | Linux RISC-V IOMMU Support |
On Wed, May 15, 2024 at 2:17 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> Introduce device context allocation and device directory tree
> management, including the capabilities discovery sequence, as described
> in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.
>
> The device directory mode is auto-detected using the DDTP WARL property,
> selecting the highest mode supported by both the driver and the hardware.
> If no supported mode can be configured, the driver falls back to global
> pass-through.
>
> The first-level DDT page can be located in I/O memory (detected using
> the DDTP WARL property) or in system memory.
>
> Only simple identity and blocking protection domains are supported by
> this implementation.
>
> Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>  drivers/iommu/riscv/iommu.c | 397 +++++++++++++++++++++++++++++++++++-
>  drivers/iommu/riscv/iommu.h |   5 +
>  2 files changed, 392 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index b8e0e4b62585..9ca130505c96 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -16,15 +16,168 @@
>  #include <linux/crash_dump.h>
>  #include <linux/init.h>
>  #include <linux/iommu.h>
> +#include <linux/iopoll.h>
>  #include <linux/kernel.h>
>  #include <linux/pci.h>
>
> +#include "../iommu-pages.h"
>  #include "iommu-bits.h"
>  #include "iommu.h"
>
>  /* Timeouts in [us] */
>  #define RISCV_IOMMU_DDTP_TIMEOUT	50000
>
> +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> +#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
> +#define ppn_to_phys(pn)  (((pn) << 2) & (((1ULL << 44) - 1) << 12))
> +
> +#define dev_to_iommu(dev) \
> +	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
> +
> +/* Device resource-managed allocations */
> +struct riscv_iommu_devres {
> +	void *addr;
> +	int order;
> +};
> +
> +static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
> +{
> +	struct riscv_iommu_devres *devres = res;
> +
> +	iommu_free_pages(devres->addr, devres->order);
> +}
> +
> +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
> +{
> +	struct riscv_iommu_devres *devres = res;
> +	struct riscv_iommu_devres *target = p;
> +
> +	return devres->addr == target->addr;
> +}
> +
> +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
> +{
> +	struct riscv_iommu_devres *devres;
> +	void *addr;
> +
> +	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
> +				      GFP_KERNEL_ACCOUNT, order);
> +	if (unlikely(!addr))
> +		return NULL;
> +
> +	devres = devres_alloc(riscv_iommu_devres_pages_release,
> +			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
> +
> +	if (unlikely(!devres)) {
> +		iommu_free_pages(addr, order);
> +		return NULL;
> +	}
> +
> +	devres->addr = addr;
> +	devres->order = order;
> +
> +	devres_add(iommu->dev, devres);
> +
> +	return addr;
> +}
> +
> +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
> +{
> +	struct riscv_iommu_devres devres = { .addr = addr };
> +
> +	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
> +		       riscv_iommu_devres_pages_match, &devres);
> +}
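
For reference, the phys_to_ppn()/ppn_to_phys() macros above move the 4 KiB
page number (PA bits 55:12) into the DDTP PPN field (bits 53:10) and back.
A minimal standalone sketch, not part of the patch, illustrating the round
trip:

	#include <assert.h>
	#include <stdint.h>

	#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
	#define ppn_to_phys(pn)  (((pn) << 2) & (((1ULL << 44) - 1) << 12))

	int main(void)
	{
		/* hypothetical page-aligned root table address */
		uint64_t pa = 0x80200000ULL;

		/* a 4 KiB-aligned address survives the round trip */
		assert(ppn_to_phys(phys_to_ppn(pa)) == pa);
		return 0;
	}
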
> +/* Lookup and initialize device context info structure. */
> +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
> +						 unsigned int devid)
> +{
> +	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT);
> +	unsigned int depth;
> +	unsigned long ddt, old, new;
> +	void *ptr;
> +	u8 ddi_bits[3] = { 0 };
> +	u64 *ddtp = NULL;
> +
> +	/* Make sure the mode is valid */
> +	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_MODE_1LVL ||
> +	    iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_3LVL)
> +		return NULL;
> +
> +	/*
> +	 * Device id partitioning for base format:
> +	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
> +	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
> +	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
> +	 *
> +	 * For extended format:
> +	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
> +	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
> +	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
> +	 */
> +	if (base_format) {
> +		ddi_bits[0] = 7;
> +		ddi_bits[1] = 7 + 9;
> +		ddi_bits[2] = 7 + 9 + 8;
> +	} else {
> +		ddi_bits[0] = 6;
> +		ddi_bits[1] = 6 + 9;
> +		ddi_bits[2] = 6 + 9 + 9;
> +	}
> +
> +	/* Make sure device id is within range */
> +	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_MODE_1LVL;
> +	if (devid >= (1 << ddi_bits[depth]))
> +		return NULL;
> +
> +	/* Get to the level of the non-leaf node that holds the device context */
> +	for (ddtp = iommu->ddt_root; depth-- > 0;) {
> +		const int split = ddi_bits[depth];
> +		/*
> +		 * Each non-leaf node is 64bits wide and on each level
> +		 * nodes are indexed by DDI[depth].
> +		 */
> +		ddtp += (devid >> split) & 0x1FF;
> +
> +		/*
> +		 * Check if this node has been populated and if not
> +		 * allocate a new level and populate it.
> +		 */
> +		do {
> +			ddt = READ_ONCE(*(unsigned long *)ddtp);
> +			if (ddt & RISCV_IOMMU_DDTE_VALID) {
> +				ddtp = __va(ppn_to_phys(ddt));
> +				break;
> +			}
> +
> +			ptr = riscv_iommu_get_pages(iommu, 0);
> +			if (!ptr)
> +				return NULL;
> +
> +			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_VALID;
> +			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
> +
> +			if (old == ddt) {
> +				ddtp = (u64 *)ptr;
> +				break;
> +			}
> +
> +			/* Race setting DDT detected, re-read and retry. */
> +			riscv_iommu_free_pages(iommu, ptr);
> +		} while (1);
> +	}
> +
> +	/*
> +	 * Grab the node that matches DDI[depth], note that when using base
> +	 * format the device context is 4 * 64bits, and the extended format
> +	 * is 8 * 64bits, hence the (3 - base_format) below.
> +	 */
> +	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
> +
> +	return (struct riscv_iommu_dc *)ddtp;
> +}
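
To make the directory walk concrete: with the base format and a 24-bit
device ID, the shifts above pick out DDI[0] = devid[6:0], DDI[1] =
devid[15:7] and DDI[2] = devid[23:16]. A standalone sketch, not part of
the patch, using a hypothetical device ID:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t devid = 0x012345;	/* hypothetical 24-bit device ID */

		/* base-format partitioning, mirroring ddi_bits[] = {7, 16, 24} */
		uint32_t ddi0 = devid & 0x7f;		/* bits 0-6, leaf index */
		uint32_t ddi1 = (devid >> 7) & 0x1ff;	/* bits 7-15, 2nd level */
		uint32_t ddi2 = (devid >> 16) & 0xff;	/* bits 16-23, root level */

		printf("DDI[2]=%u DDI[1]=%u DDI[0]=%u\n", ddi2, ddi1, ddi0);
		return 0;
	}
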
> +
>  /*
>   * This is best effort IOMMU translation shutdown flow.
>   * Disable IOMMU without waiting for hardware response.
> @@ -37,10 +190,201 @@ static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
>  	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
>  }
>
> +#define riscv_iommu_read_ddtp(iommu) ({ \
> +	u64 ddtp; \
> +	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
> +				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
> +				  RISCV_IOMMU_DDTP_TIMEOUT); \
> +	ddtp; })
> +
> +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
> +{
> +	u64 ddtp;
> +	unsigned int mode;
> +
> +	ddtp = riscv_iommu_read_ddtp(iommu);
> +	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +		return -EBUSY;
> +
> +	/*
> +	 * It is optional for the hardware to report a fixed address for device
> +	 * directory root page when DDT.MODE is OFF or BARE.
> +	 */
> +	mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +	if (mode == RISCV_IOMMU_DDTP_MODE_BARE ||
> +	    mode == RISCV_IOMMU_DDTP_MODE_OFF) {
> +		/* Use WARL to discover hardware fixed DDT PPN */
> +		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> +				   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, mode));
> +		ddtp = riscv_iommu_read_ddtp(iommu);
> +		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +			return -EBUSY;
> +
> +		iommu->ddt_phys = ppn_to_phys(ddtp);
> +		if (iommu->ddt_phys)
> +			iommu->ddt_root = devm_ioremap(iommu->dev,
> +						       iommu->ddt_phys, PAGE_SIZE);
> +		if (iommu->ddt_root)
> +			memset(iommu->ddt_root, 0, PAGE_SIZE);
> +	}
> +
> +	if (!iommu->ddt_root) {
> +		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
> +		iommu->ddt_phys = __pa(iommu->ddt_root);
> +	}
> +
> +	if (!iommu->ddt_root)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +/*
> + * Discover supported DDT modes starting from requested value,
> + * configure DDTP register with accepted mode and root DDT address.
> + * Accepted iommu->ddt_mode is updated on success.
> + */
> +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> +				      unsigned int ddtp_mode)
> +{
> +	struct device *dev = iommu->dev;
> +	u64 ddtp, rq_ddtp;
> +	unsigned int mode, rq_mode = ddtp_mode;
> +
> +	ddtp = riscv_iommu_read_ddtp(iommu);
> +	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> +		return -EBUSY;
> +
> +	/* Disallow state transition from xLVL to xLVL. */
> +	mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +	if (mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> +	    mode != RISCV_IOMMU_DDTP_MODE_OFF &&
> +	    rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> +	    rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
> +		return -EINVAL;
> +
> +	do {
> +		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
> +		if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
> +			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
> +
> +		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
> +		ddtp = riscv_iommu_read_ddtp(iommu);
> +		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
> +			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
> +				rq_mode, ddtp);
> +			return -EBUSY;
> +		}
> +
> +		/* Verify IOMMU hardware accepts new DDTP config. */
> +		mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +
> +		if (rq_mode == mode)
> +			break;
> +
> +		/* Hardware mandatory DDTP mode has not been accepted. */
> +		if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
> +			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
> +				ddtp, rq_ddtp);
> +			return -EINVAL;
> +		}
> +
> +		/*
> +		 * Mode field is WARL, an IOMMU may support a subset of
> +		 * directory table levels in which case if we tried to set
> +		 * an unsupported number of levels we'll readback either
> +		 * a valid xLVL or off/bare. If we got off/bare, try again
> +		 * with a smaller xLVL.
> +		 */
> +		if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
> +		    rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
> +			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
> +			rq_mode--;
> +			continue;
> +		}
> +
> +		/*
> +		 * We tried all supported modes and IOMMU hardware failed to
> +		 * accept new settings, something went very wrong since off/bare
> +		 * and at least one xLVL must be supported.
> +		 */
> +		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
> +			mode, ddtp_mode);
> +		return -EINVAL;
> +	} while (1);
> +
> +	iommu->ddt_mode = mode;
> +	if (mode != ddtp_mode)
> +		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
> +
> +	return 0;
> +}
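
The negotiation loop above relies on the MODE field being WARL: write the
requested mode plus the root page PPN, read back, and step down one level
whenever the hardware kept OFF/BARE. As a sketch of how such a DDTP value
is packed (field positions per the RISC-V IOMMU specification; the
constants here are illustrative, not the driver's):

	#include <stdint.h>

	#define DDTP_MODE_MASK	0xfULL	/* ddtp.iommu_mode, bits 3:0 */
	#define DDTP_PPN_SHIFT	10	/* ddtp.ppn, bits 53:10 */

	static uint64_t ddtp_compose(uint64_t mode, uint64_t root_pa)
	{
		uint64_t ppn = root_pa >> 12;	/* 4 KiB page number */

		return (mode & DDTP_MODE_MASK) | (ppn << DDTP_PPN_SHIFT);
	}
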
> +
> +#define RISCV_IOMMU_FSC_BARE 0
> +
> +/*
> + * Update IODIR for the device.
> + *
> + * During the execution of riscv_iommu_probe_device(), IODIR entries are
> + * allocated for the device's identifiers. Device context invalidation
> + * becomes necessary only if one of the updated entries was previously
> + * marked as valid, given that invalid device context entries are not
> + * cached by the IOMMU hardware.
> + * In this implementation, updating a valid device context while the
> + * device is not quiesced might be disruptive, potentially causing
> + * interim translation faults.
> + */
> +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> +				     struct device *dev, u64 fsc, u64 ta)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_dc *dc;
> +	u64 tc;
> +	int i;
> +
> +	/* Device context invalidation ignored for now. */
> +
> +	/*
> +	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
> +	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
> +	 */
> +	for (i = 0; i < fwspec->num_ids; i++) {
> +		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> +		tc = READ_ONCE(dc->tc);
> +		tc |= ta & RISCV_IOMMU_DC_TC_V;
> +
> +		WRITE_ONCE(dc->fsc, fsc);
> +		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
> +		/* Update device context, write TC.V as the last step. */
> +		dma_wmb();
> +		WRITE_ONCE(dc->tc, tc);
> +	}
> +}
> +
> +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
> +					      struct device *dev)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
> +
> +	return 0;
> +}
> +
> +static struct iommu_domain riscv_iommu_blocking_domain = {
> +	.type = IOMMU_DOMAIN_BLOCKED,
> +	.ops = &(const struct iommu_domain_ops) {
> +		.attach_dev = riscv_iommu_attach_blocking_domain,
> +	}
> +};
> +
>  static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
>  					      struct device *dev)
>  {
> -	/* Global pass-through already enabled, do nothing for now. */
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
> +
>  	return 0;
>  }
>
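The update helper follows the usual publish pattern for descriptors shared
with a hardware walker: store the payload (fsc, ta) first, order those
stores with dma_wmb(), and only then set TC.V, so the IOMMU can never
observe a half-written context. A generic standalone sketch of the same
pattern, using a simplified context layout rather than the driver's struct:

	#include <stdatomic.h>
	#include <stdint.h>

	struct ctx {
		_Atomic uint64_t tc;	/* bit 0 acts as the valid bit */
		uint64_t ta;
		uint64_t fsc;
	};

	static void ctx_publish(struct ctx *c, uint64_t ta, uint64_t fsc)
	{
		c->ta = ta;
		c->fsc = fsc;
		/* release ordering stands in for dma_wmb() + WRITE_ONCE() */
		atomic_store_explicit(&c->tc, 1, memory_order_release);
	}
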
> @@ -72,6 +416,9 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
>  {
>  	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>  	struct riscv_iommu_device *iommu;
> +	struct riscv_iommu_dc *dc;
> +	u64 tc;
> +	int i;
>
>  	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
>  		return ERR_PTR(-ENODEV);
> @@ -80,12 +427,37 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
>  	if (!iommu)
>  		return ERR_PTR(-ENODEV);
>
> +	/*
> +	 * IOMMU hardware operating in fail-over BARE mode will provide
> +	 * identity translation for all connected devices anyway...
> +	 */
> +	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_MODE_BARE)
> +		return ERR_PTR(-ENODEV);
> +
> +	/*
> +	 * Allocate and pre-configure device context entries in
> +	 * the device directory. Do not mark the context valid yet.
> +	 */
> +	tc = 0;
> +	if (iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD)
> +		tc |= RISCV_IOMMU_DC_TC_SADE;
> +	for (i = 0; i < fwspec->num_ids; i++) {
> +		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
> +		if (!dc)
> +			return ERR_PTR(-ENODEV);
> +		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
> +			dev_warn(dev, "already attached to IOMMU device directory\n");
> +		WRITE_ONCE(dc->tc, tc);
> +	}
> +
>  	return &iommu->iommu;
>  }
>
>  static const struct iommu_ops riscv_iommu_ops = {
>  	.of_xlate = riscv_iommu_of_xlate,
>  	.identity_domain = &riscv_iommu_identity_domain,
> +	.blocked_domain = &riscv_iommu_blocking_domain,
> +	.release_domain = &riscv_iommu_blocking_domain,
>  	.def_domain_type = riscv_iommu_device_domain_type,
>  	.device_group = riscv_iommu_device_group,
>  	.probe_device = riscv_iommu_probe_device,
> @@ -128,6 +500,7 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>  {
>  	iommu_device_unregister(&iommu->iommu);
>  	iommu_device_sysfs_remove(&iommu->iommu);
> +	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
>  }
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -138,18 +511,20 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>  	if (rc)
>  		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
>
> -	/*
> -	 * Placeholder for a complete IOMMU device initialization. For now,
> -	 * only bare minimum: enable global identity mapping mode and register sysfs.
> -	 */
> -	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> -			   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> +	rc = riscv_iommu_iodir_alloc(iommu);
> +	if (rc)
> +		return rc;
> +
> +	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +	if (rc)
> +		return rc;
>
>  	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
>  				    dev_name(iommu->dev));
> -	if (rc)
> -		return dev_err_probe(iommu->dev, rc,
> -				     "cannot register sysfs interface\n");
> +	if (rc) {
> +		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
> +		goto err_iodir_off;
> +	}
>
>  	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
>  	if (rc) {
> @@ -161,5 +536,7 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>
>  err_remove_sysfs:
>  	iommu_device_sysfs_remove(&iommu->iommu);
> +err_iodir_off:
> +	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
>  	return rc;
>  }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 700e33dc2446..f1696926582c 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -34,6 +34,11 @@ struct riscv_iommu_device {
>  	/* available interrupt numbers, MSI or WSI */
>  	unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
>  	unsigned int irqs_count;
> +
> +	/* device directory */
> +	unsigned int ddt_mode;
> +	dma_addr_t ddt_phys;
> +	u64 *ddt_root;
>  };
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu);

Reviewed-by: Zong Li <zong.li@sifive.com>

Thanks

> --
> 2.34.1
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv