[v2,5/7] iommu/riscv: Device directory management.

Message ID 232b2824d5dfd9b8dcb3553bfd506444273c3305.1713456598.git.tjeznach@rivosinc.com (mailing list archive)
State Superseded
Series Linux RISC-V IOMMU Support

Checks

Context Check Description
conchuod/vmtest-for-next-PR success PR summary
conchuod/patch-5-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-5-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-5-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-5-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-5-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-5-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-5-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-5-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-5-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-5-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-5-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-5-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Tomasz Jeznach April 18, 2024, 4:32 p.m. UTC
Introduce device context allocation and device directory tree
management, including the capabilities discovery sequence, as described
in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.

Device directory mode will be auto-detected using the DDTP WARL
property, selecting the highest mode supported by both the driver and
the hardware. If no supported mode can be configured, the driver will
fall back to global pass-through.

The first-level DDTP page can be located either in I/O space (detected
using the DDTP WARL property) or in system memory.

Only the identity protection domain is supported by this implementation.

Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/iommu.c | 369 +++++++++++++++++++++++++++++++++++-
 drivers/iommu/riscv/iommu.h |   5 +
 2 files changed, 365 insertions(+), 9 deletions(-)

Comments

Jason Gunthorpe April 19, 2024, 12:40 p.m. UTC | #1
On Thu, Apr 18, 2024 at 09:32:23AM -0700, Tomasz Jeznach wrote:
> @@ -31,13 +32,350 @@ MODULE_LICENSE("GPL");
>  /* Timeouts in [us] */
>  #define RISCV_IOMMU_DDTP_TIMEOUT	50000
>  
> -static int riscv_iommu_attach_identity_domain(struct iommu_domain *domain,
> -					      struct device *dev)
> +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> +#define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> +#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))
> +
> +#define dev_to_iommu(dev) \
> +	container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)

We have iommu_get_iommu_dev() now
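
For reference, a minimal sketch of the helper-based form, assuming the
iommu_get_iommu_dev() container_of wrapper in <linux/iommu.h>:

/* Sketch: the core helper replaces the open-coded container_of(). */
static struct riscv_iommu_device *dev_to_iommu(struct device *dev)
{
	return iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu);
}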

> +static unsigned long riscv_iommu_get_pages(struct riscv_iommu_device *iommu, unsigned int order)
> +{
> +	struct riscv_iommu_devres *devres;
> +	struct page *pages;
> +
> +	pages = alloc_pages_node(dev_to_node(iommu->dev),
> +				 GFP_KERNEL_ACCOUNT | __GFP_ZERO, order);
> +	if (unlikely(!pages)) {
> +		dev_err(iommu->dev, "Page allocation failed, order %u\n", order);
> +		return 0;
> +	}

This needs adjusting for the recently merged allocation accounting
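
A hedged sketch of that adjustment, assuming the accounted allocator
helpers iommu_alloc_pages_node()/iommu_free_pages() from
drivers/iommu/iommu-pages.h are available in the target tree:

static unsigned long riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
					   unsigned int order)
{
	struct riscv_iommu_devres *devres;
	void *addr;

	/* Accounted allocation, charged to the IOMMU page statistics. */
	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
				      GFP_KERNEL_ACCOUNT | __GFP_ZERO, order);
	if (!addr)
		return 0;

	devres = devres_alloc(riscv_iommu_devres_pages_release,
			      sizeof(*devres), GFP_KERNEL);
	if (!devres) {
		iommu_free_pages(addr, order);	/* matching accounted free */
		return 0;
	}

	devres->addr = (unsigned long)addr;
	devres->order = order;
	devres_add(iommu->dev, devres);

	return devres->addr;
}

(The devres release callback would then need the matching
iommu_free_pages() call as well.)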

> +static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
> +				     struct device *dev,
> +				     struct iommu_domain *iommu_domain)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_dc *dc;
> +	u64 fsc, ta, tc;
> +	int i;
> +
> +	if (!iommu_domain) {
> +		ta = 0;
> +		tc = 0;
> +		fsc = 0;
> +	} else if (iommu_domain->type == IOMMU_DOMAIN_IDENTITY) {
> +		ta = 0;
> +		tc = RISCV_IOMMU_DC_TC_V;
> +		fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
> +	} else {
> +		/* This should never happen. */
> +		return -ENODEV;
> +	}

Please don't write it like this. This function is already being called
by functions that are already under specific ops, don't check
domain->type here.

Instead have the caller compute and pass in the ta/tc/fsc
values. Maybe in a tidy struct..
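
Something along these lines, as a sketch (the struct and its use are
illustrative, not taken from the patch):

struct riscv_iommu_dc_config {
	u64 tc;
	u64 ta;
	u64 fsc;
};

static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
				     struct device *dev,
				     const struct riscv_iommu_dc_config *cfg);

static int riscv_iommu_attach_identity_domain(struct iommu_domain *domain,
					      struct device *dev)
{
	/* The identity op knows its own DC encoding; no domain->type check. */
	const struct riscv_iommu_dc_config cfg = {
		.tc  = RISCV_IOMMU_DC_TC_V,
		.fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE,
				  RISCV_IOMMU_DC_FSC_MODE_BARE),
	};

	return riscv_iommu_attach_domain(dev_to_iommu(dev), dev, &cfg);
}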

> +	/* Update existing or allocate new entries in device directory */
> +	for (i = 0; i < fwspec->num_ids; i++) {
> +		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
> +		if (!dc && !iommu_domain)
> +			continue;
> +		if (!dc)
> +			return -ENODEV;

But if this fails, some of the fwspecs are left in a weird state?

Drivers should try hard to have attach functions that fail and make no
change at all or fully succeed.

Meaning ideally preallocate any required memory before doing any
change to the HW visible structures.
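
For example, a two-pass sketch inside riscv_iommu_attach_domain(),
against this patch's riscv_iommu_get_dc() signature (the last argument
selects fetch-only lookup):

/*
 * Pass 1: allocate all directory levels up front; the leaf DCs stay
 * invalid, so nothing changes for the device yet.
 */
for (i = 0; i < fwspec->num_ids; i++)
	if (!riscv_iommu_get_dc(iommu, fwspec->ids[i], false))
		return -ENOMEM;

/* Pass 2: fetch-only lookups can no longer fail; program the entries. */
for (i = 0; i < fwspec->num_ids; i++) {
	dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], true);
	/* write dc->fsc / dc->ta / dc->tc here */
}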

> +
> +		/* Swap device context, update TC valid bit as the last operation */
> +		xchg64(&dc->fsc, fsc);
> +		xchg64(&dc->ta, ta);
> +		xchg64(&dc->tc, tc);

This doesn't look right? When you get to adding PAGING support, fsc has
the page table pfn and ta has the cache tag, so this will end up
tearing the data for sure, eg when asked to replace a PAGING domain
with another PAGING domain? That will create a functional/security
problem, right?

I would encourage you to re-use the ARM sequencing code, ideally moved
to some generic helper library. Every iommu driver dealing with
multi-quanta descriptors seems to have this same fundamental
sequencing problem.
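
A break-before-make sketch of the idea (illustrative only, not the ARM
helper itself):

/*
 * Sketch: invalidate first, then rewrite the multi-quanta fields, then
 * re-validate, so the IOMMU never observes a torn fsc/ta pair.
 */
static void riscv_iommu_update_dc(struct riscv_iommu_dc *dc,
				  u64 tc, u64 ta, u64 fsc)
{
	/* 1. Clear V; the IOMMU stops using this entry. */
	WRITE_ONCE(dc->tc, 0);
	/* 2. A device-context cache invalidation would be issued here. */

	/* 3. Rewrite the fields; no walker can see them half-updated. */
	WRITE_ONCE(dc->fsc, fsc);
	WRITE_ONCE(dc->ta, ta);

	/* 4. Order the field writes before V becomes visible again. */
	dma_wmb();
	WRITE_ONCE(dc->tc, tc);
}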

> +static void riscv_iommu_release_device(struct device *dev)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	riscv_iommu_attach_domain(iommu, dev, NULL);
> +}

The release_domain has landed too now. Please don't invent weird NULL
domain types that have special meaning. I assume clearing the V bit is
a blocking behavior? So please implement a proper blocking domain and
set release_domain = &riscv_iommu_blocking and just omit this release
function.
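
A sketch of that shape, reusing this patch's attach helper (names are
illustrative):

static int riscv_iommu_attach_blocking_domain(struct iommu_domain *domain,
					      struct device *dev)
{
	/* Clearing TC.V blocks (and reports) all DMA from this device. */
	return riscv_iommu_attach_domain(dev_to_iommu(dev), dev, NULL);
}

static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	},
};

/* and in riscv_iommu_ops: .release_domain = &riscv_iommu_blocking_domain */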

> @@ -133,12 +480,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>  	rc = riscv_iommu_init_check(iommu);
>  	if (rc)
>  		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> -	/*
> -	 * Placeholder for a complete IOMMU device initialization.
> -	 * For now, only bare minimum: enable global identity mapping mode and register sysfs.
> -	 */
> -	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> -			   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> +
> +	rc = riscv_iommu_ddt_alloc(iommu);
> +	if (WARN(rc, "cannot allocate device directory\n"))
> +		goto err_init;

memory allocation failure already makes noisy prints, more prints are
not needed..

> +	rc = riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +	if (WARN(rc, "cannot enable iommu device\n"))
> +		goto err_init;

This is not a proper use of WARN, it should only be used for things
that cannot happen, not undesired error paths.

Jason
Baolu Lu April 22, 2024, 5:11 a.m. UTC | #2
On 4/19/24 12:32 AM, Tomasz Jeznach wrote:
> Introduce device context allocation and device directory tree
> management, including the capabilities discovery sequence, as described
> in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.
> 
> Device directory mode will be auto-detected using the DDTP WARL
> property, selecting the highest mode supported by both the driver and
> the hardware. If no supported mode can be configured, the driver will
> fall back to global pass-through.
> 
> The first-level DDTP page can be located either in I/O space (detected
> using the DDTP WARL property) or in system memory.
> 
> Only the identity protection domain is supported by this implementation.
> 
> Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>   drivers/iommu/riscv/iommu.c | 369 +++++++++++++++++++++++++++++++++++-
>   drivers/iommu/riscv/iommu.h |   5 +
>   2 files changed, 365 insertions(+), 9 deletions(-)

[ ... ]

> +
> +/*
> + * Discover supported DDT modes starting from requested value,
> + * configure DDTP register with accepted mode and root DDT address.
> + * Accepted iommu->ddt_mode is updated on success.
> + */
> +static int riscv_iommu_set_ddtp_mode(struct riscv_iommu_device *iommu,
> +				     unsigned int ddtp_mode)
> +{
> +	struct device *dev = iommu->dev;
> +	u64 ddtp, rq_ddtp;
> +	unsigned int mode, rq_mode = ddtp_mode;
> +	int rc;
> +
> +	rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
> +					ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
> +					10, RISCV_IOMMU_DDTP_TIMEOUT);
> +	if (rc < 0)
> +		return -EBUSY;
> +
> +	/* Disallow state transition from xLVL to xLVL. */
> +	switch (FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp)) {
> +	case RISCV_IOMMU_DDTP_MODE_BARE:
> +	case RISCV_IOMMU_DDTP_MODE_OFF:
> +		break;
> +	default:
> +		if (rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> +		    rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
> +			return -EINVAL;

Is this check a duplicate? It appears that it's always true in the default
branch.

> +		break;
> +	}
> +
> +	do {
> +		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
> +		if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
> +			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
> +
> +		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
> +
> +		rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
> +						ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
> +						10, RISCV_IOMMU_DDTP_TIMEOUT);
> +		if (rc < 0) {
> +			dev_warn(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
> +				 rq_mode, ddtp);
> +			return -EBUSY;
> +		}
> +
> +		/* Verify IOMMU hardware accepts new DDTP config. */
> +		mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> +
> +		if (rq_mode == mode)
> +			break;
> +
> +		/* Hardware mandatory DDTP mode has not been accepted. */
> +		if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
> +			dev_warn(dev, "DDTP update failed hw: %llx vs %llx\n", ddtp, rq_ddtp);
> +			return -EINVAL;
> +		}
> +
> +		/*
> +		 * Mode field is WARL, an IOMMU may support a subset of
> +		 * directory table levels in which case if we tried to set
> +		 * an unsupported number of levels we'll readback either
> +		 * a valid xLVL or off/bare. If we got off/bare, try again
> +		 * with a smaller xLVL.
> +		 */
> +		if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
> +		    rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
> +			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
> +			rq_mode--;
> +			continue;
> +		}
> +
> +		/*
> +		 * We tried all supported modes and IOMMU hardware failed to
> +		 * accept new settings, something went very wrong since off/bare
> +		 * and at least one xLVL must be supported.
> +		 */
> +		dev_warn(dev, "DDTP hw mode %u, failed to set %u\n", mode, ddtp_mode);
> +		return -EINVAL;
> +	} while (1);
> +
> +	iommu->ddt_mode = mode;
> +	if (mode != ddtp_mode)
> +		dev_warn(dev, "DDTP failover to %u mode, requested %u\n",
> +			 mode, ddtp_mode);
> +
> +	return 0;
> +}
> +

[ ... ]

> +
> +static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
> +				     struct device *dev,
> +				     struct iommu_domain *iommu_domain)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_dc *dc;
> +	u64 fsc, ta, tc;
> +	int i;
> +
> +	if (!iommu_domain) {
> +		ta = 0;
> +		tc = 0;
> +		fsc = 0;
> +	} else if (iommu_domain->type == IOMMU_DOMAIN_IDENTITY) {
> +		ta = 0;
> +		tc = RISCV_IOMMU_DC_TC_V;
> +		fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
> +	} else {
> +		/* This should never happen. */
> +		return -ENODEV;
> +	}

Move the domain->type check code to the domain-specific ops.

> +
> +	/* Update existing or allocate new entries in device directory */
> +	for (i = 0; i < fwspec->num_ids; i++) {
> +		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
> +		if (!dc && !iommu_domain)
> +			continue;
> +		if (!dc)
> +			return -ENODEV;
> +
> +		/* Swap device context, update TC valid bit as the last operation */
> +		xchg64(&dc->fsc, fsc);
> +		xchg64(&dc->ta, ta);
> +		xchg64(&dc->tc, tc);
> +
> +		/* Device context invalidation will be required. Ignoring for now. */
> +	}
> +
>   	return 0;
>   }
>   
> +static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
> +					      struct device *dev)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	/* Global pass-through already enabled, do nothing. */
> +	if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
> +		return 0;
> +
> +	return riscv_iommu_attach_domain(iommu, dev, iommu_domain);
> +}
> +
>   static struct iommu_domain riscv_iommu_identity_domain = {
>   	.type = IOMMU_DOMAIN_IDENTITY,
>   	.ops = &(const struct iommu_domain_ops) {
> @@ -82,6 +420,13 @@ static void riscv_iommu_probe_finalize(struct device *dev)
>   	iommu_setup_dma_ops(dev, 0, U64_MAX);
>   }
>   
> +static void riscv_iommu_release_device(struct device *dev)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	riscv_iommu_attach_domain(iommu, dev, NULL);

Attaching a NULL domain to a device has already been removed. You can
use the iommu_ops->release_domain here.

> +}
> +
>   static const struct iommu_ops riscv_iommu_ops = {
>   	.owner = THIS_MODULE,
>   	.of_xlate = riscv_iommu_of_xlate,
> @@ -90,6 +435,7 @@ static const struct iommu_ops riscv_iommu_ops = {
>   	.device_group = riscv_iommu_device_group,
>   	.probe_device = riscv_iommu_probe_device,
>   	.probe_finalize = riscv_iommu_probe_finalize,

The probe_finalize op will be removed soon.

https://lore.kernel.org/linux-iommu/bebea331c1d688b34d9862eefd5ede47503961b8.1713523152.git.robin.murphy@arm.com/

> +	.release_device = riscv_iommu_release_device,
>   };
>   
>   static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> @@ -124,6 +470,7 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>   {
>   	iommu_device_unregister(&iommu->iommu);
>   	iommu_device_sysfs_remove(&iommu->iommu);
> +	riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
>   }
>   
>   int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -133,12 +480,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   	rc = riscv_iommu_init_check(iommu);
>   	if (rc)
>   		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> -	/*
> -	 * Placeholder for a complete IOMMU device initialization.
> -	 * For now, only bare minimum: enable global identity mapping mode and register sysfs.
> -	 */
> -	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> -			   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> +
> +	rc = riscv_iommu_ddt_alloc(iommu);
> +	if (WARN(rc, "cannot allocate device directory\n"))
> +		goto err_init;
> +
> +	rc = riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> +	if (WARN(rc, "cannot enable iommu device\n"))
> +		goto err_init;
>   
>   	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
>   				    dev_name(iommu->dev));
> @@ -154,5 +503,7 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   err_iommu:
>   	iommu_device_sysfs_remove(&iommu->iommu);
>   err_sysfs:
> +	riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> +err_init:
>   	return rc;
>   }
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 700e33dc2446..f1696926582c 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -34,6 +34,11 @@ struct riscv_iommu_device {
>   	/* available interrupt numbers, MSI or WSI */
>   	unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
>   	unsigned int irqs_count;
> +
> +	/* device directory */
> +	unsigned int ddt_mode;
> +	dma_addr_t ddt_phys;
> +	u64 *ddt_root;
>   };
>   
>   int riscv_iommu_init(struct riscv_iommu_device *iommu);

Best regards,
baolu
Tomasz Jeznach April 24, 2024, 11:01 p.m. UTC | #3
On Fri, Apr 19, 2024 at 5:40 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Apr 18, 2024 at 09:32:23AM -0700, Tomasz Jeznach wrote:
> > @@ -31,13 +32,350 @@ MODULE_LICENSE("GPL");
> >  /* Timeouts in [us] */
> >  #define RISCV_IOMMU_DDTP_TIMEOUT     50000
> >
> > -static int riscv_iommu_attach_identity_domain(struct iommu_domain *domain,
> > -                                           struct device *dev)
> > +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> > +#define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> > +#define ppn_to_phys(pn)       (((pn) << 2) & (((1ULL << 44) - 1) << 12))
> > +
> > +#define dev_to_iommu(dev) \
> > +     container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
>
> We have iommu_get_iommu_dev() now
>
> > +static unsigned long riscv_iommu_get_pages(struct riscv_iommu_device *iommu, unsigned int order)
> > +{
> > +     struct riscv_iommu_devres *devres;
> > +     struct page *pages;
> > +
> > +     pages = alloc_pages_node(dev_to_node(iommu->dev),
> > +                              GFP_KERNEL_ACCOUNT | __GFP_ZERO, order);
> > +     if (unlikely(!pages)) {
> > +             dev_err(iommu->dev, "Page allocation failed, order %u\n", order);
> > +             return 0;
> > +     }
>
> This needs adjusting for the recently merged allocation accounting
>
> > +static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
> > +                                  struct device *dev,
> > +                                  struct iommu_domain *iommu_domain)
> > +{
> > +     struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +     struct riscv_iommu_dc *dc;
> > +     u64 fsc, ta, tc;
> > +     int i;
> > +
> > +     if (!iommu_domain) {
> > +             ta = 0;
> > +             tc = 0;
> > +             fsc = 0;
> > +     } else if (iommu_domain->type == IOMMU_DOMAIN_IDENTITY) {
> > +             ta = 0;
> > +             tc = RISCV_IOMMU_DC_TC_V;
> > +             fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
> > +     } else {
> > +             /* This should never happen. */
> > +             return -ENODEV;
> > +     }
>
> Please don't write it like this. This function is already being called
> by functions that are already under specific ops, don't check
> domain->type here.
>
> Instead have the caller compute and pass in the ta/tc/fsc
> values. Maybe in a tidy struct..
>
> > +     /* Update existing or allocate new entries in device directory */
> > +     for (i = 0; i < fwspec->num_ids; i++) {
> > +             dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
> > +             if (!dc && !iommu_domain)
> > +                     continue;
> > +             if (!dc)
> > +                     return -ENODEV;
>
> > But if this fails, some of the fwspecs are left in a weird state?
>
> Drivers should try hard to have attach functions that fail and make no
> change at all or fully succeed.
>
> Meaning ideally preallocate any required memory before doing any
> > change to the HW visible structures.
>

Good point. Done.
Looking at the fwspec->ids[] I'm assuming nobody will add/modify the
IDs after iommu_probe_device() completes.

> > +
> > +             /* Swap device context, update TC valid bit as the last operation */
> > +             xchg64(&dc->fsc, fsc);
> > +             xchg64(&dc->ta, ta);
> > +             xchg64(&dc->tc, tc);
>
> > This doesn't look right? When you get to adding PAGING support, fsc has
> the page table pfn and ta has the cache tag, so this will end up
> tearing the data for sure, eg when asked to replace a PAGING domain
> with another PAGING domain? That will create a functional/security
> problem, right?
>
> I would encourage you to re-use the ARM sequencing code, ideally moved
> to some generic helper library. Every iommu driver dealing with
> multi-quanta descriptors seems to have this same fundamental
> sequencing problem.
>

Good point. Reworked.

> > +static void riscv_iommu_release_device(struct device *dev)
> > +{
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +
> > +     riscv_iommu_attach_domain(iommu, dev, NULL);
> > +}
>
> The release_domain has landed too now. Please don't invent weird NULL
> domain types that have special meaning. I assume clearing the V bit is
> a blocking behavior? So please implement a proper blocking domain and
> set release_domain = &riscv_iommu_blocking and just omit this release
> function.
>

Updated to use release_domain; should be cleaner now.
Clearing TC.V is a blocking (but noisy) behavior, which should be fine
for the release domain where devices should already be quiesced.

> > @@ -133,12 +480,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >       rc = riscv_iommu_init_check(iommu);
> >       if (rc)
> >               return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> > -     /*
> > -      * Placeholder for a complete IOMMU device initialization.
> > -      * For now, only bare minimum: enable global identity mapping mode and register sysfs.
> > -      */
> > -     riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> > -                        FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> > +
> > +     rc = riscv_iommu_ddt_alloc(iommu);
> > +     if (WARN(rc, "cannot allocate device directory\n"))
> > +             goto err_init;
>
> memory allocation failure already makes noisy prints, more prints are
> not needed..
>
> > +     rc = riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > +     if (WARN(rc, "cannot enable iommu device\n"))
> > +             goto err_init;
>
> This is not a proper use of WARN, it should only be used for things
> that cannot happen, not undesired error paths.
>
> Jason

Thanks, ack to all. Will push updated v3 shortly.
- Tomasz
Jason Gunthorpe April 24, 2024, 11:07 p.m. UTC | #4
On Wed, Apr 24, 2024 at 04:01:04PM -0700, Tomasz Jeznach wrote:
> > > +     /* Update existing or allocate new entries in device directory */
> > > +     for (i = 0; i < fwspec->num_ids; i++) {
> > > +             dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
> > > +             if (!dc && !iommu_domain)
> > > +                     continue;
> > > +             if (!dc)
> > > +                     return -ENODEV;
> >
> > But if this fails, some of the fwspecs are left in a weird state?
> >
> > Drivers should try hard to have attach functions that fail and make no
> > change at all or fully succeed.
> >
> > Meaning ideally preallocate any required memory before doing any
> > change to the HW visible structures.
> 
> Good point. Done.
> Looking at the fwspec->ids[] I'm assuming nobody will add/modify the
> IDs after iommu_probe_device() completes.

Yes

> > > +             /* Swap device context, update TC valid bit as the last operation */
> > > +             xchg64(&dc->fsc, fsc);
> > > +             xchg64(&dc->ta, ta);
> > > +             xchg64(&dc->tc, tc);
> >
> > This doesn't look right? When you get to adding PAGING support, fsc has
> > the page table pfn and ta has the cache tag, so this will end up
> > tearing the data for sure, eg when asked to replace a PAGING domain
> > with another PAGING domain? That will create a functional/security
> > problem, right?
> >
> > I would encourage you to re-use the ARM sequencing code, ideally moved
> > to some generic helper library. Every iommu driver dealing with
> > multi-quanta descriptors seems to have this same fundamental
> > sequencing problem.
> >
> 
> Good point. Reworked.

I suppose by force clearing the v bit before starting the sequence?

That is OK but won't support some non-embedded focused features in the
long run. It is a good approach to get the driver landed though.
 
> > The release_domain has landed too now. Please don't invent weird NULL
> > domain types that have special meaning. I assume clearing the V bit is
> > a blocking behavior? So please implement a proper blocking domain and
> > set release_domain = &riscv_iommu_blocking and just omit this release
> > function.
> >
> 
> Updated to use release_domain, should be cleaner now.
> Clearing TC.V is a blocking (but noisy) behavior, should be fine for
> release domain where devices should be quiesced already.

blocking is fine to be noisy.

Jason
Tomasz Jeznach April 24, 2024, 11:07 p.m. UTC | #5
On Sun, Apr 21, 2024 at 10:13 PM Baolu Lu <baolu.lu@linux.intel.com> wrote:
>
> On 4/19/24 12:32 AM, Tomasz Jeznach wrote:
> > Introduce device context allocation and device directory tree
> > management, including the capabilities discovery sequence, as described
> > in Chapter 2.1 of the RISC-V IOMMU Architecture Specification.
> >
> > Device directory mode will be auto-detected using the DDTP WARL
> > property, selecting the highest mode supported by both the driver and
> > the hardware. If no supported mode can be configured, the driver will
> > fall back to global pass-through.
> >
> > The first-level DDTP page can be located either in I/O space (detected
> > using the DDTP WARL property) or in system memory.
> >
> > Only the identity protection domain is supported by this implementation.
> >
> > Co-developed-by: Nick Kossifidis <mick@ics.forth.gr>
> > Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
> > Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> > ---
> >   drivers/iommu/riscv/iommu.c | 369 +++++++++++++++++++++++++++++++++++-
> >   drivers/iommu/riscv/iommu.h |   5 +
> >   2 files changed, 365 insertions(+), 9 deletions(-)
>
> [ ... ]
>
> > +
> > +/*
> > + * Discover supported DDT modes starting from requested value,
> > + * configure DDTP register with accepted mode and root DDT address.
> > + * Accepted iommu->ddt_mode is updated on success.
> > + */
> > +static int riscv_iommu_set_ddtp_mode(struct riscv_iommu_device *iommu,
> > +                                  unsigned int ddtp_mode)
> > +{
> > +     struct device *dev = iommu->dev;
> > +     u64 ddtp, rq_ddtp;
> > +     unsigned int mode, rq_mode = ddtp_mode;
> > +     int rc;
> > +
> > +     rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
> > +                                     ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
> > +                                     10, RISCV_IOMMU_DDTP_TIMEOUT);
> > +     if (rc < 0)
> > +             return -EBUSY;
> > +
> > +     /* Disallow state transition from xLVL to xLVL. */
> > +     switch (FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp)) {
> > +     case RISCV_IOMMU_DDTP_MODE_BARE:
> > +     case RISCV_IOMMU_DDTP_MODE_OFF:
> > +             break;
> > +     default:
> > +             if (rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
> > +                 rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
> > +                     return -EINVAL;
>
> Is this check a duplicate? It appears that it's always true in the default
> branch.
>

No. The condition in the switch represents the current mode, while the
check in the default branch validates the desired mode the device will
be configured to. I've reworked the code in v3 to be more readable.
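
Restating the intended rule from this v2 as a sketch (the actual v3
rework may differ):

/* Current mode is some xLVL: only OFF or BARE may be requested. */
unsigned int cur = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);

if (cur >= RISCV_IOMMU_DDTP_MODE_1LVL &&
    rq_mode != RISCV_IOMMU_DDTP_MODE_OFF &&
    rq_mode != RISCV_IOMMU_DDTP_MODE_BARE)
	return -EINVAL;	/* disallow xLVL -> xLVL transitions */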

> > +             break;
> > +     }
> > +
> > +     do {
> > +             rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
> > +             if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
> > +                     rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
> > +
> > +             riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
> > +
> > +             rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
> > +                                             ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
> > +                                             10, RISCV_IOMMU_DDTP_TIMEOUT);
> > +             if (rc < 0) {
> > +                     dev_warn(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
> > +                              rq_mode, ddtp);
> > +                     return -EBUSY;
> > +             }
> > +
> > +             /* Verify IOMMU hardware accepts new DDTP config. */
> > +             mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
> > +
> > +             if (rq_mode == mode)
> > +                     break;
> > +
> > +             /* Hardware mandatory DDTP mode has not been accepted. */
> > +             if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
> > +                     dev_warn(dev, "DDTP update failed hw: %llx vs %llx\n", ddtp, rq_ddtp);
> > +                     return -EINVAL;
> > +             }
> > +
> > +             /*
> > +              * Mode field is WARL, an IOMMU may support a subset of
> > +              * directory table levels in which case if we tried to set
> > +              * an unsupported number of levels we'll readback either
> > +              * an unsupported number of levels we'll read back either
> > +              * with a smaller xLVL.
> > +              */
> > +             if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
> > +                 rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
> > +                     dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
> > +                     rq_mode--;
> > +                     continue;
> > +             }
> > +
> > +             /*
> > +              * We tried all supported modes and IOMMU hardware failed to
> > +              * accept new settings, something went very wrong since off/bare
> > +              * and at least one xLVL must be supported.
> > +              */
> > +             dev_warn(dev, "DDTP hw mode %u, failed to set %u\n", mode, ddtp_mode);
> > +             return -EINVAL;
> > +     } while (1);
> > +
> > +     iommu->ddt_mode = mode;
> > +     if (mode != ddtp_mode)
> > +             dev_warn(dev, "DDTP failover to %u mode, requested %u\n",
> > +                      mode, ddtp_mode);
> > +
> > +     return 0;
> > +}
> > +
>
> [ ... ]
>
> > +
> > +static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
> > +                                  struct device *dev,
> > +                                  struct iommu_domain *iommu_domain)
> > +{
> > +     struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +     struct riscv_iommu_dc *dc;
> > +     u64 fsc, ta, tc;
> > +     int i;
> > +
> > +     if (!iommu_domain) {
> > +             ta = 0;
> > +             tc = 0;
> > +             fsc = 0;
> > +     } else if (iommu_domain->type == IOMMU_DOMAIN_IDENTITY) {
> > +             ta = 0;
> > +             tc = RISCV_IOMMU_DC_TC_V;
> > +             fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
> > +     } else {
> > +             /* This should never happen. */
> > +             return -ENODEV;
> > +     }
>
> Move the domain->type check code to the domain-specific ops.
>
> > +
> > +     /* Update existing or allocate new entries in device directory */
> > +     for (i = 0; i < fwspec->num_ids; i++) {
> > +             dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
> > +             if (!dc && !iommu_domain)
> > +                     continue;
> > +             if (!dc)
> > +                     return -ENODEV;
> > +
> > +             /* Swap device context, update TC valid bit as the last operation */
> > +             xchg64(&dc->fsc, fsc);
> > +             xchg64(&dc->ta, ta);
> > +             xchg64(&dc->tc, tc);
> > +
> > +             /* Device context invalidation will be required. Ignoring for now. */
> > +     }
> > +
> >       return 0;
> >   }
> >
> > +static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
> > +                                           struct device *dev)
> > +{
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +
> > +     /* Global pass-through already enabled, do nothing. */
> > +     if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
> > +             return 0;
> > +
> > +     return riscv_iommu_attach_domain(iommu, dev, iommu_domain);
> > +}
> > +
> >   static struct iommu_domain riscv_iommu_identity_domain = {
> >       .type = IOMMU_DOMAIN_IDENTITY,
> >       .ops = &(const struct iommu_domain_ops) {
> > @@ -82,6 +420,13 @@ static void riscv_iommu_probe_finalize(struct device *dev)
> >       iommu_setup_dma_ops(dev, 0, U64_MAX);
> >   }
> >
> > +static void riscv_iommu_release_device(struct device *dev)
> > +{
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +
> > +     riscv_iommu_attach_domain(iommu, dev, NULL);
>
> Attaching a NULL domain to a device has already been removed. You can
> use the iommu_ops->release_domain here.
>
> > +}
> > +
> >   static const struct iommu_ops riscv_iommu_ops = {
> >       .owner = THIS_MODULE,
> >       .of_xlate = riscv_iommu_of_xlate,
> > @@ -90,6 +435,7 @@ static const struct iommu_ops riscv_iommu_ops = {
> >       .device_group = riscv_iommu_device_group,
> >       .probe_device = riscv_iommu_probe_device,
> >       .probe_finalize = riscv_iommu_probe_finalize,
>
> The probe_finalize op will be removed soon.
>
> https://lore.kernel.org/linux-iommu/bebea331c1d688b34d9862eefd5ede47503961b8.1713523152.git.robin.murphy@arm.com/

Thanks, I'm aware of the change; once it is pulled into iommu/next
I'll just remove probe_finalize.

>
> > +     .release_device = riscv_iommu_release_device,
> >   };
> >
> >   static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> > @@ -124,6 +470,7 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> >   {
> >       iommu_device_unregister(&iommu->iommu);
> >       iommu_device_sysfs_remove(&iommu->iommu);
> > +     riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> >   }
> >
> >   int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > @@ -133,12 +480,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >       rc = riscv_iommu_init_check(iommu);
> >       if (rc)
> >               return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> > -     /*
> > -      * Placeholder for a complete IOMMU device initialization.
> > -      * For now, only bare minimum: enable global identity mapping mode and register sysfs.
> > -      */
> > -     riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
> > -                        FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
> > +
> > +     rc = riscv_iommu_ddt_alloc(iommu);
> > +     if (WARN(rc, "cannot allocate device directory\n"))
> > +             goto err_init;
> > +
> > +     rc = riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > +     if (WARN(rc, "cannot enable iommu device\n"))
> > +             goto err_init;
> >
> >       rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
> >                                   dev_name(iommu->dev));
> > @@ -154,5 +503,7 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> >   err_iommu:
> >       iommu_device_sysfs_remove(&iommu->iommu);
> >   err_sysfs:
> > +     riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +err_init:
> >       return rc;
> >   }
> > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > index 700e33dc2446..f1696926582c 100644
> > --- a/drivers/iommu/riscv/iommu.h
> > +++ b/drivers/iommu/riscv/iommu.h
> > @@ -34,6 +34,11 @@ struct riscv_iommu_device {
> >       /* available interrupt numbers, MSI or WSI */
> >       unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
> >       unsigned int irqs_count;
> > +
> > +     /* device directory */
> > +     unsigned int ddt_mode;
> > +     dma_addr_t ddt_phys;
> > +     u64 *ddt_root;
> >   };
> >
> >   int riscv_iommu_init(struct riscv_iommu_device *iommu);
>
> Best regards,
> baolu

Thank you, Best.
- Tomasz

Patch

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index d38317cb2493..721cc71cb959 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -16,6 +16,7 @@ 
 #include <linux/dma-mapping.h>
 #include <linux/init.h>
 #include <linux/iommu.h>
+#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -31,13 +32,350 @@  MODULE_LICENSE("GPL");
 /* Timeouts in [us] */
 #define RISCV_IOMMU_DDTP_TIMEOUT	50000
 
-static int riscv_iommu_attach_identity_domain(struct iommu_domain *domain,
-					      struct device *dev)
+/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
+#define phys_to_ppn(va)  (((va) >> 2) & (((1ULL << 44) - 1) << 10))
+#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))
+
+#define dev_to_iommu(dev) \
+	container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
+
+/* Device resource-managed allocations */
+struct riscv_iommu_devres {
+	unsigned long addr;
+	unsigned int order;
+};
+
+static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
+{
+	struct riscv_iommu_devres *devres = res;
+
+	free_pages(devres->addr, devres->order);
+}
+
+static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
+{
+	struct riscv_iommu_devres *devres = res;
+	struct riscv_iommu_devres *target = p;
+
+	return devres->addr == target->addr;
+}
+
+static unsigned long riscv_iommu_get_pages(struct riscv_iommu_device *iommu, unsigned int order)
+{
+	struct riscv_iommu_devres *devres;
+	struct page *pages;
+
+	pages = alloc_pages_node(dev_to_node(iommu->dev),
+				 GFP_KERNEL_ACCOUNT | __GFP_ZERO, order);
+	if (unlikely(!pages)) {
+		dev_err(iommu->dev, "Page allocation failed, order %u\n", order);
+		return 0;
+	}
+
+	devres = devres_alloc(riscv_iommu_devres_pages_release,
+			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
+
+	if (unlikely(!devres)) {
+		__free_pages(pages, order);
+		return 0;
+	}
+
+	devres->addr = (unsigned long)page_address(pages);
+	devres->order = order;
+
+	devres_add(iommu->dev, devres);
+
+	return devres->addr;
+}
+
+static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, unsigned long addr)
+{
+	struct riscv_iommu_devres devres = { .addr = addr };
+
+	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
+		       riscv_iommu_devres_pages_match, &devres);
+}
+
+/* Lookup and initialize device context info structure. */
+static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
+						 unsigned int devid, bool fetch)
+{
+	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT);
+	unsigned int depth;
+	unsigned long ddt, ptr, old, new;
+	u8 ddi_bits[3] = { 0 };
+	u64 *ddtp = NULL;
+
+	/* Make sure the mode is valid */
+	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_MODE_1LVL ||
+	    iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_3LVL)
+		return NULL;
+
+	/*
+	 * Device id partitioning for base format:
+	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
+	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
+	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
+	 *
+	 * For extended format:
+	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
+	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
+	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
+	 */
+	if (base_format) {
+		ddi_bits[0] = 7;
+		ddi_bits[1] = 7 + 9;
+		ddi_bits[2] = 7 + 9 + 8;
+	} else {
+		ddi_bits[0] = 6;
+		ddi_bits[1] = 6 + 9;
+		ddi_bits[2] = 6 + 9 + 9;
+	}
+
+	/* Make sure device id is within range */
+	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_MODE_1LVL;
+	if (devid >= (1 << ddi_bits[depth]))
+		return NULL;
+
+	/* Get to the level of the non-leaf node that holds the device context */
+	for (ddtp = iommu->ddt_root; depth-- > 0;) {
+		const int split = ddi_bits[depth];
+		/*
+		 * Each non-leaf node is 64bits wide and on each level
+		 * nodes are indexed by DDI[depth].
+		 */
+		ddtp += (devid >> split) & 0x1FF;
+
+		/*
+		 * Check if this node has been populated and if not
+		 * allocate a new level and populate it.
+		 */
+		do {
+			ddt = READ_ONCE(*(unsigned long *)ddtp);
+			if (ddt & RISCV_IOMMU_DDTE_VALID) {
+				ddtp = __va(ppn_to_phys(ddt));
+				break;
+			}
+
+			/* Fetch only, do not allocate new device context. */
+			if (fetch)
+				return NULL;
+
+			ptr = riscv_iommu_get_pages(iommu, 0);
+			if (!ptr)
+				return NULL;
+
+			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_VALID;
+			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
+
+			if (old == ddt) {
+				ddtp = (u64 *)ptr;
+				break;
+			}
+
+			/* Race setting DDT detected, re-read and retry. */
+			riscv_iommu_free_pages(iommu, ptr);
+		} while (1);
+	}
+
+	/*
+	 * Grab the node that matches DDI[depth], note that when using base
+	 * format the device context is 4 * 64bits, and the extended format
+	 * is 8 * 64bits, hence the (3 - base_format) below.
+	 */
+	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
+
+	return (struct riscv_iommu_dc *)ddtp;
+}
+
+/*
+ * Discover supported DDT modes starting from requested value,
+ * configure DDTP register with accepted mode and root DDT address.
+ * Accepted iommu->ddt_mode is updated on success.
+ */
+static int riscv_iommu_set_ddtp_mode(struct riscv_iommu_device *iommu,
+				     unsigned int ddtp_mode)
+{
+	struct device *dev = iommu->dev;
+	u64 ddtp, rq_ddtp;
+	unsigned int mode, rq_mode = ddtp_mode;
+	int rc;
+
+	rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
+					ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
+					10, RISCV_IOMMU_DDTP_TIMEOUT);
+	if (rc < 0)
+		return -EBUSY;
+
+	/* Disallow state transition from xLVL to xLVL. */
+	switch (FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp)) {
+	case RISCV_IOMMU_DDTP_MODE_BARE:
+	case RISCV_IOMMU_DDTP_MODE_OFF:
+		break;
+	default:
+		if (rq_mode != RISCV_IOMMU_DDTP_MODE_BARE &&
+		    rq_mode != RISCV_IOMMU_DDTP_MODE_OFF)
+			return -EINVAL;
+		break;
+	}
+
+	do {
+		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_MODE, rq_mode);
+		if (rq_mode > RISCV_IOMMU_DDTP_MODE_BARE)
+			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
+
+		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
+
+		rc = readq_relaxed_poll_timeout(iommu->reg + RISCV_IOMMU_REG_DDTP,
+						ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
+						10, RISCV_IOMMU_DDTP_TIMEOUT);
+		if (rc < 0) {
+			dev_warn(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
+				 rq_mode, ddtp);
+			return -EBUSY;
+		}
+
+		/* Verify IOMMU hardware accepts new DDTP config. */
+		mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
+
+		if (rq_mode == mode)
+			break;
+
+		/* Hardware mandatory DDTP mode has not been accepted. */
+		if (rq_mode < RISCV_IOMMU_DDTP_MODE_1LVL && rq_ddtp != ddtp) {
+			dev_warn(dev, "DDTP update failed hw: %llx vs %llx\n", ddtp, rq_ddtp);
+			return -EINVAL;
+		}
+
+		/*
+		 * Mode field is WARL, an IOMMU may support a subset of
+		 * directory table levels in which case if we tried to set
+		 * an unsupported number of levels we'll read back either
+		 * a valid xLVL or off/bare. If we got off/bare, try again
+		 * with a smaller xLVL.
+		 */
+		if (mode < RISCV_IOMMU_DDTP_MODE_1LVL &&
+		    rq_mode > RISCV_IOMMU_DDTP_MODE_1LVL) {
+			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
+			rq_mode--;
+			continue;
+		}
+
+		/*
+		 * We tried all supported modes and IOMMU hardware failed to
+		 * accept new settings, something went very wrong since off/bare
+		 * and at least one xLVL must be supported.
+		 */
+		dev_warn(dev, "DDTP hw mode %u, failed to set %u\n", mode, ddtp_mode);
+		return -EINVAL;
+	} while (1);
+
+	iommu->ddt_mode = mode;
+	if (mode != ddtp_mode)
+		dev_warn(dev, "DDTP failover to %u mode, requested %u\n",
+			 mode, ddtp_mode);
+
+	return 0;
+}
+
+static int riscv_iommu_ddt_alloc(struct riscv_iommu_device *iommu)
 {
-	/* Global pass-through already enabled, do nothing for now. */
+	u64 ddtp;
+	unsigned int mode;
+
+	riscv_iommu_readq_timeout(iommu, RISCV_IOMMU_REG_DDTP,
+				  ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
+				  10, RISCV_IOMMU_DDTP_TIMEOUT);
+
+	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+		return -EBUSY;
+
+	/*
+	 * It is optional for the hardware to report a fixed address for device
+	 * directory root page when DDT.MODE is OFF or BARE.
+	 */
+	mode = FIELD_GET(RISCV_IOMMU_DDTP_MODE, ddtp);
+	if (mode != RISCV_IOMMU_DDTP_MODE_BARE && mode != RISCV_IOMMU_DDTP_MODE_OFF) {
+		/* Use WARL to discover hardware fixed DDT PPN */
+		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+				   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, mode));
+		riscv_iommu_readl_timeout(iommu, RISCV_IOMMU_REG_DDTP,
+					  ddtp, !(ddtp & RISCV_IOMMU_DDTP_BUSY),
+					  10, RISCV_IOMMU_DDTP_TIMEOUT);
+		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+			return -EBUSY;
+
+		iommu->ddt_phys = ppn_to_phys(ddtp);
+		if (iommu->ddt_phys)
+			iommu->ddt_root = devm_ioremap(iommu->dev, iommu->ddt_phys, PAGE_SIZE);
+		if (iommu->ddt_root)
+			memset(iommu->ddt_root, 0, PAGE_SIZE);
+	}
+
+	if (!iommu->ddt_root) {
+		iommu->ddt_root = (u64 *)riscv_iommu_get_pages(iommu, 0);
+		iommu->ddt_phys = __pa(iommu->ddt_root);
+	}
+
+	if (!iommu->ddt_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
+				     struct device *dev,
+				     struct iommu_domain *iommu_domain)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct riscv_iommu_dc *dc;
+	u64 fsc, ta, tc;
+	int i;
+
+	if (!iommu_domain) {
+		ta = 0;
+		tc = 0;
+		fsc = 0;
+	} else if (iommu_domain->type == IOMMU_DOMAIN_IDENTITY) {
+		ta = 0;
+		tc = RISCV_IOMMU_DC_TC_V;
+		fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
+	} else {
+		/* This should never happen. */
+		return -ENODEV;
+	}
+
+	/* Update existing or allocate new entries in device directory */
+	for (i = 0; i < fwspec->num_ids; i++) {
+		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i], !iommu_domain);
+		if (!dc && !iommu_domain)
+			continue;
+		if (!dc)
+			return -ENODEV;
+
+		/* Swap device context, update TC valid bit as the last operation */
+		xchg64(&dc->fsc, fsc);
+		xchg64(&dc->ta, ta);
+		xchg64(&dc->tc, tc);
+
+		/* Device context invalidation will be required. Ignoring for now. */
+	}
+
 	return 0;
 }
 
+static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
+					      struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+
+	/* Global pass-through already enabled, do nothing. */
+	if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
+		return 0;
+
+	return riscv_iommu_attach_domain(iommu, dev, iommu_domain);
+}
+
 static struct iommu_domain riscv_iommu_identity_domain = {
 	.type = IOMMU_DOMAIN_IDENTITY,
 	.ops = &(const struct iommu_domain_ops) {
@@ -82,6 +420,13 @@  static void riscv_iommu_probe_finalize(struct device *dev)
 	iommu_setup_dma_ops(dev, 0, U64_MAX);
 }
 
+static void riscv_iommu_release_device(struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+
+	riscv_iommu_attach_domain(iommu, dev, NULL);
+}
+
 static const struct iommu_ops riscv_iommu_ops = {
 	.owner = THIS_MODULE,
 	.of_xlate = riscv_iommu_of_xlate,
@@ -90,6 +435,7 @@  static const struct iommu_ops riscv_iommu_ops = {
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
 	.probe_finalize = riscv_iommu_probe_finalize,
+	.release_device = riscv_iommu_release_device,
 };
 
 static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
@@ -124,6 +470,7 @@  void riscv_iommu_remove(struct riscv_iommu_device *iommu)
 {
 	iommu_device_unregister(&iommu->iommu);
 	iommu_device_sysfs_remove(&iommu->iommu);
+	riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
 }
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu)
@@ -133,12 +480,14 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 	rc = riscv_iommu_init_check(iommu);
 	if (rc)
 		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
-	/*
-	 * Placeholder for a complete IOMMU device initialization.
-	 * For now, only bare minimum: enable global identity mapping mode and register sysfs.
-	 */
-	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
-			   FIELD_PREP(RISCV_IOMMU_DDTP_MODE, RISCV_IOMMU_DDTP_MODE_BARE));
+
+	rc = riscv_iommu_ddt_alloc(iommu);
+	if (WARN(rc, "cannot allocate device directory\n"))
+		goto err_init;
+
+	rc = riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
+	if (WARN(rc, "cannot enable iommu device\n"))
+		goto err_init;
 
 	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
 				    dev_name(iommu->dev));
@@ -154,5 +503,7 @@  int riscv_iommu_init(struct riscv_iommu_device *iommu)
 err_iommu:
 	iommu_device_sysfs_remove(&iommu->iommu);
 err_sysfs:
+	riscv_iommu_set_ddtp_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
+err_init:
 	return rc;
 }
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 700e33dc2446..f1696926582c 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -34,6 +34,11 @@  struct riscv_iommu_device {
 	/* available interrupt numbers, MSI or WSI */
 	unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
 	unsigned int irqs_count;
+
+	/* device directory */
+	unsigned int ddt_mode;
+	dma_addr_t ddt_phys;
+	u64 *ddt_root;
 };
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu);