
[v2,7/7] iommu/riscv: Paging domain support

Message ID 301244bc3ff5da484b46d3fecc931cdad7d2806f.1713456598.git.tjeznach@rivosinc.com (mailing list archive)
State Superseded
Series Linux RISC-V IOMMU Support

Checks

Context Check Description
conchuod/vmtest-for-next-PR success PR summary
conchuod/patch-7-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-7-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-7-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-7-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-7-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-7-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-7-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-7-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-7-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-7-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-7-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-7-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Tomasz Jeznach April 18, 2024, 4:32 p.m. UTC
Introduce first-stage address translation support.

The page table configured by the IOMMU driver uses the same format
as the CPU’s MMU, and falls back to identity translation if the
page table format configured for the MMU is not supported by the
IOMMU hardware.

This change introduces the IOTINVAL.VMA command, required to invalidate
any cached IOATC entries after a mapping is updated and/or removed from
the paging domain. Invalidations for non-leaf page table entries will
be added to the driver code in a separate patch series, following a
spec update clarifying the non-leaf cache invalidation command. With
this patch allowing only 4K mappings and keeping non-leaf page table
entries in memory, this should be a reasonable simplification.

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/Kconfig |   1 +
 drivers/iommu/riscv/iommu.c | 467 +++++++++++++++++++++++++++++++++++-
 2 files changed, 466 insertions(+), 2 deletions(-)

Comments

Jason Gunthorpe April 19, 2024, 12:56 p.m. UTC | #1
On Thu, Apr 18, 2024 at 09:32:25AM -0700, Tomasz Jeznach wrote:

> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index a4f74588cdc2..32ddc372432d 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -46,6 +46,10 @@ MODULE_LICENSE("GPL");
>  #define dev_to_iommu(dev) \
>  	container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
>  
> +/* IOMMU PSCID allocation namespace. */
> +static DEFINE_IDA(riscv_iommu_pscids);
> +#define RISCV_IOMMU_MAX_PSCID		BIT(20)
> +

You may consider putting this IDA in the riscv_iommu_device() and move
the pscid from the domain to the bond?

>  /* Device resource-managed allocations */
>  struct riscv_iommu_devres {
>  	unsigned long addr;
> @@ -752,12 +756,77 @@ static int riscv_iommu_ddt_alloc(struct riscv_iommu_device *iommu)
>  	return 0;
>  }
>  
> +struct riscv_iommu_bond {
> +	struct list_head list;
> +	struct rcu_head rcu;
> +	struct device *dev;
> +};
> +
> +/* This struct contains protection domain specific IOMMU driver data. */
> +struct riscv_iommu_domain {
> +	struct iommu_domain domain;
> +	struct list_head bonds;
> +	int pscid;
> +	int numa_node;
> +	int amo_enabled:1;
> +	unsigned int pgd_mode;
> +	/* paging domain */
> +	unsigned long pgd_root;
> +};

Glad to see there is no riscv_iommu_device pointer in the domain!

> +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> +				    unsigned long start, unsigned long end)
> +{
> +	struct riscv_iommu_bond *bond;
> +	struct riscv_iommu_device *iommu;
> +	struct riscv_iommu_command cmd;
> +	unsigned long len = end - start + 1;
> +	unsigned long iova;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(bond, &domain->bonds, list) {
> +		iommu = dev_to_iommu(bond->dev);

Pedantically this locking isn't locked right, there is technically
nothing that prevents bond->dev and the iommu instance struct from
being freed here. eg iommufd can hit races here if userspace can hot
unplug devices.

I suggest storing the iommu pointer itself in the bond instead of the
device then add a synchronize_rcu() to the iommu unregister path.
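
Roughly something like this (untested sketch; the unregister hook shown
is illustrative):

	struct riscv_iommu_bond {
		struct list_head list;
		struct rcu_head rcu;
		/* Hold the iommu instance instead of bond->dev. */
		struct riscv_iommu_device *iommu;
	};

	/* In the driver's unregister/remove path: */
	iommu_device_unregister(&iommu->iommu);
	/* Wait for concurrent riscv_iommu_iotlb_inval() walkers to finish. */
	synchronize_rcu();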

> +		riscv_iommu_cmd_inval_vma(&cmd);
> +		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> +		if (len > 0 && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> +			for (iova = start; iova < end; iova += PAGE_SIZE) {
> +				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> +				riscv_iommu_cmd_send(iommu, &cmd, 0);
> +			}
> +		} else {
> +			riscv_iommu_cmd_send(iommu, &cmd, 0);
> +		}
> +	}

This seems suboptimal, you probably want to copy the new design that
Intel is doing where you allocate "bonds" that are already
de-duplicated. Ie if I have 10 devices on the same iommu sharing the
domain the above will invalidate the PSCID 10 times. It should only be
done once.

ie add a "bond" for the (iommu,pscid) and refcount that based on how
many devices are used. Then another "bond" for the ATS stuff eventually.
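
Something along these lines (sketch only; field names are illustrative):

	/* One bond per (iommu, pscid) pair, shared by all devices on that iommu. */
	struct riscv_iommu_bond {
		struct list_head list;
		struct rcu_head rcu;
		struct riscv_iommu_device *iommu;
		refcount_t users;	/* attached devices using this pair */
	};

Attach would look up an existing bond for the device's iommu and just
take a reference; detach drops it and removes the bond when the count
hits zero. The invalidation loop then naturally visits each iommu once.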

> +
> +	list_for_each_entry_rcu(bond, &domain->bonds, list) {
> +		iommu = dev_to_iommu(bond->dev);
> +
> +		riscv_iommu_cmd_iofence(&cmd);
> +		riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
> +	}
> +	rcu_read_unlock();
> +}
> +

> @@ -787,12 +870,390 @@ static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
>  		xchg64(&dc->ta, ta);
>  		xchg64(&dc->tc, tc);
>  
> -		/* Device context invalidation will be required. Ignoring for now. */
> +		if (!(tc & RISCV_IOMMU_DC_TC_V))
> +			continue;

No negative caching in HW?

> +		/* Invalidate device context cache */
> +		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> +		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
> +		riscv_iommu_cmd_send(iommu, &cmd, 0);
> +
> +		if (FIELD_GET(RISCV_IOMMU_PC_FSC_MODE, fsc) == RISCV_IOMMU_DC_FSC_MODE_BARE)
> +			continue;
> +
> +		/* Invalidate last valid PSCID */
> +		riscv_iommu_cmd_inval_vma(&cmd);
> +		riscv_iommu_cmd_inval_set_pscid(&cmd, FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, ta));
> +		riscv_iommu_cmd_send(iommu, &cmd, 0);
> +	}
> +
> +	/* Synchronize directory update */
> +	riscv_iommu_cmd_iofence(&cmd);
> +	riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> +
> +	/* Track domain to devices mapping. */
> +	if (bond)
> +		list_add_rcu(&bond->list, &domain->bonds);

This is in the wrong order: the invalidation on the pscid needs to
start before the pscid is loaded into HW in the first place, otherwise
concurrent invalidations may miss HW updates.
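
I.e. roughly (sketch):

	/*
	 * Publish the bond before loading the PSCID into the hardware so
	 * a concurrent riscv_iommu_iotlb_inval() cannot miss it.
	 */
	if (bond)
		list_add_rcu(&bond->list, &domain->bonds);

	for (i = 0; i < fwspec->num_ids; i++) {
		/* ... program dc->fsc/ta/tc and send the invalidations ... */
	}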

> +
> +	/* Remove tracking from previous domain, if needed. */
> +	iommu_domain = iommu_get_domain_for_dev(dev);
> +	if (iommu_domain && !!(iommu_domain->type & __IOMMU_DOMAIN_PAGING)) {

No need for !!, && is already booleanizing

> +		domain = iommu_domain_to_riscv(iommu_domain);
> +		bond = NULL;
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(b, &domain->bonds, list) {
> +			if (b->dev == dev) {
> +				bond = b;
> +				break;
> +			}
> +		}
> +		rcu_read_unlock();
> +
> +		if (bond) {
> +			list_del_rcu(&bond->list);
> +			kfree_rcu(bond, rcu);
> +		}
> +	}
> +
> +	return 0;
> +}

> +static inline size_t get_page_size(size_t size)
> +{
> +	if (size >= IOMMU_PAGE_SIZE_512G)
> +		return IOMMU_PAGE_SIZE_512G;
> +	if (size >= IOMMU_PAGE_SIZE_1G)
> +		return IOMMU_PAGE_SIZE_1G;
> +	if (size >= IOMMU_PAGE_SIZE_2M)
> +		return IOMMU_PAGE_SIZE_2M;
> +	return IOMMU_PAGE_SIZE_4K;
> +}
> +
> +#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
> +#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
> +#define _io_pte_none(pte)	((pte) == 0)
> +#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
> +
> +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
> +				 unsigned long pte, struct list_head *freelist)
> +{
> +	unsigned long *ptr;
> +	int i;
> +
> +	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
> +		return;
> +
> +	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
> +
> +	/* Recursively free all sub page table pages */
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = READ_ONCE(ptr[i]);
> +		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
> +			riscv_iommu_pte_free(domain, pte, freelist);
> +	}
> +
> +	if (freelist)
> +		list_add_tail(&virt_to_page(ptr)->lru, freelist);
> +	else
> +		free_page((unsigned long)ptr);
> +}

Consider putting the page table handling in its own file?

> +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> +					    struct device *dev)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> +	struct page *page;
> +
> +	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> +		return -ENODEV;
> +
> +	domain->numa_node = dev_to_node(iommu->dev);
> +	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD);
> +
> +	if (!domain->pgd_root) {
> +		page = alloc_pages_node(domain->numa_node,
> +					GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0);
> +		if (!page)
> +			return -ENOMEM;
> +		domain->pgd_root = (unsigned long)page_to_virt(page);

The pgd_root should be allocated by the alloc_paging function, not
during attach. There is no locking here that will protect against
concurrent attach and also map before attach should work.

You can pick up the numa affinity from the alloc paging dev pointer
(note it may be null still in some cases)
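
E.g. (untested sketch, reusing helpers already in the patch; error
handling and the rest of the setup abbreviated):

	static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
	{
		struct riscv_iommu_domain *domain;
		struct page *page;

		domain = kzalloc(sizeof(*domain), GFP_KERNEL);
		if (!domain)
			return ERR_PTR(-ENOMEM);

		/* dev may be NULL here; fall back to no node affinity. */
		domain->numa_node = dev ? dev_to_node(dev_to_iommu(dev)->dev) :
					  NUMA_NO_NODE;

		page = alloc_pages_node(domain->numa_node,
					GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0);
		if (!page) {
			kfree(domain);
			return ERR_PTR(-ENOMEM);
		}
		domain->pgd_root = (unsigned long)page_to_virt(page);

		/* ... PSCID allocation and domain->domain setup as in the patch ... */
		return &domain->domain;
	}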

Jason
Baolu Lu April 22, 2024, 5:21 a.m. UTC | #2
On 4/19/24 12:32 AM, Tomasz Jeznach wrote:
> Introduce first-stage address translation support.
> 
> Page table configured by the IOMMU driver will use the same format
> as the CPU’s MMU, and will fallback to identity translation if the
> page table format configured for the MMU is not supported by the
> IOMMU hardware.
> 
> This change introduces IOTINVAL.VMA command, required to invalidate
> any cached IOATC entries after mapping is updated and/or removed from
> the paging domain. Invalidations for the non-leaf page entries will
> be added to the driver code in separate patch series, following spec
> update to clarify non-leaf cache invalidation command. With this patch,
> allowing only 4K mappings and keeping non-leaf page entries in memory
> this should be a reasonable simplification.
> 
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> ---
>   drivers/iommu/riscv/Kconfig |   1 +
>   drivers/iommu/riscv/iommu.c | 467 +++++++++++++++++++++++++++++++++++-
>   2 files changed, 466 insertions(+), 2 deletions(-)
> 

[...]

> +
>   static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
>   				     struct device *dev,
>   				     struct iommu_domain *iommu_domain)
>   {
>   	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_domain *domain;
>   	struct riscv_iommu_dc *dc;
> +	struct riscv_iommu_bond *bond = NULL, *b;
> +	struct riscv_iommu_command cmd;
>   	u64 fsc, ta, tc;
>   	int i;
>   
> @@ -769,6 +838,20 @@ static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
>   		ta = 0;
>   		tc = RISCV_IOMMU_DC_TC_V;
>   		fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
> +	} else if (iommu_domain->type & __IOMMU_DOMAIN_PAGING) {
> +		domain = iommu_domain_to_riscv(iommu_domain);
> +
> +		ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid);
> +		tc = RISCV_IOMMU_DC_TC_V;
> +		if (domain->amo_enabled)
> +			tc |= RISCV_IOMMU_DC_TC_SADE;
> +		fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
> +		      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
> +
> +		bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> +		if (!bond)
> +			return -ENOMEM;
> +		bond->dev = dev;
>   	} else {
>   		/* This should never happen. */
>   		return -ENODEV;
> @@ -787,12 +870,390 @@ static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
>   		xchg64(&dc->ta, ta);
>   		xchg64(&dc->tc, tc);
>   
> -		/* Device context invalidation will be required. Ignoring for now. */
> +		if (!(tc & RISCV_IOMMU_DC_TC_V))
> +			continue;
> +
> +		/* Invalidate device context cache */
> +		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> +		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
> +		riscv_iommu_cmd_send(iommu, &cmd, 0);
> +
> +		if (FIELD_GET(RISCV_IOMMU_PC_FSC_MODE, fsc) == RISCV_IOMMU_DC_FSC_MODE_BARE)
> +			continue;
> +
> +		/* Invalidate last valid PSCID */
> +		riscv_iommu_cmd_inval_vma(&cmd);
> +		riscv_iommu_cmd_inval_set_pscid(&cmd, FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, ta));
> +		riscv_iommu_cmd_send(iommu, &cmd, 0);
> +	}
> +
> +	/* Synchronize directory update */
> +	riscv_iommu_cmd_iofence(&cmd);
> +	riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> +
> +	/* Track domain to devices mapping. */
> +	if (bond)
> +		list_add_rcu(&bond->list, &domain->bonds);
> +
> +	/* Remove tracking from previous domain, if needed. */
> +	iommu_domain = iommu_get_domain_for_dev(dev);

Calling iommu_get_domain_for_dev() in the domain attaching path is very
fragile because it heavily depends on the order of calling the attach
callback and setting the domain pointer in the core.

Perhaps the driver can use dev_iommu_priv_set/get() to keep the active
domain in the per-device private data?
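
For example (sketch; the riscv_iommu_info struct is illustrative):

	struct riscv_iommu_info {
		struct riscv_iommu_domain *domain;	/* currently attached domain */
	};

	/* In probe_device(): */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);
	dev_iommu_priv_set(dev, info);

	/* In attach_dev(): */
	info = dev_iommu_priv_get(dev);
	/* release the bond on info->domain instead of iommu_get_domain_for_dev() */
	info->domain = domain;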

> +	if (iommu_domain && !!(iommu_domain->type & __IOMMU_DOMAIN_PAGING)) {
> +		domain = iommu_domain_to_riscv(iommu_domain);
> +		bond = NULL;
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(b, &domain->bonds, list) {
> +			if (b->dev == dev) {
> +				bond = b;
> +				break;
> +			}
> +		}
> +		rcu_read_unlock();
> +
> +		if (bond) {
> +			list_del_rcu(&bond->list);
> +			kfree_rcu(bond, rcu);
> +		}
> +	}
> +
> +	return 0;
> +}

Best regards,
baolu
Baolu Lu April 22, 2024, 7:40 a.m. UTC | #3
On 2024/4/19 20:56, Jason Gunthorpe wrote:
>> +		riscv_iommu_cmd_inval_vma(&cmd);
>> +		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
>> +		if (len > 0 && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
>> +			for (iova = start; iova < end; iova += PAGE_SIZE) {
>> +				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
>> +				riscv_iommu_cmd_send(iommu, &cmd, 0);
>> +			}
>> +		} else {
>> +			riscv_iommu_cmd_send(iommu, &cmd, 0);
>> +		}
>> +	}
> This seems suboptimal, you probably want to copy the new design that
> Intel is doing where you allocate "bonds" that are already
> de-duplicated. Ie if I have 10 devices on the same iommu sharing the
> domain the above will invalidate the PSCID 10 times. It should only be
> done once.
> 
> ie add a "bond" for the (iommu,pscid) and refcount that based on how
> many devices are used. Then another "bond" for the ATS stuff eventually.

The latest version is under discussion here.

https://lore.kernel.org/linux-iommu/20240416080656.60968-1-baolu.lu@linux.intel.com/

Supposedly, you can make such an optimization after the base code has
landed in the mainline tree, if the change is big.

Best regards,
baolu
Jason Gunthorpe April 22, 2024, 7:30 p.m. UTC | #4
On Mon, Apr 22, 2024 at 01:21:05PM +0800, Baolu Lu wrote:
> > +	/* Track domain to devices mapping. */
> > +	if (bond)
> > +		list_add_rcu(&bond->list, &domain->bonds);
> > +
> > +	/* Remove tracking from previous domain, if needed. */
> > +	iommu_domain = iommu_get_domain_for_dev(dev);
> 
> Calling iommu_get_domain_for_dev() in the domain attaching path is very
> fragile because it heavily depends on the order of calling the attach
> callback and setting the domain pointer in the core.

We have a couple of places doing this already; the core code
accommodates it well enough for deleting from a list. So I think it is
OK to keep doing it.

> Perhaps the driver can use dev_iommu_priv_set/get() to keep the active
> domain in the per-device private data?
> 
> > +	if (iommu_domain && !!(iommu_domain->type & __IOMMU_DOMAIN_PAGING)) {
> > +		domain = iommu_domain_to_riscv(iommu_domain);
> > +		bond = NULL;
> > +		rcu_read_lock();
> > +		list_for_each_entry_rcu(b, &domain->bonds, list) {
> > +			if (b->dev == dev) {
> > +				bond = b;
> > +				break;
> > +			}
> > +		}
> > +		rcu_read_unlock();

But now that I look again, this is not safe, you have to hold some
kind of per-domain lock to mutate the list. rcu_*read*_lock() cannot
be used for write.
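
Something like (sketch; lock placement illustrative):

	/* struct riscv_iommu_domain gains:  spinlock_t lock; */

	spin_lock(&domain->lock);
	list_for_each_entry(b, &domain->bonds, list) {
		if (b->dev == dev) {
			list_del_rcu(&b->list);
			kfree_rcu(b, rcu);
			break;
		}
	}
	spin_unlock(&domain->lock);

Readers keep using rcu_read_lock()/list_for_each_entry_rcu() as before;
only the writers need to serialize on the lock.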

Jason
Andrew Jones April 23, 2024, 5 p.m. UTC | #5
On Thu, Apr 18, 2024 at 09:32:25AM -0700, Tomasz Jeznach wrote:
...
> +static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> +				      unsigned long iova, size_t pgsize, size_t pgcount,
> +				      struct iommu_iotlb_gather *gather)
> +{
> +	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> +	size_t size = pgcount << __ffs(pgsize);
> +	unsigned long *ptr, old;
> +	size_t unmapped = 0;
> +	size_t pte_size;
> +
> +	while (unmapped < size) {
> +		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
> +		if (!ptr)
> +			return unmapped;
> +
> +		/* partial unmap is not allowed, fail. */
> +		if (iova & ~(pte_size - 1))
                           ^ Shouldn't this ~ be removed?

> +			return unmapped;
> +
> +		old = READ_ONCE(*ptr);
> +		if (cmpxchg_relaxed(ptr, old, 0) != old)
> +			continue;
> +
> +		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
> +					    pte_size);
> +
> +		iova += pte_size;
> +		unmapped += pte_size;
> +	}
> +
> +	return unmapped;
> +}
> +
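
(As written, the ~ keeps the bits above the in-page offset, so the test
fires for almost any iova; the alignment check presumably intended is:)

		/* partial unmap is not allowed, fail. */
		if (iova & (pte_size - 1))
			return unmapped;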

Thanks,
drew
Tomasz Jeznach April 24, 2024, 11:30 p.m. UTC | #6
On Fri, Apr 19, 2024 at 5:56 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Apr 18, 2024 at 09:32:25AM -0700, Tomasz Jeznach wrote:
>
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index a4f74588cdc2..32ddc372432d 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -46,6 +46,10 @@ MODULE_LICENSE("GPL");
> >  #define dev_to_iommu(dev) \
> >       container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
> >
> > +/* IOMMU PSCID allocation namespace. */
> > +static DEFINE_IDA(riscv_iommu_pscids);
> > +#define RISCV_IOMMU_MAX_PSCID                BIT(20)
> > +
>
> You may consider putting this IDA in the riscv_iommu_device() and move
> the pscid from the domain to the bond?
>

I've been considering containing the IDA inside riscv_iommu_device at
some point, but it made PSCID management more complicated.  In the
follow-up patches it is desired for the PSCID to be unique across all
IOMMUs in the system (within the guest's GSCID), as the protection
domains might (and will) be shared between more than a single IOMMU
device.

> >  /* Device resource-managed allocations */
> >  struct riscv_iommu_devres {
> >       unsigned long addr;
> > @@ -752,12 +756,77 @@ static int riscv_iommu_ddt_alloc(struct riscv_iommu_device *iommu)
> >       return 0;
> >  }
> >
> > +struct riscv_iommu_bond {
> > +     struct list_head list;
> > +     struct rcu_head rcu;
> > +     struct device *dev;
> > +};
> > +
> > +/* This struct contains protection domain specific IOMMU driver data. */
> > +struct riscv_iommu_domain {
> > +     struct iommu_domain domain;
> > +     struct list_head bonds;
> > +     int pscid;
> > +     int numa_node;
> > +     int amo_enabled:1;
> > +     unsigned int pgd_mode;
> > +     /* paging domain */
> > +     unsigned long pgd_root;
> > +};
>
> Glad to see there is no riscv_iommu_device pointer in the domain!
>
> > +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> > +                                 unsigned long start, unsigned long end)
> > +{
> > +     struct riscv_iommu_bond *bond;
> > +     struct riscv_iommu_device *iommu;
> > +     struct riscv_iommu_command cmd;
> > +     unsigned long len = end - start + 1;
> > +     unsigned long iova;
> > +
> > +     rcu_read_lock();
> > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > +             iommu = dev_to_iommu(bond->dev);
>
> Pedantically this locking isn't locked right, there is technically
> nothing that prevents bond->dev and the iommu instance struct from
> being freed here. eg iommufd can hit races here if userspace can hot
> unplug devices.
>
> I suggest storing the iommu pointer itself in the bond instead of the
> device then add a synchronize_rcu() to the iommu unregister path.
>

Very good point. Thanks for pointing this out.
Reworked to add locking around list modifications (and to no longer
incorrectly rely on the iommu group mutex locks).

> > +             riscv_iommu_cmd_inval_vma(&cmd);
> > +             riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > +             if (len > 0 && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> > +                     for (iova = start; iova < end; iova += PAGE_SIZE) {
> > +                             riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > +                             riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +                     }
> > +             } else {
> > +                     riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +             }
> > +     }
>
> This seems suboptimal, you probably want to copy the new design that
> Intel is doing where you allocate "bonds" that are already
> de-duplicated. Ie if I have 10 devices on the same iommu sharing the
> domain the above will invalidate the PSCID 10 times. It should only be
> done once.
>
> ie add a "bond" for the (iommu,pscid) and refcount that based on how
> many devices are used. Then another "bond" for the ATS stuff eventually.
>

Agreed, it's not perfect to send duplicate invalidations.
This should improve with follow-up patchsets introducing SVA
(reusing the same, extended bond structure) and an update to send
IOTLB range invalidations.

For this change I've decided to go with as simple an implementation
as possible and over-invalidate for domains with multiple devices
attached. Hope this makes sense.

> > +
> > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > +             iommu = dev_to_iommu(bond->dev);
> > +
> > +             riscv_iommu_cmd_iofence(&cmd);
> > +             riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
> > +     }
> > +     rcu_read_unlock();
> > +}
> > +
>
> > @@ -787,12 +870,390 @@ static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
> >               xchg64(&dc->ta, ta);
> >               xchg64(&dc->tc, tc);
> >
> > -             /* Device context invalidation will be required. Ignoring for now. */
> > +             if (!(tc & RISCV_IOMMU_DC_TC_V))
> > +                     continue;
>
> No negative caching in HW?
>
No. Disallowed by the spec.

> > +             /* Invalidate device context cache */
> > +             riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> > +             riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
> > +             riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +
> > +             if (FIELD_GET(RISCV_IOMMU_PC_FSC_MODE, fsc) == RISCV_IOMMU_DC_FSC_MODE_BARE)
> > +                     continue;
> > +
> > +             /* Invalidate last valid PSCID */
> > +             riscv_iommu_cmd_inval_vma(&cmd);
> > +             riscv_iommu_cmd_inval_set_pscid(&cmd, FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, ta));
> > +             riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +     }
> > +
> > +     /* Synchronize directory update */
> > +     riscv_iommu_cmd_iofence(&cmd);
> > +     riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> > +
> > +     /* Track domain to devices mapping. */
> > +     if (bond)
> > +             list_add_rcu(&bond->list, &domain->bonds);
>
> This is in the wrong order, the invalidation on the pscid needs to
> start before the pscid is loaded into HW in the first place otherwise
> concurrent invalidations may miss HW updates.
>
> > +
> > +     /* Remove tracking from previous domain, if needed. */
> > +     iommu_domain = iommu_get_domain_for_dev(dev);
> > +     if (iommu_domain && !!(iommu_domain->type & __IOMMU_DOMAIN_PAGING)) {
>
> No need for !!, && is already booleanizing
>
> > +             domain = iommu_domain_to_riscv(iommu_domain);
> > +             bond = NULL;
> > +             rcu_read_lock();
> > +             list_for_each_entry_rcu(b, &domain->bonds, list) {
> > +                     if (b->dev == dev) {
> > +                             bond = b;
> > +                             break;
> > +                     }
> > +             }
> > +             rcu_read_unlock();
> > +
> > +             if (bond) {
> > +                     list_del_rcu(&bond->list);
> > +                     kfree_rcu(bond, rcu);
> > +             }
> > +     }
> > +
> > +     return 0;
> > +}
>
> > +static inline size_t get_page_size(size_t size)
> > +{
> > +     if (size >= IOMMU_PAGE_SIZE_512G)
> > +             return IOMMU_PAGE_SIZE_512G;
> > +     if (size >= IOMMU_PAGE_SIZE_1G)
> > +             return IOMMU_PAGE_SIZE_1G;
> > +     if (size >= IOMMU_PAGE_SIZE_2M)
> > +             return IOMMU_PAGE_SIZE_2M;
> > +     return IOMMU_PAGE_SIZE_4K;
> > +}
> > +
> > +#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
> > +#define _io_pte_leaf(pte)    ((pte) & _PAGE_LEAF)
> > +#define _io_pte_none(pte)    ((pte) == 0)
> > +#define _io_pte_entry(pn, prot)      ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
> > +
> > +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
> > +                              unsigned long pte, struct list_head *freelist)
> > +{
> > +     unsigned long *ptr;
> > +     int i;
> > +
> > +     if (!_io_pte_present(pte) || _io_pte_leaf(pte))
> > +             return;
> > +
> > +     ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
> > +
> > +     /* Recursively free all sub page table pages */
> > +     for (i = 0; i < PTRS_PER_PTE; i++) {
> > +             pte = READ_ONCE(ptr[i]);
> > +             if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
> > +                     riscv_iommu_pte_free(domain, pte, freelist);
> > +     }
> > +
> > +     if (freelist)
> > +             list_add_tail(&virt_to_page(ptr)->lru, freelist);
> > +     else
> > +             free_page((unsigned long)ptr);
> > +}
>
> Consider putting the page table handling in its own file?
>

It was in a separate file at some point, but merged into iommu.c, as
it's simple enough at only ~300 lines. Probably not worth separating
this out.

> > +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> > +                                         struct device *dev)
> > +{
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +     struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
> > +     struct page *page;
> > +
> > +     if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> > +             return -ENODEV;
> > +
> > +     domain->numa_node = dev_to_node(iommu->dev);
> > +     domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD);
> > +
> > +     if (!domain->pgd_root) {
> > +             page = alloc_pages_node(domain->numa_node,
> > +                                     GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0);
> > +             if (!page)
> > +                     return -ENOMEM;
> > +             domain->pgd_root = (unsigned long)page_to_virt(page);
>
> The pgd_root should be allocated by the alloc_paging function, not
> during attach. There is no locking here that will protect against
> concurrent attach and also map before attach should work.
>
> You can pick up the numa affinity from the alloc paging dev pointer
> (note it may be null still in some cases)
>

Good point. Thanks. Will send update shortly with v3.

> Jason

Ack to all other comments, thank you!
Best,
- Tomasz
Jason Gunthorpe April 24, 2024, 11:39 p.m. UTC | #7
On Wed, Apr 24, 2024 at 04:30:45PM -0700, Tomasz Jeznach wrote:
> > > @@ -46,6 +46,10 @@ MODULE_LICENSE("GPL");
> > >  #define dev_to_iommu(dev) \
> > >       container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
> > >
> > > +/* IOMMU PSCID allocation namespace. */
> > > +static DEFINE_IDA(riscv_iommu_pscids);
> > > +#define RISCV_IOMMU_MAX_PSCID                BIT(20)
> > > +
> >
> > You may consider putting this IDA in the riscv_iommu_device() and move
> > the pscid from the domain to the bond?
> >
> 
> I've been considering containing IDA inside riscv_iommu_device at some
> point,  but it made PCSID management more complicated.  In the follow
> up patches it is desired for PSCID to be unique across all IOMMUs in
> the system (within guest's GSCID), as the protection domains might
> (and will) be shared between more than single IOMMU device.

The PSCID isn't scoped under the GSCID? That doesn't sound very good,
it means VMs can't directly issue invalidations with their local view
of the PSCID space?

> > This seems suboptimal, you probably want to copy the new design that
> > Intel is doing where you allocate "bonds" that are already
> > de-duplicated. Ie if I have 10 devices on the same iommu sharing the
> > domain the above will invalidate the PSCID 10 times. It should only be
> > done once.
> >
> > ie add a "bond" for the (iommu,pscid) and refcount that based on how
> > many devices are used. Then another "bond" for the ATS stuff eventually.
> >
> 
> Agree, not perfect to send duplicate invalidations.
> This should improve with follow up patchsets introducing of SVA
> (reusing the same, extended bond structure) and update to send IOTLB
> range invalidations.
> 
> For this change I've decided to go with as simple as possible
> implementation and over-invalidate for domains with multiple devices
> attached. Hope this makes sense.

It is fine as long as you do fix it..

Jason
Tomasz Jeznach April 24, 2024, 11:54 p.m. UTC | #8
On Wed, Apr 24, 2024 at 4:39 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Wed, Apr 24, 2024 at 04:30:45PM -0700, Tomasz Jeznach wrote:
> > > > @@ -46,6 +46,10 @@ MODULE_LICENSE("GPL");
> > > >  #define dev_to_iommu(dev) \
> > > >       container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
> > > >
> > > > +/* IOMMU PSCID allocation namespace. */
> > > > +static DEFINE_IDA(riscv_iommu_pscids);
> > > > +#define RISCV_IOMMU_MAX_PSCID                BIT(20)
> > > > +
> > >
> > > You may consider putting this IDA in the riscv_iommu_device() and move
> > > the pscid from the domain to the bond?
> > >
> >
> > I've been considering containing IDA inside riscv_iommu_device at some
> > point,  but it made PCSID management more complicated.  In the follow
> > up patches it is desired for PSCID to be unique across all IOMMUs in
> > the system (within guest's GSCID), as the protection domains might
> > (and will) be shared between more than single IOMMU device.
>
> The PCSID isn't scoped under the GSCID? That doesn't sound very good,
> it means VM's can't direct issue invalidation with their local view of
> the PCSID space?
>

To clarify: the PSCID namespace is per GSCID.
However, there might be more than one IOMMU in a single system sharing
the same GSCID, with e.g. SVA domains attached to more than one
IOMMU. It was simpler to manage PSCIDs globally.

PSCID management for a VM-assigned GSCID will be the VM's responsibility.

> > > This seems suboptimal, you probably want to copy the new design that
> > > Intel is doing where you allocate "bonds" that are already
> > > de-duplicated. Ie if I have 10 devices on the same iommu sharing the
> > > domain the above will invalidate the PSCID 10 times. It should only be
> > > done once.
> > >
> > > ie add a "bond" for the (iommu,pscid) and refcount that based on how
> > > many devices are used. Then another "bond" for the ATS stuff eventually.
> > >
> >
> > Agree, not perfect to send duplicate invalidations.
> > This should improve with follow up patchsets introducing of SVA
> > (reusing the same, extended bond structure) and update to send IOTLB
> > range invalidations.
> >
> > For this change I've decided to go with as simple as possible
> > implementation and over-invalidate for domains with multiple devices
> > attached. Hope this makes sense.
>
> It is fine as long as you do fix it..
>

Sounds good. I'll take a second look to see if it can be fixed sooner.

> Jason

Best,
- Tomasz
Jason Gunthorpe April 25, 2024, 12:48 a.m. UTC | #9
On Wed, Apr 24, 2024 at 04:54:01PM -0700, Tomasz Jeznach wrote:
> On Wed, Apr 24, 2024 at 4:39 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Wed, Apr 24, 2024 at 04:30:45PM -0700, Tomasz Jeznach wrote:
> > > > > @@ -46,6 +46,10 @@ MODULE_LICENSE("GPL");
> > > > >  #define dev_to_iommu(dev) \
> > > > >       container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
> > > > >
> > > > > +/* IOMMU PSCID allocation namespace. */
> > > > > +static DEFINE_IDA(riscv_iommu_pscids);
> > > > > +#define RISCV_IOMMU_MAX_PSCID                BIT(20)
> > > > > +
> > > >
> > > > You may consider putting this IDA in the riscv_iommu_device() and move
> > > > the pscid from the domain to the bond?
> > > >
> > >
> > > I've been considering containing IDA inside riscv_iommu_device at some
> > > point,  but it made PCSID management more complicated.  In the follow
> > > up patches it is desired for PSCID to be unique across all IOMMUs in
> > > the system (within guest's GSCID), as the protection domains might
> > > (and will) be shared between more than single IOMMU device.
> >
> > The PCSID isn't scoped under the GSCID? That doesn't sound very good,
> > it means VM's can't direct issue invalidation with their local view of
> > the PCSID space?
> >
> 
> To clarify: PSCID namespace is per GSCID.
> However there might be more than one IOMMU in a single system sharing
> the same GSCID

I assume this is because GSCID ends up shared with kvm?

> and with e.g. SVA domains attached to more than one
> IOMMU. It was simpler to manage PCSID globally.

If the PSCID is moved into the invalidation list like Intel structured
it then it doesn't matter for SVA, or really anything.

AFAIK the only reason to do otherwise is if you have a reason to share
the ID with the CPU/MM and the IOMMU probably to coordinate
invalidations. But if you do this then you really just always want to
use the MM's global ID space in the first place...

So I'm not sure :)

Jason

Patch

diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig
index 711326992585..6f9fb396034a 100644
--- a/drivers/iommu/riscv/Kconfig
+++ b/drivers/iommu/riscv/Kconfig
@@ -7,6 +7,7 @@  config RISCV_IOMMU
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
+	select IOMMU_DMA
 	help
 	  Support for implementations of the RISC-V IOMMU architecture that
 	  complements the RISC-V MMU capabilities, providing similar address
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index a4f74588cdc2..32ddc372432d 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -46,6 +46,10 @@  MODULE_LICENSE("GPL");
 #define dev_to_iommu(dev) \
 	container_of((dev)->iommu->iommu_dev, struct riscv_iommu_device, iommu)
 
+/* IOMMU PSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_pscids);
+#define RISCV_IOMMU_MAX_PSCID		BIT(20)
+
 /* Device resource-managed allocations */
 struct riscv_iommu_devres {
 	unsigned long addr;
@@ -752,12 +756,77 @@  static int riscv_iommu_ddt_alloc(struct riscv_iommu_device *iommu)
 	return 0;
 }
 
+struct riscv_iommu_bond {
+	struct list_head list;
+	struct rcu_head rcu;
+	struct device *dev;
+};
+
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+	struct iommu_domain domain;
+	struct list_head bonds;
+	int pscid;
+	int numa_node;
+	int amo_enabled:1;
+	unsigned int pgd_mode;
+	/* paging domain */
+	unsigned long pgd_root;
+};
+
+#define iommu_domain_to_riscv(iommu_domain) \
+	container_of(iommu_domain, struct riscv_iommu_domain, domain)
+
+/*
+ * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
+ * This limit will be replaced with range invalidations, if supported by
+ * the hardware, when RISC-V IOMMU architecture specification update for
+ * range invalidations update will be available.
+ */
+#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
+
+static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
+				    unsigned long start, unsigned long end)
+{
+	struct riscv_iommu_bond *bond;
+	struct riscv_iommu_device *iommu;
+	struct riscv_iommu_command cmd;
+	unsigned long len = end - start + 1;
+	unsigned long iova;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		iommu = dev_to_iommu(bond->dev);
+		riscv_iommu_cmd_inval_vma(&cmd);
+		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+		if (len > 0 && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+			for (iova = start; iova < end; iova += PAGE_SIZE) {
+				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+				riscv_iommu_cmd_send(iommu, &cmd, 0);
+			}
+		} else {
+			riscv_iommu_cmd_send(iommu, &cmd, 0);
+		}
+	}
+
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		iommu = dev_to_iommu(bond->dev);
+
+		riscv_iommu_cmd_iofence(&cmd);
+		riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
+	}
+	rcu_read_unlock();
+}
+
 static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
 				     struct device *dev,
 				     struct iommu_domain *iommu_domain)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct riscv_iommu_domain *domain;
 	struct riscv_iommu_dc *dc;
+	struct riscv_iommu_bond *bond = NULL, *b;
+	struct riscv_iommu_command cmd;
 	u64 fsc, ta, tc;
 	int i;
 
@@ -769,6 +838,20 @@  static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
 		ta = 0;
 		tc = RISCV_IOMMU_DC_TC_V;
 		fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE, RISCV_IOMMU_DC_FSC_MODE_BARE);
+	} else if (iommu_domain->type & __IOMMU_DOMAIN_PAGING) {
+		domain = iommu_domain_to_riscv(iommu_domain);
+
+		ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid);
+		tc = RISCV_IOMMU_DC_TC_V;
+		if (domain->amo_enabled)
+			tc |= RISCV_IOMMU_DC_TC_SADE;
+		fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+		      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+
+		bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+		if (!bond)
+			return -ENOMEM;
+		bond->dev = dev;
 	} else {
 		/* This should never happen. */
 		return -ENODEV;
@@ -787,12 +870,390 @@  static int riscv_iommu_attach_domain(struct riscv_iommu_device *iommu,
 		xchg64(&dc->ta, ta);
 		xchg64(&dc->tc, tc);
 
-		/* Device context invalidation will be required. Ignoring for now. */
+		if (!(tc & RISCV_IOMMU_DC_TC_V))
+			continue;
+
+		/* Invalidate device context cache */
+		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+		riscv_iommu_cmd_send(iommu, &cmd, 0);
+
+		if (FIELD_GET(RISCV_IOMMU_PC_FSC_MODE, fsc) == RISCV_IOMMU_DC_FSC_MODE_BARE)
+			continue;
+
+		/* Invalidate last valid PSCID */
+		riscv_iommu_cmd_inval_vma(&cmd);
+		riscv_iommu_cmd_inval_set_pscid(&cmd, FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, ta));
+		riscv_iommu_cmd_send(iommu, &cmd, 0);
+	}
+
+	/* Synchronize directory update */
+	riscv_iommu_cmd_iofence(&cmd);
+	riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+	/* Track domain to devices mapping. */
+	if (bond)
+		list_add_rcu(&bond->list, &domain->bonds);
+
+	/* Remove tracking from previous domain, if needed. */
+	iommu_domain = iommu_get_domain_for_dev(dev);
+	if (iommu_domain && !!(iommu_domain->type & __IOMMU_DOMAIN_PAGING)) {
+		domain = iommu_domain_to_riscv(iommu_domain);
+		bond = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(b, &domain->bonds, list) {
+			if (b->dev == dev) {
+				bond = b;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (bond) {
+			list_del_rcu(&bond->list);
+			kfree_rcu(bond, rcu);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * IOVA page translation tree management.
+ */
+
+#define IOMMU_PAGE_SIZE_4K     BIT_ULL(12)
+#define IOMMU_PAGE_SIZE_2M     BIT_ULL(21)
+#define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
+#define IOMMU_PAGE_SIZE_512G   BIT_ULL(39)
+
+#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
+
+static void riscv_iommu_flush_iotlb_all(struct iommu_domain *iommu_domain)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+}
+
+static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
+				   struct iommu_iotlb_gather *gather)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
+}
+
+static inline size_t get_page_size(size_t size)
+{
+	if (size >= IOMMU_PAGE_SIZE_512G)
+		return IOMMU_PAGE_SIZE_512G;
+	if (size >= IOMMU_PAGE_SIZE_1G)
+		return IOMMU_PAGE_SIZE_1G;
+	if (size >= IOMMU_PAGE_SIZE_2M)
+		return IOMMU_PAGE_SIZE_2M;
+	return IOMMU_PAGE_SIZE_4K;
+}
+
+#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
+#define _io_pte_none(pte)	((pte) == 0)
+#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+
+static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
+				 unsigned long pte, struct list_head *freelist)
+{
+	unsigned long *ptr;
+	int i;
+
+	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
+		return;
+
+	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+
+	/* Recursively free all sub page table pages */
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = READ_ONCE(ptr[i]);
+		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
+			riscv_iommu_pte_free(domain, pte, freelist);
+	}
+
+	if (freelist)
+		list_add_tail(&virt_to_page(ptr)->lru, freelist);
+	else
+		free_page((unsigned long)ptr);
+}
+
+static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+					    unsigned long iova, size_t pgsize, gfp_t gfp)
+{
+	unsigned long *ptr = (unsigned long *)domain->pgd_root;
+	unsigned long pte, old;
+	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+	struct page *page;
+
+	do {
+		const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+		/*
+		 * Note: returned entry might be a non-leaf if there was existing mapping
+		 * with smaller granularity. Up to the caller to replace and invalidate.
+		 */
+		if (((size_t)1 << shift) == pgsize)
+			return ptr;
+pte_retry:
+		pte = READ_ONCE(*ptr);
+		/*
+		 * This is very likely incorrect as we should not be adding new mapping
+		 * with smaller granularity on top of existing 2M/1G mapping. Fail.
+		 */
+		if (_io_pte_present(pte) && _io_pte_leaf(pte))
+			return NULL;
+		/*
+		 * Non-leaf entry is missing, allocate and try to add to the page table.
+		 * This might race with other mappings, retry on error.
+		 */
+		if (_io_pte_none(pte)) {
+			page = alloc_pages_node(domain->numa_node, __GFP_ZERO | gfp, 0);
+			if (!page)
+				return NULL;
+			old = pte;
+			pte = _io_pte_entry(page_to_pfn(page), _PAGE_TABLE);
+			if (cmpxchg_relaxed(ptr, old, pte) != old) {
+				__free_pages(page, 0);
+				goto pte_retry;
+			}
+		}
+		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+	} while (level-- > 0);
+
+	return NULL;
+}
+
+static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+					    unsigned long iova, size_t *pte_pgsize)
+{
+	unsigned long *ptr = (unsigned long *)domain->pgd_root;
+	unsigned long pte;
+	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+
+	do {
+		const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+		pte = READ_ONCE(*ptr);
+		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
+			*pte_pgsize = (size_t)1 << shift;
+			return ptr;
+		}
+		if (_io_pte_none(pte))
+			return NULL;
+		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+	} while (level-- > 0);
+
+	return NULL;
+}
+
+static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
+				 unsigned long iova, phys_addr_t phys,
+				 size_t pgsize, size_t pgcount, int prot,
+				 gfp_t gfp, size_t *mapped)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	size_t size = 0;
+	size_t page_size = get_page_size(pgsize);
+	unsigned long *ptr;
+	unsigned long pte, old, pte_prot;
+
+	if (!(prot & IOMMU_WRITE))
+		pte_prot = _PAGE_BASE | _PAGE_READ;
+	else if (domain->amo_enabled)
+		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
+	else
+		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
+
+	while (pgcount) {
+		ptr = riscv_iommu_pte_alloc(domain, iova, page_size, gfp);
+		if (!ptr) {
+			*mapped = size;
+			return -ENOMEM;
+		}
+
+		old = READ_ONCE(*ptr);
+		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
+		if (cmpxchg_relaxed(ptr, old, pte) != old)
+			continue;
+
+		/* TODO: non-leaf page invalidation is pending spec update */
+		riscv_iommu_pte_free(domain, old, NULL);
+
+		size += page_size;
+		iova += page_size;
+		phys += page_size;
+		--pgcount;
 	}
 
+	*mapped = size;
+
 	return 0;
 }
 
+static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
+				      unsigned long iova, size_t pgsize, size_t pgcount,
+				      struct iommu_iotlb_gather *gather)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	size_t size = pgcount << __ffs(pgsize);
+	unsigned long *ptr, old;
+	size_t unmapped = 0;
+	size_t pte_size;
+
+	while (unmapped < size) {
+		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+		if (!ptr)
+			return unmapped;
+
+		/* partial unmap is not allowed, fail. */
+		if (iova & ~(pte_size - 1))
+			return unmapped;
+
+		old = READ_ONCE(*ptr);
+		if (cmpxchg_relaxed(ptr, old, 0) != old)
+			continue;
+
+		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+					    pte_size);
+
+		iova += pte_size;
+		unmapped += pte_size;
+	}
+
+	return unmapped;
+}
+
+static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, dma_addr_t iova)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	unsigned long pte_size;
+	unsigned long *ptr;
+
+	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+	if (_io_pte_none(*ptr) || !_io_pte_present(*ptr))
+		return 0;
+
+	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
+}
+
+static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+	WARN_ON(!list_empty(&domain->bonds));
+
+	if (domain->pgd_root) {
+		const unsigned long pfn = virt_to_pfn(domain->pgd_root);
+
+		riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
+	}
+
+	if ((int)domain->pscid > 0)
+		ida_free(&riscv_iommu_pscids, domain->pscid);
+
+	kfree(domain);
+}
+
+static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
+{
+	switch (pgd_mode) {
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV39;
+
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV48;
+
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV57;
+	}
+	return false;
+}
+
+static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
+					    struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	struct page *page;
+
+	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
+		return -ENODEV;
+
+	domain->numa_node = dev_to_node(iommu->dev);
+	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD);
+
+	if (!domain->pgd_root) {
+		page = alloc_pages_node(domain->numa_node,
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0);
+		if (!page)
+			return -ENOMEM;
+		domain->pgd_root = (unsigned long)page_to_virt(page);
+	}
+
+	return riscv_iommu_attach_domain(iommu, dev, iommu_domain);
+}
+
+static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
+	.attach_dev = riscv_iommu_attach_paging_domain,
+	.free = riscv_iommu_free_paging_domain,
+	.map_pages = riscv_iommu_map_pages,
+	.unmap_pages = riscv_iommu_unmap_pages,
+	.iova_to_phys = riscv_iommu_iova_to_phys,
+	.iotlb_sync = riscv_iommu_iotlb_sync,
+	.flush_iotlb_all = riscv_iommu_flush_iotlb_all,
+};
+
+static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
+{
+	struct riscv_iommu_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD_RCU(&domain->bonds);
+
+	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+					RISCV_IOMMU_MAX_PSCID - 1, GFP_KERNEL);
+	if (domain->pscid < 0) {
+		kfree(domain);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * Note: RISC-V Privilege spec mandates that virtual addresses
+	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
+	 * bits >= VA_BITS need to also be set or else we'll get a
+	 * page fault. However the code that creates the mappings
+	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
+	 * for now, so we'll end up with invalid virtual addresses
+	 * to map. As a workaround until we get this sorted out
+	 * limit the available virtual addresses to VA_BITS - 1.
+	 */
+	domain->domain.geometry.aperture_start = 0;
+	domain->domain.geometry.aperture_end = DMA_BIT_MASK(VA_BITS - 1);
+	domain->domain.geometry.force_aperture = true;
+
+	/*
+	 * Follow system address translation mode.
+	 * RISC-V IOMMU ATP mode values match RISC-V CPU SATP mode values.
+	 */
+	domain->pgd_mode = satp_mode >> SATP_MODE_SHIFT;
+	domain->numa_node = NUMA_NO_NODE;
+	domain->domain.ops = &riscv_iommu_paging_domain_ops;
+
+	return &domain->domain;
+}
+
 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
 					      struct device *dev)
 {
@@ -814,7 +1275,7 @@  static struct iommu_domain riscv_iommu_identity_domain = {
 
 static int riscv_iommu_device_domain_type(struct device *dev)
 {
-	return IOMMU_DOMAIN_IDENTITY;
+	return 0;
 }
 
 static struct iommu_group *riscv_iommu_device_group(struct device *dev)
@@ -858,8 +1319,10 @@  static void riscv_iommu_release_device(struct device *dev)
 
 static const struct iommu_ops riscv_iommu_ops = {
 	.owner = THIS_MODULE,
+	.pgsize_bitmap = SZ_4K,
 	.of_xlate = riscv_iommu_of_xlate,
 	.identity_domain = &riscv_iommu_identity_domain,
+	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
 	.def_domain_type = riscv_iommu_device_domain_type,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,