diff mbox series

[RFC,v2,07/10] iommu/riscv: support nested iommu for creating domains owned by userspace

Message ID 20240614142156.29420-8-zong.li@sifive.com (mailing list archive)
State RFC
Headers show
Series RISC-V IOMMU HPM and nested IOMMU support | expand

Checks

Context Check Description
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Zong Li June 14, 2024, 2:21 p.m. UTC
This patch implements the .domain_alloc_user operation for creating domains
owned by userspace, e.g. through IOMMUFD. Add an s2 domain as the parent
domain for the second stage; the s1 domain will be the first stage.

Don't remove IOMMU private data of dev in blocked domain, because it
holds the user data of device, which is used when attaching device into
s1 domain.

Signed-off-by: Zong Li <zong.li@sifive.com>
---
 drivers/iommu/riscv/iommu.c  | 236 ++++++++++++++++++++++++++++++++++-
 include/uapi/linux/iommufd.h |  17 +++
 2 files changed, 252 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe June 19, 2024, 4:02 p.m. UTC | #1
On Fri, Jun 14, 2024 at 10:21:53PM +0800, Zong Li wrote:
> This patch implements .domain_alloc_user operation for creating domains
> owend by userspace, e.g. through IOMMUFD. Add s2 domain for parent
> domain for second stage, s1 domain will be the first stage.
> 
> Don't remove IOMMU private data of dev in blocked domain, because it
> holds the user data of device, which is used when attaching device into
> s1 domain.
> 
> Signed-off-by: Zong Li <zong.li@sifive.com>
> ---
>  drivers/iommu/riscv/iommu.c  | 236 ++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/iommufd.h |  17 +++
>  2 files changed, 252 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 2130106e421f..410b236e9b24 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -846,6 +846,8 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>  
>  /* This struct contains protection domain specific IOMMU driver data. */
>  struct riscv_iommu_domain {
> +	struct riscv_iommu_domain *s2;
> +	struct riscv_iommu_device *iommu;

IMHO you should create a riscv_iommu_domain_nested and not put these
here, like ARM did.

The kernel can't change the nested domain so it can't receive and
distribute invalidations.

> +/**
> + * Nested IOMMU operations
> + */
> +
> +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +
> +	/*
> +	 * Add bond to the new domain's list, but don't unlink in current domain.
> +	 * We need to flush entries in stage-2 page table by iterating the list.
> +	 */
> +	if (riscv_iommu_bond_link(riscv_domain, dev))
> +		return -ENOMEM;
> +
> +	riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
> +	info->dc_user.ta |= RISCV_IOMMU_PC_TA_V;

Seems odd??

> +	riscv_iommu_iodir_update(iommu, dev, &info->dc_user);

This will be need some updating to allow changes that don't toggle
V=0, like in arm.

> +	info->domain = riscv_domain;
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +	struct riscv_iommu_bond *bond;
> +
> +	/* Unlink bond in s2 domain, because we link bond both on s1 and s2 domain */
> +	list_for_each_entry_rcu(bond, &riscv_domain->s2->bonds, list)
> +		riscv_iommu_bond_unlink(riscv_domain->s2, bond->dev);
> +
> +	if ((int)riscv_domain->pscid > 0)
> +		ida_free(&riscv_iommu_pscids, riscv_domain->pscid);
> +
> +	kfree(riscv_domain);
> +}
> +
> +static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
> +	.attach_dev	= riscv_iommu_attach_dev_nested,
> +	.free		= riscv_iommu_domain_free_nested,
> +};
> +
> +static int
> +riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +	struct riscv_iommu_dc dc;
> +	struct riscv_iommu_fq_record event;
> +	u64 dc_len = sizeof(struct riscv_iommu_dc) >>
> +		     (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT));
> +	u64 event_len = sizeof(struct riscv_iommu_fq_record);
> +	void __user *event_user = NULL;
> +
> +	for (int i = 0; i < fwspec->num_ids; i++) {
> +		event.hdr =
> +			FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) |
> +			FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]);
> +
> +		/* Sanity check DC of stage-1 from user data */
> +		if (!user_arg->out_event_uptr || user_arg->event_len != event_len)
> +			return -EINVAL;

This is not extensible, see below about just inlining it.

> +		event_user = u64_to_user_ptr(user_arg->out_event_uptr);
> +
> +		if (!user_arg->dc_uptr || user_arg->dc_len != dc_len)
> +			return -EINVAL;
> +
> +		if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len))
> +			return -EFAULT;
> +
> +		if (!(dc.tc & RISCV_IOMMU_DDTE_V)) {
> +			dev_dbg(dev, "Invalid DDT from user data\n");
> +			if (copy_to_user(event_user, &event, event_len))
> +				return -EFAULT;
> +		}

On ARM we are going to support non-valid STEs. It should put the
translation into blocking and ideally emulate translation failure
events.

> +
> +		if (!dc.fsc || dc.iohgatp) {
> +			dev_dbg(dev, "Wrong page table from user data\n");
> +			if (copy_to_user(event_user, &event, event_len))
> +				return -EFAULT;
> +		}
> +
> +		/* Save DC of stage-1 from user data */
> +		memcpy(&info->dc_user,
> +		       riscv_iommu_get_dc(iommu, fwspec->ids[i]),

This does not seem right, the fwspec shouldn't be part of domain
allocation, even arguably for nesting. The nesting domain should
represent the user_dc only. Any customization of kernel controlled bits
should be done during attach only, nor do I really understand why this
is looping over all the fwspecs but only memcpying the last one..

> +		       sizeof(struct riscv_iommu_dc));
> +		info->dc_user.fsc = dc.fsc;
> +	}
> +
> +	return 0;
> +}
> +
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_nested(struct device *dev,
> +				struct iommu_domain *parent,
> +				const struct iommu_user_data *user_data)
> +{
> +	struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> +	struct riscv_iommu_domain *s1_domain;
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct iommu_hwpt_riscv_iommu arg;
> +	int ret, va_bits;
> +
> +	if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> +		return ERR_PTR(-EINVAL);
> +
> +	ret = iommu_copy_struct_from_user(&arg,
> +					  user_data,
> +					  IOMMU_HWPT_DATA_RISCV_IOMMU,
> +					  out_event_uptr);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> +	if (!s1_domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	spin_lock_init(&s1_domain->lock);
> +	INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> +
> +	s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> +					   RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> +	if (s1_domain->pscid < 0) {
> +		iommu_free_page(s1_domain->pgd_root);
> +		kfree(s1_domain);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* Get device context of stage-1 from user*/
> +	ret = riscv_iommu_get_dc_user(dev, &arg);
> +	if (ret) {
> +		kfree(s1_domain);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	if (!iommu) {
> +		va_bits = VA_BITS;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
> +		va_bits = 57;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
> +		va_bits = 48;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
> +		va_bits = 39;
> +	} else {
> +		dev_err(dev, "cannot find supported page table mode\n");
> +		return ERR_PTR(-ENODEV);
> +	}
> +
> +	/*
> +	 * The ops->domain_alloc_user could be directly called by the iommufd core,
> +	 * instead of iommu core. So, this function need to set the default value of
> +	 * following data member:
> +	 *  - domain->pgsize_bitmap
> +	 *  - domain->geometry
> +	 *  - domain->type
> +	 *  - domain->ops
> +	 */
> +	s1_domain->s2 = s2_domain;
> +	s1_domain->iommu = iommu;
> +	s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> +	s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
> +	s1_domain->domain.pgsize_bitmap = SZ_4K;
> +	s1_domain->domain.geometry.aperture_start = 0;
> +	s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> +	s1_domain->domain.geometry.force_aperture = true;

There is no geometry or page size of nesting domains.

> +
> +	return &s1_domain->domain;
> +}
> +
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct iommu_domain *domain;
> +	struct riscv_iommu_domain *riscv_domain;
> +
> +	/* Allocate stage-1 domain if it has stage-2 parent domain */
> +	if (parent)
> +		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> +	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (user_data)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* domain_alloc_user op needs to be fully initialized */
> +	domain = iommu_domain_alloc(dev->bus);

Please organize your driver to avoid calling this core function
through a pointer and return the correct type from the start so you
don't have to cast.

> +	if (!domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * We assume that nest-parent or g-stage only will come here
> +	 * TODO: Shadow page table doesn't be supported now.
> +	 *       We currently can't distinguish g-stage and shadow
> +	 *       page table here. Shadow page table shouldn't be
> +	 *       put at stage-2.
> +	 */
> +	riscv_domain = iommu_domain_to_riscv(domain);
> +
> +	/* pgd_root may be allocated in .domain_alloc_paging */
> +	if (riscv_domain->pgd_root)
> +		iommu_free_page(riscv_domain->pgd_root);

And don't do stuff like this, if domain_alloc didn't do the right
stuff then reorganize it so that it does. Most likely pass in a flag
that you are allocating the nest so it can setup properly if it is
only a small change like this.

> +/**
> + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> + *                                 info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> + * @dc_len: Length of device context
> + * @dc_uptr: User pointer to the address of device context
> + * @event_len: Length of an event record
> + * @out_event_uptr: User pointer to the address of event record
> + */
> +struct iommu_hwpt_riscv_iommu {
> +	__aligned_u64 dc_len;
> +	__aligned_u64 dc_uptr;

Do we really want this to be a pointer? ARM just inlined it in the
struct, why not do that?

> +	__aligned_u64 event_len;
> +	__aligned_u64 out_event_uptr;
> +};

Similar here too, why not just inline the response memory?

Jason
Joao Martins June 19, 2024, 4:34 p.m. UTC | #2
On 14/06/2024 15:21, Zong Li wrote:
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct iommu_domain *domain;
> +	struct riscv_iommu_domain *riscv_domain;
> +
> +	/* Allocate stage-1 domain if it has stage-2 parent domain */
> +	if (parent)
> +		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> +	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> +		return ERR_PTR(-EOPNOTSUPP);
> +

IOMMU_HWPT_ALLOC_DIRTY_TRACKING flag check should be dropped if it's not
supported in code (which looks to be the case in your series) e.g.

	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT)))
		return ERR_PTR(-EOPNOTSUPP);
Zong Li June 21, 2024, 7:34 a.m. UTC | #3
On Thu, Jun 20, 2024 at 12:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 14/06/2024 15:21, Zong Li wrote:
> > +static struct iommu_domain *
> > +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> > +                           struct iommu_domain *parent,
> > +                           const struct iommu_user_data *user_data)
> > +{
> > +     struct iommu_domain *domain;
> > +     struct riscv_iommu_domain *riscv_domain;
> > +
> > +     /* Allocate stage-1 domain if it has stage-2 parent domain */
> > +     if (parent)
> > +             return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> > +
> > +     if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> > +             return ERR_PTR(-EOPNOTSUPP);
> > +
>
> IOMMU_HWPT_ALLOC_DIRTY_TRACKING flag check should be dropped if it's not
> supported in code (which looks to be the case in your series) e.g.

Thanks for your tips, I will remove it in the next version.

>
>         if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT)))
>                 return ERR_PTR(-EOPNOTSUPP);
Zong Li June 28, 2024, 9:03 a.m. UTC | #4
On Thu, Jun 20, 2024 at 12:02 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Jun 14, 2024 at 10:21:53PM +0800, Zong Li wrote:
> > This patch implements .domain_alloc_user operation for creating domains
> > owend by userspace, e.g. through IOMMUFD. Add s2 domain for parent
> > domain for second stage, s1 domain will be the first stage.
> >
> > Don't remove IOMMU private data of dev in blocked domain, because it
> > holds the user data of device, which is used when attaching device into
> > s1 domain.
> >
> > Signed-off-by: Zong Li <zong.li@sifive.com>
> > ---
> >  drivers/iommu/riscv/iommu.c  | 236 ++++++++++++++++++++++++++++++++++-
> >  include/uapi/linux/iommufd.h |  17 +++
> >  2 files changed, 252 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index 2130106e421f..410b236e9b24 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -846,6 +846,8 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> >
> >  /* This struct contains protection domain specific IOMMU driver data. */
> >  struct riscv_iommu_domain {
> > +     struct riscv_iommu_domain *s2;
> > +     struct riscv_iommu_device *iommu;
>
> IMHO you should create a riscv_iommu_domain_nested and not put these
> here, like ARM did.
>
> The kernel can't change the nested domain so it can't recieve and
> distribute invalidations.

Ok, as you mentioned, there are many data elements in that data
structure that won't be used in the s1 domain.

>
> > +/**
> > + * Nested IOMMU operations
> > + */
> > +
> > +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> > +{
> > +     struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +     struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> > +
> > +     /*
> > +      * Add bond to the new domain's list, but don't unlink in current domain.
> > +      * We need to flush entries in stage-2 page table by iterating the list.
> > +      */
> > +     if (riscv_iommu_bond_link(riscv_domain, dev))
> > +             return -ENOMEM;
> > +
> > +     riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
> > +     info->dc_user.ta |= RISCV_IOMMU_PC_TA_V;
>
> Seems odd??
>
> > +     riscv_iommu_iodir_update(iommu, dev, &info->dc_user);
>
> This will be need some updating to allow changes that don't toggle
> V=0, like in arm.

I think the right code snippet was put in the 8th patch, as you pointed
out there. I will correct it in the next version.

>
> > +     info->domain = riscv_domain;
> > +
> > +     return 0;
> > +}
> > +
> > +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> > +{
> > +     struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> > +     struct riscv_iommu_bond *bond;
> > +
> > +     /* Unlink bond in s2 domain, because we link bond both on s1 and s2 domain */
> > +     list_for_each_entry_rcu(bond, &riscv_domain->s2->bonds, list)
> > +             riscv_iommu_bond_unlink(riscv_domain->s2, bond->dev);
> > +
> > +     if ((int)riscv_domain->pscid > 0)
> > +             ida_free(&riscv_iommu_pscids, riscv_domain->pscid);
> > +
> > +     kfree(riscv_domain);
> > +}
> > +
> > +static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
> > +     .attach_dev     = riscv_iommu_attach_dev_nested,
> > +     .free           = riscv_iommu_domain_free_nested,
> > +};
> > +
> > +static int
> > +riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg)
> > +{
> > +     struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +     struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> > +     struct riscv_iommu_dc dc;
> > +     struct riscv_iommu_fq_record event;
> > +     u64 dc_len = sizeof(struct riscv_iommu_dc) >>
> > +                  (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT));
> > +     u64 event_len = sizeof(struct riscv_iommu_fq_record);
> > +     void __user *event_user = NULL;
> > +
> > +     for (int i = 0; i < fwspec->num_ids; i++) {
> > +             event.hdr =
> > +                     FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) |
> > +                     FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]);
> > +
> > +             /* Sanity check DC of stage-1 from user data */
> > +             if (!user_arg->out_event_uptr || user_arg->event_len != event_len)
> > +                     return -EINVAL;
>
> This is not extensible, see below about just inlining it.
>
> > +             event_user = u64_to_user_ptr(user_arg->out_event_uptr);
> > +
> > +             if (!user_arg->dc_uptr || user_arg->dc_len != dc_len)
> > +                     return -EINVAL;
> > +
> > +             if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len))
> > +                     return -EFAULT;
> > +
> > +             if (!(dc.tc & RISCV_IOMMU_DDTE_V)) {
> > +                     dev_dbg(dev, "Invalid DDT from user data\n");
> > +                     if (copy_to_user(event_user, &event, event_len))
> > +                             return -EFAULT;
> > +             }
>
> On ARM we are going to support non-valid STEs. It should put the the
> translation into blocking and ideally emulate translation failure
> events.

Ok, let me consider this situation in next version.

>
> > +
> > +             if (!dc.fsc || dc.iohgatp) {
> > +                     dev_dbg(dev, "Wrong page table from user data\n");
> > +                     if (copy_to_user(event_user, &event, event_len))
> > +                             return -EFAULT;
> > +             }
> > +
> > +             /* Save DC of stage-1 from user data */
> > +             memcpy(&info->dc_user,
> > +                    riscv_iommu_get_dc(iommu, fwspec->ids[i]),
>
> This does not seem right, the fwspec shouldn't be part of domain
> allocation, even arguably for nesting. The nesting domain should
> represent the user_dc only. Any customization of kernel controlled bits
> should be done during attach only, nor do I really understand why this
> is looping over all the fwspecs but only memcpying the last one..
>

The fwspec is used to get the value of the current DC, because we want
to also back up the address of the second-stage table (i.e. iohgatp).
The reason is that this value is cleared when the device is attached to
the blocking domain; before the device attaches to the s1 domain, we
can't get the original value of iohgatp anymore when attaching the
device to the s1 domain.
Regarding the issue of only memcpying the last one, I will fix it in
the next version; we might need to allocate multiple user_dc entries at
runtime, because we don't statically know how many IDs will be used in
one platform device. Does that make sense to you?

> > +                    sizeof(struct riscv_iommu_dc));
> > +             info->dc_user.fsc = dc.fsc;
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static struct iommu_domain *
> > +riscv_iommu_domain_alloc_nested(struct device *dev,
> > +                             struct iommu_domain *parent,
> > +                             const struct iommu_user_data *user_data)
> > +{
> > +     struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> > +     struct riscv_iommu_domain *s1_domain;
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +     struct iommu_hwpt_riscv_iommu arg;
> > +     int ret, va_bits;
> > +
> > +     if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> > +             return ERR_PTR(-EOPNOTSUPP);
> > +
> > +     if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> > +             return ERR_PTR(-EINVAL);
> > +
> > +     ret = iommu_copy_struct_from_user(&arg,
> > +                                       user_data,
> > +                                       IOMMU_HWPT_DATA_RISCV_IOMMU,
> > +                                       out_event_uptr);
> > +     if (ret)
> > +             return ERR_PTR(ret);
> > +
> > +     s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> > +     if (!s1_domain)
> > +             return ERR_PTR(-ENOMEM);
> > +
> > +     spin_lock_init(&s1_domain->lock);
> > +     INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> > +
> > +     s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> > +                                        RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> > +     if (s1_domain->pscid < 0) {
> > +             iommu_free_page(s1_domain->pgd_root);
> > +             kfree(s1_domain);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> > +
> > +     /* Get device context of stage-1 from user*/
> > +     ret = riscv_iommu_get_dc_user(dev, &arg);
> > +     if (ret) {
> > +             kfree(s1_domain);
> > +             return ERR_PTR(-EINVAL);
> > +     }
> > +
> > +     if (!iommu) {
> > +             va_bits = VA_BITS;
> > +     } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
> > +             va_bits = 57;
> > +     } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
> > +             va_bits = 48;
> > +     } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
> > +             va_bits = 39;
> > +     } else {
> > +             dev_err(dev, "cannot find supported page table mode\n");
> > +             return ERR_PTR(-ENODEV);
> > +     }
> > +
> > +     /*
> > +      * The ops->domain_alloc_user could be directly called by the iommufd core,
> > +      * instead of iommu core. So, this function need to set the default value of
> > +      * following data member:
> > +      *  - domain->pgsize_bitmap
> > +      *  - domain->geometry
> > +      *  - domain->type
> > +      *  - domain->ops
> > +      */
> > +     s1_domain->s2 = s2_domain;
> > +     s1_domain->iommu = iommu;
> > +     s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> > +     s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
> > +     s1_domain->domain.pgsize_bitmap = SZ_4K;
> > +     s1_domain->domain.geometry.aperture_start = 0;
> > +     s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> > +     s1_domain->domain.geometry.force_aperture = true;
>
> There is no geometry or page size of nesting domains.
>

Thanks for pointing it out. I will fix it in the next version.

> > +
> > +     return &s1_domain->domain;
> > +}
> > +
> > +static struct iommu_domain *
> > +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> > +                           struct iommu_domain *parent,
> > +                           const struct iommu_user_data *user_data)
> > +{
> > +     struct iommu_domain *domain;
> > +     struct riscv_iommu_domain *riscv_domain;
> > +
> > +     /* Allocate stage-1 domain if it has stage-2 parent domain */
> > +     if (parent)
> > +             return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> > +
> > +     if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> > +             return ERR_PTR(-EOPNOTSUPP);
> > +
> > +     if (user_data)
> > +             return ERR_PTR(-EINVAL);
> > +
> > +     /* domain_alloc_user op needs to be fully initialized */
> > +     domain = iommu_domain_alloc(dev->bus);
>
> Please organize your driver to avoid calling this core function
> through a pointer and return the correct type from the start so you
> don't have to cast.

Ok, let me modify this part. Thanks .

>
> > +     if (!domain)
> > +             return ERR_PTR(-ENOMEM);
> > +
> > +     /*
> > +      * We assume that nest-parent or g-stage only will come here
> > +      * TODO: Shadow page table doesn't be supported now.
> > +      *       We currently can't distinguish g-stage and shadow
> > +      *       page table here. Shadow page table shouldn't be
> > +      *       put at stage-2.
> > +      */
> > +     riscv_domain = iommu_domain_to_riscv(domain);
> > +
> > +     /* pgd_root may be allocated in .domain_alloc_paging */
> > +     if (riscv_domain->pgd_root)
> > +             iommu_free_page(riscv_domain->pgd_root);
>
> And don't do stuff like this, if domain_alloc didn't do the right
> stuff then reorganize it so that it does. Most likely pass in a flag
> that you are allocating the nest so it can setup properly if it is
> only a small change like this.

Yes, if we don't rely on the original domain allocation, it won't be
as weird as it is now.

>
> > +/**
> > + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> > + *                                 info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> > + * @dc_len: Length of device context
> > + * @dc_uptr: User pointer to the address of device context
> > + * @event_len: Length of an event record
> > + * @out_event_uptr: User pointer to the address of event record
> > + */
> > +struct iommu_hwpt_riscv_iommu {
> > +     __aligned_u64 dc_len;
> > +     __aligned_u64 dc_uptr;
>
> Do we really want this to be a pointer? ARM just inlined it in the
> struct, why not do that?
>
> > +     __aligned_u64 event_len;
> > +     __aligned_u64 out_event_uptr;
> > +};
>
> Similar here too, why not just inline the response memory?

I think we can just inline them, just like what we do in
'iommu_hwpt_riscv_iommu_invalidate'. Do I understand correctly?

>
> Jason
Jason Gunthorpe June 28, 2024, 10:32 p.m. UTC | #5
On Fri, Jun 28, 2024 at 05:03:41PM +0800, Zong Li wrote:
> > > +
> > > +             if (!dc.fsc || dc.iohgatp) {
> > > +                     dev_dbg(dev, "Wrong page table from user data\n");
> > > +                     if (copy_to_user(event_user, &event, event_len))
> > > +                             return -EFAULT;
> > > +             }
> > > +
> > > +             /* Save DC of stage-1 from user data */
> > > +             memcpy(&info->dc_user,
> > > +                    riscv_iommu_get_dc(iommu, fwspec->ids[i]),
> >
> > This does not seem right, the fwspec shouldn't be part of domain
> > allocation, even arguably for nesting. The nesting domain should
> > represent the user_dc only. Any customization of kernel controlled bits
> > should be done during attach only, nor do I really understand why this
> > is looping over all the fwspecs but only memcpying the last one..
> >
> 
> The fwspec is used to get the value of current dc, because we want to
> also back up the address of second stage table (i.e. iohgatp), The
> reason is that this value will be cleaned when device is attached to
> the blocking domain, before the device attaches to s1 domain, we can't
> get the original value of iohgatp anymore when attach device to s1
> domain.

This is wrong, you get the value of iohgatp from the S2 domain the
nest knows directly. You must never make assumptions about domain
attach order or rely on the current value of the HW tables to
construct any attachment.

Follow the design like ARM has now where the value of the device table
entry is computed wholly from scratch using only the contents of the
domain pointer, including combining the S1 and S2 domain information.

And then you need to refactor and use the programmer I wrote for ARM
to be able to do the correct hitless transitions without a V=0
step. It is not too hard but will clean this all up.

> > > +/**
> > > + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> > > + *                                 info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> > > + * @dc_len: Length of device context
> > > + * @dc_uptr: User pointer to the address of device context
> > > + * @event_len: Length of an event record
> > > + * @out_event_uptr: User pointer to the address of event record
> > > + */
> > > +struct iommu_hwpt_riscv_iommu {
> > > +     __aligned_u64 dc_len;
> > > +     __aligned_u64 dc_uptr;
> >
> > Do we really want this to be a pointer? ARM just inlined it in the
> > struct, why not do that?
> >
> > > +     __aligned_u64 event_len;
> > > +     __aligned_u64 out_event_uptr;
> > > +};
> >
> > Similar here too, why not just inline the response memory?
> 
> I think we can just inline them, just like what we do in the
> 'iommu_hwpt_riscv_iommu_invalidate'. does I understand correctly?

Yeah I think so

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 2130106e421f..410b236e9b24 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -846,6 +846,8 @@  static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
 
 /* This struct contains protection domain specific IOMMU driver data. */
 struct riscv_iommu_domain {
+	struct riscv_iommu_domain *s2;
+	struct riscv_iommu_device *iommu;
 	struct iommu_domain domain;
 	struct list_head bonds;
 	spinlock_t lock;		/* protect bonds list updates. */
@@ -863,6 +865,7 @@  struct riscv_iommu_domain {
 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
 struct riscv_iommu_info {
 	struct riscv_iommu_domain *domain;
+	struct riscv_iommu_dc dc_user;
 };
 
 /*
@@ -1532,7 +1535,6 @@  static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
 	/* Make device context invalid, translation requests will fault w/ #258 */
 	riscv_iommu_iodir_update(iommu, dev, &dc);
 	riscv_iommu_bond_unlink(info->domain, dev);
-	info->domain = NULL;
 
 	return 0;
 }
@@ -1568,6 +1570,237 @@  static struct iommu_domain riscv_iommu_identity_domain = {
 	}
 };
 
+/*
+ * Nested IOMMU operations
+ */
+
+static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
+{
+	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+	/*
+	 * Add the bond to the new domain's list, but don't unlink it from the
+	 * current domain: we need to flush entries in the stage-2 page table
+	 * by iterating the list.
+	 */
+	if (riscv_iommu_bond_link(riscv_domain, dev))
+		return -ENOMEM;
+
+	riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
+	info->dc_user.ta |= RISCV_IOMMU_PC_TA_V;
+	riscv_iommu_iodir_update(iommu, dev, &info->dc_user);
+
+	info->domain = riscv_domain;
+
+	return 0;
+}
+
+static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
+{
+	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
+	struct riscv_iommu_bond *bond;
+
+	/* Unlink bonds in the s2 domain, because bonds are linked on both the s1 and s2 domains */
+	list_for_each_entry_rcu(bond, &riscv_domain->s2->bonds, list)
+		riscv_iommu_bond_unlink(riscv_domain->s2, bond->dev);
+
+	if ((int)riscv_domain->pscid > 0)
+		ida_free(&riscv_iommu_pscids, riscv_domain->pscid);
+
+	kfree(riscv_domain);
+}
+
+static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
+	.attach_dev	= riscv_iommu_attach_dev_nested,
+	.free		= riscv_iommu_domain_free_nested,
+};
+
+static int
+riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+	struct riscv_iommu_dc dc;
+	struct riscv_iommu_fq_record event;
+	u64 dc_len = sizeof(struct riscv_iommu_dc) >>
+		     (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT));
+	u64 event_len = sizeof(struct riscv_iommu_fq_record);
+	void __user *event_user = NULL;
+
+	for (int i = 0; i < fwspec->num_ids; i++) {
+		event.hdr =
+			FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) |
+			FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]);
+
+		/* Sanity check DC of stage-1 from user data */
+		if (!user_arg->out_event_uptr || user_arg->event_len != event_len)
+			return -EINVAL;
+
+		event_user = u64_to_user_ptr(user_arg->out_event_uptr);
+
+		if (!user_arg->dc_uptr || user_arg->dc_len != dc_len)
+			return -EINVAL;
+
+		if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len))
+			return -EFAULT;
+
+		if (!(dc.tc & RISCV_IOMMU_DDTE_V)) {
+			dev_dbg(dev, "Invalid DDT from user data\n");
+			if (copy_to_user(event_user, &event, event_len))
+				return -EFAULT;
+		}
+
+		if (!dc.fsc || dc.iohgatp) {
+			dev_dbg(dev, "Wrong page table from user data\n");
+			if (copy_to_user(event_user, &event, event_len))
+				return -EFAULT;
+		}
+
+		/* Save DC of stage-1 from user data */
+		memcpy(&info->dc_user,
+		       riscv_iommu_get_dc(iommu, fwspec->ids[i]),
+		       sizeof(struct riscv_iommu_dc));
+		info->dc_user.fsc = dc.fsc;
+	}
+
+	return 0;
+}
+
+static struct iommu_domain *
+riscv_iommu_domain_alloc_nested(struct device *dev,
+				struct iommu_domain *parent,
+				const struct iommu_user_data *user_data)
+{
+	struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
+	struct riscv_iommu_domain *s1_domain;
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct iommu_hwpt_riscv_iommu arg;
+	int ret, va_bits;
+
+	if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (parent->type != IOMMU_DOMAIN_UNMANAGED)
+		return ERR_PTR(-EINVAL);
+
+	ret = iommu_copy_struct_from_user(&arg,
+					  user_data,
+					  IOMMU_HWPT_DATA_RISCV_IOMMU,
+					  out_event_uptr);
+	if (ret)
+		return ERR_PTR(ret);
+
+	s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
+	if (!s1_domain)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&s1_domain->lock);
+	INIT_LIST_HEAD_RCU(&s1_domain->bonds);
+
+	s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+					   RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+	if (s1_domain->pscid < 0) {
+		iommu_free_page(s1_domain->pgd_root);
+		kfree(s1_domain);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Get device context of stage-1 from user */
+	ret = riscv_iommu_get_dc_user(dev, &arg);
+	if (ret) {
+		kfree(s1_domain);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!iommu) {
+		va_bits = VA_BITS;
+	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
+		va_bits = 57;
+	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
+		va_bits = 48;
+	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
+		va_bits = 39;
+	} else {
+		dev_err(dev, "cannot find supported page table mode\n");
+		return ERR_PTR(-ENODEV);
+	}
+
+	/*
+	 * The ops->domain_alloc_user may be called directly by the iommufd core
+	 * instead of the iommu core, so this function needs to set the default
+	 * values of the following data members:
+	 *  - domain->pgsize_bitmap
+	 *  - domain->geometry
+	 *  - domain->type
+	 *  - domain->ops
+	 */
+	s1_domain->s2 = s2_domain;
+	s1_domain->iommu = iommu;
+	s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
+	s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
+	s1_domain->domain.pgsize_bitmap = SZ_4K;
+	s1_domain->domain.geometry.aperture_start = 0;
+	s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
+	s1_domain->domain.geometry.force_aperture = true;
+
+	return &s1_domain->domain;
+}
+
+static struct iommu_domain *
+riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
+			      struct iommu_domain *parent,
+			      const struct iommu_user_data *user_data)
+{
+	struct iommu_domain *domain;
+	struct riscv_iommu_domain *riscv_domain;
+
+	/* Allocate stage-1 domain if it has stage-2 parent domain */
+	if (parent)
+		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
+
+	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (user_data)
+		return ERR_PTR(-EINVAL);
+
+	/* domain_alloc_user op needs to be fully initialized */
+	domain = iommu_domain_alloc(dev->bus);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * We assume that only nest-parent or g-stage domains come here.
+	 * TODO: Shadow page tables are not supported yet. We currently
+	 *       can't distinguish a g-stage page table from a shadow
+	 *       page table here; a shadow page table shouldn't be
+	 *       installed at stage-2.
+	 */
+	riscv_domain = iommu_domain_to_riscv(domain);
+
+	/* pgd_root may be allocated in .domain_alloc_paging */
+	if (riscv_domain->pgd_root)
+		iommu_free_page(riscv_domain->pgd_root);
+
+	riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node,
+							GFP_KERNEL_ACCOUNT,
+							2);
+	if (!riscv_domain->pgd_root)
+		return ERR_PTR(-ENOMEM);
+
+	riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
+					      RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
+	if (riscv_domain->gscid < 0) {
+		iommu_free_pages(riscv_domain->pgd_root, 2);
+		kfree(riscv_domain);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return domain;
+}
+
 static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
 {
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1668,6 +1901,7 @@  static const struct iommu_ops riscv_iommu_ops = {
 	.blocked_domain = &riscv_iommu_blocking_domain,
 	.release_domain = &riscv_iommu_blocking_domain,
 	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+	.domain_alloc_user = riscv_iommu_domain_alloc_user,
 	.def_domain_type = riscv_iommu_device_domain_type,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 736f4408b5e0..514463fe85d3 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -390,14 +390,31 @@  struct iommu_hwpt_vtd_s1 {
 	__u32 __reserved;
 };
 
+/**
+ * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
+ *                                 info (IOMMU_HWPT_DATA_RISCV_IOMMU)
+ * @dc_len: Length of device context
+ * @dc_uptr: User pointer to the address of device context
+ * @event_len: Length of an event record
+ * @out_event_uptr: User pointer to the address of event record
+ */
+struct iommu_hwpt_riscv_iommu {
+	__aligned_u64 dc_len;
+	__aligned_u64 dc_uptr;
+	__aligned_u64 event_len;
+	__aligned_u64 out_event_uptr;
+};
+
 /**
  * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
  * @IOMMU_HWPT_DATA_NONE: no data
  * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_RISCV_IOMMU: RISC-V IOMMU device context table
  */
 enum iommu_hwpt_data_type {
 	IOMMU_HWPT_DATA_NONE,
 	IOMMU_HWPT_DATA_VTD_S1,
+	IOMMU_HWPT_DATA_RISCV_IOMMU,
 };
 
 /**