diff mbox series

[v3] vhost/vdpa: Add MSI translation tables to iommu for software-managed MSI

Message ID 20240320101912.28210-1-w_angrong@163.com (mailing list archive)
State New, archived
Headers show
Series [v3] vhost/vdpa: Add MSI translation tables to iommu for software-managed MSI | expand

Commit Message

tab March 20, 2024, 10:19 a.m. UTC
From: Rong Wang <w_angrong@163.com>

Once an IOMMU domain is enabled for a device, the MSI
translation tables have to be there for software-managed MSI.
Otherwise, a platform with software-managed MSI but without an
irq bypass function cannot get a correct memory write event
from PCIe and will not get irqs.
The solution is to obtain the MSI physical base address from
the IOMMU reserved region, set it as the IOMMU MSI cookie,
and then the translation tables will be created while requesting the irq.

Change log
----------

v1->v2:
- add resv iotlb to avoid overlap mapping.
v2->v3:
- there is no need to export the iommu symbol anymore.

Signed-off-by: Rong Wang <w_angrong@163.com>
---
 drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

Comments

Stefano Garzarella March 20, 2024, 11:23 a.m. UTC | #1
On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
>From: Rong Wang <w_angrong@163.com>
>
>Once enable iommu domain for one device, the MSI
>translation tables have to be there for software-managed MSI.
>Otherwise, platform with software-managed MSI without an
>irq bypass function, can not get a correct memory write event
>from pcie, will not get irqs.
>The solution is to obtain the MSI phy base address from
>iommu reserved region, and set it to iommu MSI cookie,
>then translation tables will be created while request irq.
>
>Change log
>----------
>
>v1->v2:
>- add resv iotlb to avoid overlap mapping.
>v2->v3:
>- there is no need to export the iommu symbol anymore.
>
>Signed-off-by: Rong Wang <w_angrong@163.com>
>---
> drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 56 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>index ba52d128aeb7..28b56b10372b 100644
>--- a/drivers/vhost/vdpa.c
>+++ b/drivers/vhost/vdpa.c
>@@ -49,6 +49,7 @@ struct vhost_vdpa {
> 	struct completion completion;
> 	struct vdpa_device *vdpa;
> 	struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
>+	struct vhost_iotlb resv_iotlb;
> 	struct device dev;
> 	struct cdev cdev;
> 	atomic_t opened;
>@@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> static int vhost_vdpa_reset(struct vhost_vdpa *v)
> {
> 	v->in_batch = 0;
>+	vhost_iotlb_reset(&v->resv_iotlb);
> 	return _compat_vdpa_reset(v);
> }
>
>@@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> 	    msg->iova + msg->size - 1 > v->range.last)
> 		return -EINVAL;
>
>+	if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
>+					msg->iova + msg->size - 1))
>+		return -EINVAL;
>+
> 	if (vhost_iotlb_itree_first(iotlb, msg->iova,
> 				    msg->iova + msg->size - 1))
> 		return -EEXIST;
>
>+

Unnecessary new line here.

> 	if (vdpa->use_va)
> 		return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> 					 msg->uaddr, msg->perm);
>@@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> 	return vhost_chr_write_iter(dev, from);
> }
>
>+static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
>+	struct vhost_iotlb *resv_iotlb)
>+{
>+	struct list_head dev_resv_regions;
>+	phys_addr_t resv_msi_base = 0;
>+	struct iommu_resv_region *region;
>+	int ret = 0;
>+	bool with_sw_msi = false;
>+	bool with_hw_msi = false;
>+
>+	INIT_LIST_HEAD(&dev_resv_regions);
>+	iommu_get_resv_regions(dma_dev, &dev_resv_regions);
>+
>+	list_for_each_entry(region, &dev_resv_regions, list) {
>+		ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
>+				region->start + region->length - 1,
>+				0, 0, NULL);
>+		if (ret) {
>+			vhost_iotlb_reset(resv_iotlb);
>+			break;
>+		}
>+
>+		if (region->type == IOMMU_RESV_MSI)
>+			with_hw_msi = true;
>+
>+		if (region->type == IOMMU_RESV_SW_MSI) {
>+			resv_msi_base = region->start;

Can it happen that there are multiple regions of the IOMMU_RESV_SW_MSI 
type?

In this case, is it correct to overwrite `resv_msi_base`?

>+			with_sw_msi = true;
>+		}
>+	}
>+
>+	if (!ret && !with_hw_msi && with_sw_msi)
>+		ret = iommu_get_msi_cookie(domain, resv_msi_base);

If `iommu_get_msi_cookie()` fails:
  - Should we avoid calling iommu_put_resv_regions()?
  - Should we also call `vhost_iotlb_reset(resv_iotlb)` like for the
    vhost_iotlb_add_range_ctx() failure ?

If that is the case, maybe it's better to add an error label where we do
the cleanup.

>+
>+	iommu_put_resv_regions(dma_dev, &dev_resv_regions);
>+
>+	return ret;
>+}
>+
> static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> {
> 	struct vdpa_device *vdpa = v->vdpa;
>@@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
>
> 	ret = iommu_attach_device(v->domain, dma_dev);
> 	if (ret)
>-		goto err_attach;
>+		goto err_alloc_domain;
>
>-	return 0;
>+	ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
>+	if (ret)
>+		goto err_attach_device;
>
>-err_attach:
>+	return 0;

I suggest adding a new line here to separate the error path from the
success path.

>+err_attach_device:
>+	iommu_detach_device(v->domain, dma_dev);
>+err_alloc_domain:
> 	iommu_domain_free(v->domain);
> 	v->domain = NULL;
> 	return ret;
>@@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> 		goto err;
> 	}
>
>+	vhost_iotlb_init(&v->resv_iotlb, 0, 0);
>+

IIUC the lifetime of v->resv_iotlb, we initialize it here in the 
vdpa_driver.probe() and we fill it during the `open` of the vhost-vdpa 
character device.

So, should we reset it in the `release` of the vhost-vdpa character 
device?

Thanks,
Stefano

> 	r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> 	if (r)
> 		goto err;
>-- 
>2.27.0
>
>
Jason Wang March 21, 2024, 6:11 a.m. UTC | #2
On Wed, Mar 20, 2024 at 6:20 PM Wang Rong <w_angrong@163.com> wrote:
>
> From: Rong Wang <w_angrong@163.com>
>
> Once enable iommu domain for one device, the MSI
> translation tables have to be there for software-managed MSI.
> Otherwise, platform with software-managed MSI without an
> irq bypass function, can not get a correct memory write event
> from pcie, will not get irqs.
> The solution is to obtain the MSI phy base address from
> iommu reserved region, and set it to iommu MSI cookie,
> then translation tables will be created while request irq.
>
> Change log
> ----------
>
> v1->v2:
> - add resv iotlb to avoid overlap mapping.
> v2->v3:
> - there is no need to export the iommu symbol anymore.
>
> Signed-off-by: Rong Wang <w_angrong@163.com>
> ---
>  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 56 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index ba52d128aeb7..28b56b10372b 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -49,6 +49,7 @@ struct vhost_vdpa {
>         struct completion completion;
>         struct vdpa_device *vdpa;
>         struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> +       struct vhost_iotlb resv_iotlb;

Is it better to introduce a reserved flag like VHOST_MAP_RESERVED,
which means it can't be modified by userspace, only by the kernel?

So we don't need to have two IOTLB. But I guess the reason you have
this is because we may have multiple address spaces where the MSI
routing should work for all of them?

Another note: vhost-vDPA supports virtual address mapping, so this
should only work for physical address mapping. E.g. in the case of
SVA, the MSI iova is a valid IOVA for the driver/userspace.

>         struct device dev;
>         struct cdev cdev;
>         atomic_t opened;
> @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
>  static int vhost_vdpa_reset(struct vhost_vdpa *v)
>  {
>         v->in_batch = 0;
> +       vhost_iotlb_reset(&v->resv_iotlb);

We try hard to avoid this for performance, see this commit:

commit 4398776f7a6d532c466f9e41f601c9a291fac5ef
Author: Si-Wei Liu <si-wei.liu@oracle.com>
Date:   Sat Oct 21 02:25:15 2023 -0700

    vhost-vdpa: introduce IOTLB_PERSIST backend feature bit

Any reason you need to do this?

>         return _compat_vdpa_reset(v);
>  }
>
> @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
>             msg->iova + msg->size - 1 > v->range.last)
>                 return -EINVAL;
>
> +       if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> +                                       msg->iova + msg->size - 1))
> +               return -EINVAL;
> +
>         if (vhost_iotlb_itree_first(iotlb, msg->iova,
>                                     msg->iova + msg->size - 1))
>                 return -EEXIST;
>
> +
>         if (vdpa->use_va)
>                 return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
>                                          msg->uaddr, msg->perm);
> @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
>         return vhost_chr_write_iter(dev, from);
>  }
>
> +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> +       struct vhost_iotlb *resv_iotlb)
> +{
> +       struct list_head dev_resv_regions;
> +       phys_addr_t resv_msi_base = 0;
> +       struct iommu_resv_region *region;
> +       int ret = 0;
> +       bool with_sw_msi = false;
> +       bool with_hw_msi = false;
> +
> +       INIT_LIST_HEAD(&dev_resv_regions);
> +       iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> +
> +       list_for_each_entry(region, &dev_resv_regions, list) {
> +               ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> +                               region->start + region->length - 1,
> +                               0, 0, NULL);

I think MSI should be write-only?

> +               if (ret) {
> +                       vhost_iotlb_reset(resv_iotlb);

Need to report an error here.

Thanks
Michael S. Tsirkin March 21, 2024, 6:59 a.m. UTC | #3
On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> From: Rong Wang <w_angrong@163.com>
> 
> Once enable iommu domain for one device, the MSI
> translation tables have to be there for software-managed MSI.
> Otherwise, platform with software-managed MSI without an
> irq bypass function, can not get a correct memory write event
> from pcie, will not get irqs.
> The solution is to obtain the MSI phy base address from
> iommu reserved region, and set it to iommu MSI cookie,
> then translation tables will be created while request irq.
> 
> Change log
> ----------
> 
> v1->v2:
> - add resv iotlb to avoid overlap mapping.
> v2->v3:
> - there is no need to export the iommu symbol anymore.
> 
> Signed-off-by: Rong Wang <w_angrong@163.com>

There's no interest in extending vhost iotlb further -
we should just switch over to iommufd, which supports
this already.

> ---
>  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 56 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index ba52d128aeb7..28b56b10372b 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -49,6 +49,7 @@ struct vhost_vdpa {
>  	struct completion completion;
>  	struct vdpa_device *vdpa;
>  	struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> +	struct vhost_iotlb resv_iotlb;
>  	struct device dev;
>  	struct cdev cdev;
>  	atomic_t opened;
> @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
>  static int vhost_vdpa_reset(struct vhost_vdpa *v)
>  {
>  	v->in_batch = 0;
> +	vhost_iotlb_reset(&v->resv_iotlb);
>  	return _compat_vdpa_reset(v);
>  }
>  
> @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
>  	    msg->iova + msg->size - 1 > v->range.last)
>  		return -EINVAL;
>  
> +	if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> +					msg->iova + msg->size - 1))
> +		return -EINVAL;
> +
>  	if (vhost_iotlb_itree_first(iotlb, msg->iova,
>  				    msg->iova + msg->size - 1))
>  		return -EEXIST;
>  
> +
>  	if (vdpa->use_va)
>  		return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
>  					 msg->uaddr, msg->perm);
> @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
>  	return vhost_chr_write_iter(dev, from);
>  }
>  
> +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> +	struct vhost_iotlb *resv_iotlb)
> +{
> +	struct list_head dev_resv_regions;
> +	phys_addr_t resv_msi_base = 0;
> +	struct iommu_resv_region *region;
> +	int ret = 0;
> +	bool with_sw_msi = false;
> +	bool with_hw_msi = false;
> +
> +	INIT_LIST_HEAD(&dev_resv_regions);
> +	iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> +
> +	list_for_each_entry(region, &dev_resv_regions, list) {
> +		ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> +				region->start + region->length - 1,
> +				0, 0, NULL);
> +		if (ret) {
> +			vhost_iotlb_reset(resv_iotlb);
> +			break;
> +		}
> +
> +		if (region->type == IOMMU_RESV_MSI)
> +			with_hw_msi = true;
> +
> +		if (region->type == IOMMU_RESV_SW_MSI) {
> +			resv_msi_base = region->start;
> +			with_sw_msi = true;
> +		}
> +	}
> +
> +	if (!ret && !with_hw_msi && with_sw_msi)
> +		ret = iommu_get_msi_cookie(domain, resv_msi_base);
> +
> +	iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> +
> +	return ret;
> +}
> +
>  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
>  {
>  	struct vdpa_device *vdpa = v->vdpa;
> @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
>  
>  	ret = iommu_attach_device(v->domain, dma_dev);
>  	if (ret)
> -		goto err_attach;
> +		goto err_alloc_domain;
>  
> -	return 0;
> +	ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> +	if (ret)
> +		goto err_attach_device;
>  
> -err_attach:
> +	return 0;
> +err_attach_device:
> +	iommu_detach_device(v->domain, dma_dev);
> +err_alloc_domain:
>  	iommu_domain_free(v->domain);
>  	v->domain = NULL;
>  	return ret;
> @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
>  		goto err;
>  	}
>  
> +	vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> +
>  	r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
>  	if (r)
>  		goto err;
> -- 
> 2.27.0
>
Jason Wang March 27, 2024, 9:08 a.m. UTC | #4
On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > From: Rong Wang <w_angrong@163.com>
> >
> > Once enable iommu domain for one device, the MSI
> > translation tables have to be there for software-managed MSI.
> > Otherwise, platform with software-managed MSI without an
> > irq bypass function, can not get a correct memory write event
> > from pcie, will not get irqs.
> > The solution is to obtain the MSI phy base address from
> > iommu reserved region, and set it to iommu MSI cookie,
> > then translation tables will be created while request irq.
> >
> > Change log
> > ----------
> >
> > v1->v2:
> > - add resv iotlb to avoid overlap mapping.
> > v2->v3:
> > - there is no need to export the iommu symbol anymore.
> >
> > Signed-off-by: Rong Wang <w_angrong@163.com>
>
> There's in interest to keep extending vhost iotlb -
> we should just switch over to iommufd which supports
> this already.

IOMMUFD is good, but VFIO supported this before IOMMUFD. This patch
makes vDPA run without backporting the full IOMMUFD in a production
environment. I think it's worth it.

If you worry about the extension, we can just use the vhost iotlb
existing facility to do this.

Thanks

>
> > ---
> >  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 56 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > index ba52d128aeb7..28b56b10372b 100644
> > --- a/drivers/vhost/vdpa.c
> > +++ b/drivers/vhost/vdpa.c
> > @@ -49,6 +49,7 @@ struct vhost_vdpa {
> >       struct completion completion;
> >       struct vdpa_device *vdpa;
> >       struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> > +     struct vhost_iotlb resv_iotlb;
> >       struct device dev;
> >       struct cdev cdev;
> >       atomic_t opened;
> > @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> >  static int vhost_vdpa_reset(struct vhost_vdpa *v)
> >  {
> >       v->in_batch = 0;
> > +     vhost_iotlb_reset(&v->resv_iotlb);
> >       return _compat_vdpa_reset(v);
> >  }
> >
> > @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> >           msg->iova + msg->size - 1 > v->range.last)
> >               return -EINVAL;
> >
> > +     if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> > +                                     msg->iova + msg->size - 1))
> > +             return -EINVAL;
> > +
> >       if (vhost_iotlb_itree_first(iotlb, msg->iova,
> >                                   msg->iova + msg->size - 1))
> >               return -EEXIST;
> >
> > +
> >       if (vdpa->use_va)
> >               return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> >                                        msg->uaddr, msg->perm);
> > @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> >       return vhost_chr_write_iter(dev, from);
> >  }
> >
> > +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> > +     struct vhost_iotlb *resv_iotlb)
> > +{
> > +     struct list_head dev_resv_regions;
> > +     phys_addr_t resv_msi_base = 0;
> > +     struct iommu_resv_region *region;
> > +     int ret = 0;
> > +     bool with_sw_msi = false;
> > +     bool with_hw_msi = false;
> > +
> > +     INIT_LIST_HEAD(&dev_resv_regions);
> > +     iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> > +
> > +     list_for_each_entry(region, &dev_resv_regions, list) {
> > +             ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> > +                             region->start + region->length - 1,
> > +                             0, 0, NULL);
> > +             if (ret) {
> > +                     vhost_iotlb_reset(resv_iotlb);
> > +                     break;
> > +             }
> > +
> > +             if (region->type == IOMMU_RESV_MSI)
> > +                     with_hw_msi = true;
> > +
> > +             if (region->type == IOMMU_RESV_SW_MSI) {
> > +                     resv_msi_base = region->start;
> > +                     with_sw_msi = true;
> > +             }
> > +     }
> > +
> > +     if (!ret && !with_hw_msi && with_sw_msi)
> > +             ret = iommu_get_msi_cookie(domain, resv_msi_base);
> > +
> > +     iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> > +
> > +     return ret;
> > +}
> > +
> >  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> >  {
> >       struct vdpa_device *vdpa = v->vdpa;
> > @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> >
> >       ret = iommu_attach_device(v->domain, dma_dev);
> >       if (ret)
> > -             goto err_attach;
> > +             goto err_alloc_domain;
> >
> > -     return 0;
> > +     ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> > +     if (ret)
> > +             goto err_attach_device;
> >
> > -err_attach:
> > +     return 0;
> > +err_attach_device:
> > +     iommu_detach_device(v->domain, dma_dev);
> > +err_alloc_domain:
> >       iommu_domain_free(v->domain);
> >       v->domain = NULL;
> >       return ret;
> > @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> >               goto err;
> >       }
> >
> > +     vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> > +
> >       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> >       if (r)
> >               goto err;
> > --
> > 2.27.0
> >
>
Jason Wang March 29, 2024, 3:55 a.m. UTC | #5
On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > From: Rong Wang <w_angrong@163.com>
> > >
> > > Once enable iommu domain for one device, the MSI
> > > translation tables have to be there for software-managed MSI.
> > > Otherwise, platform with software-managed MSI without an
> > > irq bypass function, can not get a correct memory write event
> > > from pcie, will not get irqs.
> > > The solution is to obtain the MSI phy base address from
> > > iommu reserved region, and set it to iommu MSI cookie,
> > > then translation tables will be created while request irq.
> > >
> > > Change log
> > > ----------
> > >
> > > v1->v2:
> > > - add resv iotlb to avoid overlap mapping.
> > > v2->v3:
> > > - there is no need to export the iommu symbol anymore.
> > >
> > > Signed-off-by: Rong Wang <w_angrong@163.com>
> >
> > There's in interest to keep extending vhost iotlb -
> > we should just switch over to iommufd which supports
> > this already.
>
> IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> makes vDPA run without a backporting of full IOMMUFD in the production
> environment. I think it's worth.
>
> If you worry about the extension, we can just use the vhost iotlb
> existing facility to do this.
>
> Thanks

Btw, Wang Rong,

It looks that Cindy does have the bandwidth in working for IOMMUFD support.

Do you have the will to do that?

Thanks
Michael S. Tsirkin March 29, 2024, 9:13 a.m. UTC | #6
On Wed, Mar 27, 2024 at 05:08:57PM +0800, Jason Wang wrote:
> On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > From: Rong Wang <w_angrong@163.com>
> > >
> > > Once enable iommu domain for one device, the MSI
> > > translation tables have to be there for software-managed MSI.
> > > Otherwise, platform with software-managed MSI without an
> > > irq bypass function, can not get a correct memory write event
> > > from pcie, will not get irqs.
> > > The solution is to obtain the MSI phy base address from
> > > iommu reserved region, and set it to iommu MSI cookie,
> > > then translation tables will be created while request irq.
> > >
> > > Change log
> > > ----------
> > >
> > > v1->v2:
> > > - add resv iotlb to avoid overlap mapping.
> > > v2->v3:
> > > - there is no need to export the iommu symbol anymore.
> > >
> > > Signed-off-by: Rong Wang <w_angrong@163.com>
> >
> > There's in interest to keep extending vhost iotlb -
> > we should just switch over to iommufd which supports
> > this already.
> 
> IOMMUFD is good but VFIO supports this before IOMMUFD.

You mean VFIO migrated to IOMMUFD but of course they keep supporting
their old UAPI? OK and point being?

> This patch
> makes vDPA run without a backporting of full IOMMUFD in the production
> environment. I think it's worth.

Where do we stop? Saying no to features is the only tool maintainers
have to make cleanups happen; otherwise people will just keep piling
stuff up.

> If you worry about the extension, we can just use the vhost iotlb
> existing facility to do this.
> 
> Thanks
> 
> >
> > > ---
> > >  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 56 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > > index ba52d128aeb7..28b56b10372b 100644
> > > --- a/drivers/vhost/vdpa.c
> > > +++ b/drivers/vhost/vdpa.c
> > > @@ -49,6 +49,7 @@ struct vhost_vdpa {
> > >       struct completion completion;
> > >       struct vdpa_device *vdpa;
> > >       struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> > > +     struct vhost_iotlb resv_iotlb;
> > >       struct device dev;
> > >       struct cdev cdev;
> > >       atomic_t opened;
> > > @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> > >  static int vhost_vdpa_reset(struct vhost_vdpa *v)
> > >  {
> > >       v->in_batch = 0;
> > > +     vhost_iotlb_reset(&v->resv_iotlb);
> > >       return _compat_vdpa_reset(v);
> > >  }
> > >
> > > @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> > >           msg->iova + msg->size - 1 > v->range.last)
> > >               return -EINVAL;
> > >
> > > +     if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> > > +                                     msg->iova + msg->size - 1))
> > > +             return -EINVAL;
> > > +
> > >       if (vhost_iotlb_itree_first(iotlb, msg->iova,
> > >                                   msg->iova + msg->size - 1))
> > >               return -EEXIST;
> > >
> > > +
> > >       if (vdpa->use_va)
> > >               return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> > >                                        msg->uaddr, msg->perm);
> > > @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> > >       return vhost_chr_write_iter(dev, from);
> > >  }
> > >
> > > +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> > > +     struct vhost_iotlb *resv_iotlb)
> > > +{
> > > +     struct list_head dev_resv_regions;
> > > +     phys_addr_t resv_msi_base = 0;
> > > +     struct iommu_resv_region *region;
> > > +     int ret = 0;
> > > +     bool with_sw_msi = false;
> > > +     bool with_hw_msi = false;
> > > +
> > > +     INIT_LIST_HEAD(&dev_resv_regions);
> > > +     iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> > > +
> > > +     list_for_each_entry(region, &dev_resv_regions, list) {
> > > +             ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> > > +                             region->start + region->length - 1,
> > > +                             0, 0, NULL);
> > > +             if (ret) {
> > > +                     vhost_iotlb_reset(resv_iotlb);
> > > +                     break;
> > > +             }
> > > +
> > > +             if (region->type == IOMMU_RESV_MSI)
> > > +                     with_hw_msi = true;
> > > +
> > > +             if (region->type == IOMMU_RESV_SW_MSI) {
> > > +                     resv_msi_base = region->start;
> > > +                     with_sw_msi = true;
> > > +             }
> > > +     }
> > > +
> > > +     if (!ret && !with_hw_msi && with_sw_msi)
> > > +             ret = iommu_get_msi_cookie(domain, resv_msi_base);
> > > +
> > > +     iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> > > +
> > > +     return ret;
> > > +}
> > > +
> > >  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > >  {
> > >       struct vdpa_device *vdpa = v->vdpa;
> > > @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > >
> > >       ret = iommu_attach_device(v->domain, dma_dev);
> > >       if (ret)
> > > -             goto err_attach;
> > > +             goto err_alloc_domain;
> > >
> > > -     return 0;
> > > +     ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> > > +     if (ret)
> > > +             goto err_attach_device;
> > >
> > > -err_attach:
> > > +     return 0;
> > > +err_attach_device:
> > > +     iommu_detach_device(v->domain, dma_dev);
> > > +err_alloc_domain:
> > >       iommu_domain_free(v->domain);
> > >       v->domain = NULL;
> > >       return ret;
> > > @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> > >               goto err;
> > >       }
> > >
> > > +     vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> > > +
> > >       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> > >       if (r)
> > >               goto err;
> > > --
> > > 2.27.0
> > >
> >
Michael S. Tsirkin March 29, 2024, 9:13 a.m. UTC | #7
On Fri, Mar 29, 2024 at 11:55:50AM +0800, Jason Wang wrote:
> On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > From: Rong Wang <w_angrong@163.com>
> > > >
> > > > Once enable iommu domain for one device, the MSI
> > > > translation tables have to be there for software-managed MSI.
> > > > Otherwise, platform with software-managed MSI without an
> > > > irq bypass function, can not get a correct memory write event
> > > > from pcie, will not get irqs.
> > > > The solution is to obtain the MSI phy base address from
> > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > then translation tables will be created while request irq.
> > > >
> > > > Change log
> > > > ----------
> > > >
> > > > v1->v2:
> > > > - add resv iotlb to avoid overlap mapping.
> > > > v2->v3:
> > > > - there is no need to export the iommu symbol anymore.
> > > >
> > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > >
> > > There's in interest to keep extending vhost iotlb -
> > > we should just switch over to iommufd which supports
> > > this already.
> >
> > IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> > makes vDPA run without a backporting of full IOMMUFD in the production
> > environment. I think it's worth.
> >
> > If you worry about the extension, we can just use the vhost iotlb
> > existing facility to do this.
> >
> > Thanks
> 
> Btw, Wang Rong,
> 
> It looks that Cindy does have the bandwidth in working for IOMMUFD support.

I think you mean she does not.

> Do you have the will to do that?
> 
> Thanks
Jason Wang March 29, 2024, 10:39 a.m. UTC | #8
On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 27, 2024 at 05:08:57PM +0800, Jason Wang wrote:
> > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > From: Rong Wang <w_angrong@163.com>
> > > >
> > > > Once enable iommu domain for one device, the MSI
> > > > translation tables have to be there for software-managed MSI.
> > > > Otherwise, platform with software-managed MSI without an
> > > > irq bypass function, can not get a correct memory write event
> > > > from pcie, will not get irqs.
> > > > The solution is to obtain the MSI phy base address from
> > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > then translation tables will be created while request irq.
> > > >
> > > > Change log
> > > > ----------
> > > >
> > > > v1->v2:
> > > > - add resv iotlb to avoid overlap mapping.
> > > > v2->v3:
> > > > - there is no need to export the iommu symbol anymore.
> > > >
> > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > >
> > > There's in interest to keep extending vhost iotlb -
> > > we should just switch over to iommufd which supports
> > > this already.
> >
> > IOMMUFD is good but VFIO supports this before IOMMUFD.
>
> You mean VFIO migrated to IOMMUFD but of course they keep supporting
> their old UAPI?

I meant that VFIO supported software-managed MSI before IOMMUFD.

> OK and point being?
>
> > This patch
> > makes vDPA run without a backporting of full IOMMUFD in the production
> > environment. I think it's worth.
>
> Where do we stop? saying no to features is the only tool maintainers
> have to make cleanups happen, otherwise people will just keep piling
> stuff up.

I think we should not have more features than VFIO without IOMMUFD.

Thanks

>
> > If you worry about the extension, we can just use the vhost iotlb
> > existing facility to do this.
> >
> > Thanks
> >
> > >
> > > > ---
> > > >  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 56 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > > > index ba52d128aeb7..28b56b10372b 100644
> > > > --- a/drivers/vhost/vdpa.c
> > > > +++ b/drivers/vhost/vdpa.c
> > > > @@ -49,6 +49,7 @@ struct vhost_vdpa {
> > > >       struct completion completion;
> > > >       struct vdpa_device *vdpa;
> > > >       struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> > > > +     struct vhost_iotlb resv_iotlb;
> > > >       struct device dev;
> > > >       struct cdev cdev;
> > > >       atomic_t opened;
> > > > @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> > > >  static int vhost_vdpa_reset(struct vhost_vdpa *v)
> > > >  {
> > > >       v->in_batch = 0;
> > > > +     vhost_iotlb_reset(&v->resv_iotlb);
> > > >       return _compat_vdpa_reset(v);
> > > >  }
> > > >
> > > > @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> > > >           msg->iova + msg->size - 1 > v->range.last)
> > > >               return -EINVAL;
> > > >
> > > > +     if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> > > > +                                     msg->iova + msg->size - 1))
> > > > +             return -EINVAL;
> > > > +
> > > >       if (vhost_iotlb_itree_first(iotlb, msg->iova,
> > > >                                   msg->iova + msg->size - 1))
> > > >               return -EEXIST;
> > > >
> > > > +
> > > >       if (vdpa->use_va)
> > > >               return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> > > >                                        msg->uaddr, msg->perm);
> > > > @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> > > >       return vhost_chr_write_iter(dev, from);
> > > >  }
> > > >
> > > > +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> > > > +     struct vhost_iotlb *resv_iotlb)
> > > > +{
> > > > +     struct list_head dev_resv_regions;
> > > > +     phys_addr_t resv_msi_base = 0;
> > > > +     struct iommu_resv_region *region;
> > > > +     int ret = 0;
> > > > +     bool with_sw_msi = false;
> > > > +     bool with_hw_msi = false;
> > > > +
> > > > +     INIT_LIST_HEAD(&dev_resv_regions);
> > > > +     iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> > > > +
> > > > +     list_for_each_entry(region, &dev_resv_regions, list) {
> > > > +             ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> > > > +                             region->start + region->length - 1,
> > > > +                             0, 0, NULL);
> > > > +             if (ret) {
> > > > +                     vhost_iotlb_reset(resv_iotlb);
> > > > +                     break;
> > > > +             }
> > > > +
> > > > +             if (region->type == IOMMU_RESV_MSI)
> > > > +                     with_hw_msi = true;
> > > > +
> > > > +             if (region->type == IOMMU_RESV_SW_MSI) {
> > > > +                     resv_msi_base = region->start;
> > > > +                     with_sw_msi = true;
> > > > +             }
> > > > +     }
> > > > +
> > > > +     if (!ret && !with_hw_msi && with_sw_msi)
> > > > +             ret = iommu_get_msi_cookie(domain, resv_msi_base);
> > > > +
> > > > +     iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> > > > +
> > > > +     return ret;
> > > > +}
> > > > +
> > > >  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > >  {
> > > >       struct vdpa_device *vdpa = v->vdpa;
> > > > @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > >
> > > >       ret = iommu_attach_device(v->domain, dma_dev);
> > > >       if (ret)
> > > > -             goto err_attach;
> > > > +             goto err_alloc_domain;
> > > >
> > > > -     return 0;
> > > > +     ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> > > > +     if (ret)
> > > > +             goto err_attach_device;
> > > >
> > > > -err_attach:
> > > > +     return 0;
> > > > +err_attach_device:
> > > > +     iommu_detach_device(v->domain, dma_dev);
> > > > +err_alloc_domain:
> > > >       iommu_domain_free(v->domain);
> > > >       v->domain = NULL;
> > > >       return ret;
> > > > @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> > > >               goto err;
> > > >       }
> > > >
> > > > +     vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> > > > +
> > > >       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> > > >       if (r)
> > > >               goto err;
> > > > --
> > > > 2.27.0
> > > >
> > >
>
Jason Wang March 29, 2024, 10:39 a.m. UTC | #9
On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 29, 2024 at 11:55:50AM +0800, Jason Wang wrote:
> > On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > From: Rong Wang <w_angrong@163.com>
> > > > >
> > > > > Once enable iommu domain for one device, the MSI
> > > > > translation tables have to be there for software-managed MSI.
> > > > > Otherwise, platform with software-managed MSI without an
> > > > > irq bypass function, can not get a correct memory write event
> > > > > from pcie, will not get irqs.
> > > > > The solution is to obtain the MSI phy base address from
> > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > then translation tables will be created while request irq.
> > > > >
> > > > > Change log
> > > > > ----------
> > > > >
> > > > > v1->v2:
> > > > > - add resv iotlb to avoid overlap mapping.
> > > > > v2->v3:
> > > > > - there is no need to export the iommu symbol anymore.
> > > > >
> > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > >
> > > > There's no interest to keep extending vhost iotlb -
> > > > we should just switch over to iommufd which supports
> > > > this already.
> > >
> > > IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > environment. I think it's worth.
> > >
> > > If you worry about the extension, we can just use the vhost iotlb
> > > existing facility to do this.
> > >
> > > Thanks
> >
> > Btw, Wang Rong,
> >
> > It looks that Cindy does have the bandwidth in working for IOMMUFD support.
>
> I think you mean she does not.

Yes, you are right.

Thanks

>
> > Do you have the will to do that?
> >
> > Thanks
>
Michael S. Tsirkin March 29, 2024, 10:42 a.m. UTC | #10
On Fri, Mar 29, 2024 at 06:39:33PM +0800, Jason Wang wrote:
> On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 05:08:57PM +0800, Jason Wang wrote:
> > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > From: Rong Wang <w_angrong@163.com>
> > > > >
> > > > > Once enable iommu domain for one device, the MSI
> > > > > translation tables have to be there for software-managed MSI.
> > > > > Otherwise, platform with software-managed MSI without an
> > > > > irq bypass function, can not get a correct memory write event
> > > > > from pcie, will not get irqs.
> > > > > The solution is to obtain the MSI phy base address from
> > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > then translation tables will be created while request irq.
> > > > >
> > > > > Change log
> > > > > ----------
> > > > >
> > > > > v1->v2:
> > > > > - add resv iotlb to avoid overlap mapping.
> > > > > v2->v3:
> > > > > - there is no need to export the iommu symbol anymore.
> > > > >
> > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > >
> > > > There's no interest to keep extending vhost iotlb -
> > > > we should just switch over to iommufd which supports
> > > > this already.
> > >
> > > IOMMUFD is good but VFIO supports this before IOMMUFD.
> >
> > You mean VFIO migrated to IOMMUFD but of course they keep supporting
> > their old UAPI?
> 
> I meant VFIO supported software-managed MSI before IOMMUFD.

And then they switched over and stopped adding new IOMMU
related features. And so should vdpa?


> > OK and point being?
> >
> > > This patch
> > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > environment. I think it's worth.
> >
> > Where do we stop? saying no to features is the only tool maintainers
> > have to make cleanups happen, otherwise people will just keep piling
> > stuff up.
> 
> I think we should not have more features than VFIO without IOMMUFD.
> 
> Thanks
> 
> >
> > > If you worry about the extension, we can just use the vhost iotlb
> > > existing facility to do this.
> > >
> > > Thanks
> > >
> > > >
> > > > > ---
> > > > >  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 56 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > > > > index ba52d128aeb7..28b56b10372b 100644
> > > > > --- a/drivers/vhost/vdpa.c
> > > > > +++ b/drivers/vhost/vdpa.c
> > > > > @@ -49,6 +49,7 @@ struct vhost_vdpa {
> > > > >       struct completion completion;
> > > > >       struct vdpa_device *vdpa;
> > > > >       struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> > > > > +     struct vhost_iotlb resv_iotlb;
> > > > >       struct device dev;
> > > > >       struct cdev cdev;
> > > > >       atomic_t opened;
> > > > > @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> > > > >  static int vhost_vdpa_reset(struct vhost_vdpa *v)
> > > > >  {
> > > > >       v->in_batch = 0;
> > > > > +     vhost_iotlb_reset(&v->resv_iotlb);
> > > > >       return _compat_vdpa_reset(v);
> > > > >  }
> > > > >
> > > > > @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> > > > >           msg->iova + msg->size - 1 > v->range.last)
> > > > >               return -EINVAL;
> > > > >
> > > > > +     if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> > > > > +                                     msg->iova + msg->size - 1))
> > > > > +             return -EINVAL;
> > > > > +
> > > > >       if (vhost_iotlb_itree_first(iotlb, msg->iova,
> > > > >                                   msg->iova + msg->size - 1))
> > > > >               return -EEXIST;
> > > > >
> > > > > +
> > > > >       if (vdpa->use_va)
> > > > >               return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> > > > >                                        msg->uaddr, msg->perm);
> > > > > @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> > > > >       return vhost_chr_write_iter(dev, from);
> > > > >  }
> > > > >
> > > > > +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> > > > > +     struct vhost_iotlb *resv_iotlb)
> > > > > +{
> > > > > +     struct list_head dev_resv_regions;
> > > > > +     phys_addr_t resv_msi_base = 0;
> > > > > +     struct iommu_resv_region *region;
> > > > > +     int ret = 0;
> > > > > +     bool with_sw_msi = false;
> > > > > +     bool with_hw_msi = false;
> > > > > +
> > > > > +     INIT_LIST_HEAD(&dev_resv_regions);
> > > > > +     iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> > > > > +
> > > > > +     list_for_each_entry(region, &dev_resv_regions, list) {
> > > > > +             ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> > > > > +                             region->start + region->length - 1,
> > > > > +                             0, 0, NULL);
> > > > > +             if (ret) {
> > > > > +                     vhost_iotlb_reset(resv_iotlb);
> > > > > +                     break;
> > > > > +             }
> > > > > +
> > > > > +             if (region->type == IOMMU_RESV_MSI)
> > > > > +                     with_hw_msi = true;
> > > > > +
> > > > > +             if (region->type == IOMMU_RESV_SW_MSI) {
> > > > > +                     resv_msi_base = region->start;
> > > > > +                     with_sw_msi = true;
> > > > > +             }
> > > > > +     }
> > > > > +
> > > > > +     if (!ret && !with_hw_msi && with_sw_msi)
> > > > > +             ret = iommu_get_msi_cookie(domain, resv_msi_base);
> > > > > +
> > > > > +     iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> > > > > +
> > > > > +     return ret;
> > > > > +}
> > > > > +
> > > > >  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > > >  {
> > > > >       struct vdpa_device *vdpa = v->vdpa;
> > > > > @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > > >
> > > > >       ret = iommu_attach_device(v->domain, dma_dev);
> > > > >       if (ret)
> > > > > -             goto err_attach;
> > > > > +             goto err_alloc_domain;
> > > > >
> > > > > -     return 0;
> > > > > +     ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> > > > > +     if (ret)
> > > > > +             goto err_attach_device;
> > > > >
> > > > > -err_attach:
> > > > > +     return 0;
> > > > > +err_attach_device:
> > > > > +     iommu_detach_device(v->domain, dma_dev);
> > > > > +err_alloc_domain:
> > > > >       iommu_domain_free(v->domain);
> > > > >       v->domain = NULL;
> > > > >       return ret;
> > > > > @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> > > > >               goto err;
> > > > >       }
> > > > >
> > > > > +     vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> > > > > +
> > > > >       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> > > > >       if (r)
> > > > >               goto err;
> > > > > --
> > > > > 2.27.0
> > > > >
> > > >
> >
tab April 3, 2024, 1:41 a.m. UTC | #11
I need to discuss internally, and there may be someone else who will do that.
在 2024-03-29 18:39:54,"Jason Wang" <jasowang@redhat.com> 写道:
On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 29, 2024 at 11:55:50AM +0800, Jason Wang wrote:
> > On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > From: Rong Wang <w_angrong@163.com>
> > > > >
> > > > > Once enable iommu domain for one device, the MSI
> > > > > translation tables have to be there for software-managed MSI.
> > > > > Otherwise, platform with software-managed MSI without an
> > > > > irq bypass function, can not get a correct memory write event
> > > > > from pcie, will not get irqs.
> > > > > The solution is to obtain the MSI phy base address from
> > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > then translation tables will be created while request irq.
> > > > >
> > > > > Change log
> > > > > ----------
> > > > >
> > > > > v1->v2:
> > > > > - add resv iotlb to avoid overlap mapping.
> > > > > v2->v3:
> > > > > - there is no need to export the iommu symbol anymore.
> > > > >
> > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > >
> > > > There's no interest to keep extending vhost iotlb -
> > > > we should just switch over to iommufd which supports
> > > > this already.
> > >
> > > IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > environment. I think it's worth.
> > >
> > > If you worry about the extension, we can just use the vhost iotlb
> > > existing facility to do this.
> > >
> > > Thanks
> >
> > Btw, Wang Rong,
> >
> > It looks that Cindy does have the bandwidth in working for IOMMUFD support.
>
> I think you mean she does not.

Yes, you are right.

Thanks

>
> > Do you have the will to do that?
> >
> > Thanks
>
Jason Wang April 7, 2024, 3:38 a.m. UTC | #12
On Fri, Mar 29, 2024 at 6:42 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 29, 2024 at 06:39:33PM +0800, Jason Wang wrote:
> > On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 05:08:57PM +0800, Jason Wang wrote:
> > > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > > From: Rong Wang <w_angrong@163.com>
> > > > > >
> > > > > > Once enable iommu domain for one device, the MSI
> > > > > > translation tables have to be there for software-managed MSI.
> > > > > > Otherwise, platform with software-managed MSI without an
> > > > > > irq bypass function, can not get a correct memory write event
> > > > > > from pcie, will not get irqs.
> > > > > > The solution is to obtain the MSI phy base address from
> > > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > > then translation tables will be created while request irq.
> > > > > >
> > > > > > Change log
> > > > > > ----------
> > > > > >
> > > > > > v1->v2:
> > > > > > - add resv iotlb to avoid overlap mapping.
> > > > > > v2->v3:
> > > > > > - there is no need to export the iommu symbol anymore.
> > > > > >
> > > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > > >
> > > > > There's no interest to keep extending vhost iotlb -
> > > > > we should just switch over to iommufd which supports
> > > > > this already.
> > > >
> > > > IOMMUFD is good but VFIO supports this before IOMMUFD.
> > >
> > > You mean VFIO migrated to IOMMUFD but of course they keep supporting
> > > their old UAPI?
> >
> > I meant VFIO supported software-managed MSI before IOMMUFD.
>
> And then they switched over and stopped adding new IOMMU
> related features. And so should vdpa?

For some cloud vendors, it means vDPA can't be used until

1) IOMMUFD support for vDPA is supported by upstream
2) IOMMUFD is backported

1) might be fine but 2) might be impossible.

Assuming IOMMUFD hasn't been done for vDPA. Adding small features like
this seems reasonable (especially considering it is supported by the
"legacy" VFIO container).

Thanks

>
>
> > > OK and point being?
> > >
> > > > This patch
> > > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > > environment. I think it's worth.
> > >
> > > Where do we stop? saying no to features is the only tool maintainers
> > > have to make cleanups happen, otherwise people will just keep piling
> > > stuff up.
> >
> > I think we should not have more features than VFIO without IOMMUFD.
> >
> > Thanks
> >
> > >
> > > > If you worry about the extension, we can just use the vhost iotlb
> > > > existing facility to do this.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > > ---
> > > > > >  drivers/vhost/vdpa.c | 59 +++++++++++++++++++++++++++++++++++++++++---
> > > > > >  1 file changed, 56 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > > > > > index ba52d128aeb7..28b56b10372b 100644
> > > > > > --- a/drivers/vhost/vdpa.c
> > > > > > +++ b/drivers/vhost/vdpa.c
> > > > > > @@ -49,6 +49,7 @@ struct vhost_vdpa {
> > > > > >       struct completion completion;
> > > > > >       struct vdpa_device *vdpa;
> > > > > >       struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
> > > > > > +     struct vhost_iotlb resv_iotlb;
> > > > > >       struct device dev;
> > > > > >       struct cdev cdev;
> > > > > >       atomic_t opened;
> > > > > > @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
> > > > > >  static int vhost_vdpa_reset(struct vhost_vdpa *v)
> > > > > >  {
> > > > > >       v->in_batch = 0;
> > > > > > +     vhost_iotlb_reset(&v->resv_iotlb);
> > > > > >       return _compat_vdpa_reset(v);
> > > > > >  }
> > > > > >
> > > > > > @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> > > > > >           msg->iova + msg->size - 1 > v->range.last)
> > > > > >               return -EINVAL;
> > > > > >
> > > > > > +     if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
> > > > > > +                                     msg->iova + msg->size - 1))
> > > > > > +             return -EINVAL;
> > > > > > +
> > > > > >       if (vhost_iotlb_itree_first(iotlb, msg->iova,
> > > > > >                                   msg->iova + msg->size - 1))
> > > > > >               return -EEXIST;
> > > > > >
> > > > > > +
> > > > > >       if (vdpa->use_va)
> > > > > >               return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
> > > > > >                                        msg->uaddr, msg->perm);
> > > > > > @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
> > > > > >       return vhost_chr_write_iter(dev, from);
> > > > > >  }
> > > > > >
> > > > > > +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
> > > > > > +     struct vhost_iotlb *resv_iotlb)
> > > > > > +{
> > > > > > +     struct list_head dev_resv_regions;
> > > > > > +     phys_addr_t resv_msi_base = 0;
> > > > > > +     struct iommu_resv_region *region;
> > > > > > +     int ret = 0;
> > > > > > +     bool with_sw_msi = false;
> > > > > > +     bool with_hw_msi = false;
> > > > > > +
> > > > > > +     INIT_LIST_HEAD(&dev_resv_regions);
> > > > > > +     iommu_get_resv_regions(dma_dev, &dev_resv_regions);
> > > > > > +
> > > > > > +     list_for_each_entry(region, &dev_resv_regions, list) {
> > > > > > +             ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
> > > > > > +                             region->start + region->length - 1,
> > > > > > +                             0, 0, NULL);
> > > > > > +             if (ret) {
> > > > > > +                     vhost_iotlb_reset(resv_iotlb);
> > > > > > +                     break;
> > > > > > +             }
> > > > > > +
> > > > > > +             if (region->type == IOMMU_RESV_MSI)
> > > > > > +                     with_hw_msi = true;
> > > > > > +
> > > > > > +             if (region->type == IOMMU_RESV_SW_MSI) {
> > > > > > +                     resv_msi_base = region->start;
> > > > > > +                     with_sw_msi = true;
> > > > > > +             }
> > > > > > +     }
> > > > > > +
> > > > > > +     if (!ret && !with_hw_msi && with_sw_msi)
> > > > > > +             ret = iommu_get_msi_cookie(domain, resv_msi_base);
> > > > > > +
> > > > > > +     iommu_put_resv_regions(dma_dev, &dev_resv_regions);
> > > > > > +
> > > > > > +     return ret;
> > > > > > +}
> > > > > > +
> > > > > >  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > > > >  {
> > > > > >       struct vdpa_device *vdpa = v->vdpa;
> > > > > > @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
> > > > > >
> > > > > >       ret = iommu_attach_device(v->domain, dma_dev);
> > > > > >       if (ret)
> > > > > > -             goto err_attach;
> > > > > > +             goto err_alloc_domain;
> > > > > >
> > > > > > -     return 0;
> > > > > > +     ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
> > > > > > +     if (ret)
> > > > > > +             goto err_attach_device;
> > > > > >
> > > > > > -err_attach:
> > > > > > +     return 0;
> > > > > > +err_attach_device:
> > > > > > +     iommu_detach_device(v->domain, dma_dev);
> > > > > > +err_alloc_domain:
> > > > > >       iommu_domain_free(v->domain);
> > > > > >       v->domain = NULL;
> > > > > >       return ret;
> > > > > > @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
> > > > > >               goto err;
> > > > > >       }
> > > > > >
> > > > > > +     vhost_iotlb_init(&v->resv_iotlb, 0, 0);
> > > > > > +
> > > > > >       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
> > > > > >       if (r)
> > > > > >               goto err;
> > > > > > --
> > > > > > 2.27.0
> > > > > >
> > > > >
> > >
>
Jason Wang April 8, 2024, 6:37 a.m. UTC | #13
On Wed, Apr 3, 2024 at 10:47 AM tab <w_angrong@163.com> wrote:
>
> > >
> > > On Fri, Mar 29, 2024 at 11:55:50AM +0800, Jason Wang wrote:
> > > > On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > > > From: Rong Wang <w_angrong@163.com>
> > > > > > >
> > > > > > > Once enable iommu domain for one device, the MSI
> > > > > > > translation tables have to be there for software-managed MSI.
> > > > > > > Otherwise, platform with software-managed MSI without an
> > > > > > > irq bypass function, can not get a correct memory write event
> > > > > > > from pcie, will not get irqs.
> > > > > > > The solution is to obtain the MSI phy base address from
> > > > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > > > then translation tables will be created while request irq.
> > > > > > >
> > > > > > > Change log
> > > > > > > ----------
> > > > > > >
> > > > > > > v1->v2:
> > > > > > > - add resv iotlb to avoid overlap mapping.
> > > > > > > v2->v3:
> > > > > > > - there is no need to export the iommu symbol anymore.
> > > > > > >
> > > > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > > > >
> > > > > > There's no interest to keep extending vhost iotlb -
> > > > > > we should just switch over to iommufd which supports
> > > > > > this already.
> > > > >
> > > > > IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> > > > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > > > environment. I think it's worth.
> > > > >
> > > > > If you worry about the extension, we can just use the vhost iotlb
> > > > > existing facility to do this.
> > > > >
> > > > > Thanks
> > > >
> > > > Btw, Wang Rong,
> > > >
> > > > It looks that Cindy does have the bandwidth in working for IOMMUFD support.
> > >
> > > I think you mean she does not.
> >
> > Yes, you are right.
> >
> > Thanks
>
> I need to discuss internally, and there may be someone else who will do that.
>
> Thanks.

Ok, please let us know if you have a conclusion.

Thanks

>
> >
> > >
> > > > Do you have the will to do that?
> > > >
> > > > Thanks
> > >
>
>
>
>
> --
> 发自我的网易邮箱平板适配版
>
>
> ----- Original Message -----
> From: "Jason Wang" <jasowang@redhat.com>
> To: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: "Wang Rong" <w_angrong@163.com>, kvm@vger.kernel.org, virtualization@lists.linux.dev, netdev@vger.kernel.org, linux-kernel@vger.kernel.org, "Cindy Lu" <lulu@redhat.com>
> Sent: Fri, 29 Mar 2024 18:39:54 +0800
> Subject: Re: [PATCH v3] vhost/vdpa: Add MSI translation tables to iommu for software-managed MSI
>
> On Fri, Mar 29, 2024 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Mar 29, 2024 at 11:55:50AM +0800, Jason Wang wrote:
> > > On Wed, Mar 27, 2024 at 5:08 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Thu, Mar 21, 2024 at 3:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:
> > > > > > From: Rong Wang <w_angrong@163.com>
> > > > > >
> > > > > > Once enable iommu domain for one device, the MSI
> > > > > > translation tables have to be there for software-managed MSI.
> > > > > > Otherwise, platform with software-managed MSI without an
> > > > > > irq bypass function, can not get a correct memory write event
> > > > > > from pcie, will not get irqs.
> > > > > > The solution is to obtain the MSI phy base address from
> > > > > > iommu reserved region, and set it to iommu MSI cookie,
> > > > > > then translation tables will be created while request irq.
> > > > > >
> > > > > > Change log
> > > > > > ----------
> > > > > >
> > > > > > v1->v2:
> > > > > > - add resv iotlb to avoid overlap mapping.
> > > > > > v2->v3:
> > > > > > - there is no need to export the iommu symbol anymore.
> > > > > >
> > > > > > Signed-off-by: Rong Wang <w_angrong@163.com>
> > > > >
> > > > > There's no interest to keep extending vhost iotlb -
> > > > > we should just switch over to iommufd which supports
> > > > > this already.
> > > >
> > > > IOMMUFD is good but VFIO supports this before IOMMUFD. This patch
> > > > makes vDPA run without a backporting of full IOMMUFD in the production
> > > > environment. I think it's worth.
> > > >
> > > > If you worry about the extension, we can just use the vhost iotlb
> > > > existing facility to do this.
> > > >
> > > > Thanks
> > >
> > > Btw, Wang Rong,
> > >
> > > It looks that Cindy does have the bandwidth in working for IOMMUFD support.
> >
> > I think you mean she does not.
>
> Yes, you are right.
>
> Thanks
>
> >
> > > Do you have the will to do that?
> > >
> > > Thanks
> >
diff mbox series

Patch

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ba52d128aeb7..28b56b10372b 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -49,6 +49,7 @@  struct vhost_vdpa {
 	struct completion completion;
 	struct vdpa_device *vdpa;
 	struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
+	struct vhost_iotlb resv_iotlb;
 	struct device dev;
 	struct cdev cdev;
 	atomic_t opened;
@@ -247,6 +248,7 @@  static int _compat_vdpa_reset(struct vhost_vdpa *v)
 static int vhost_vdpa_reset(struct vhost_vdpa *v)
 {
 	v->in_batch = 0;
+	vhost_iotlb_reset(&v->resv_iotlb);
 	return _compat_vdpa_reset(v);
 }
 
@@ -1219,10 +1221,15 @@  static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
 	    msg->iova + msg->size - 1 > v->range.last)
 		return -EINVAL;
 
+	if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
+					msg->iova + msg->size - 1))
+		return -EINVAL;
+
 	if (vhost_iotlb_itree_first(iotlb, msg->iova,
 				    msg->iova + msg->size - 1))
 		return -EEXIST;
 
+
 	if (vdpa->use_va)
 		return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
 					 msg->uaddr, msg->perm);
@@ -1307,6 +1314,45 @@  static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
 	return vhost_chr_write_iter(dev, from);
 }
 
+static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev,
+	struct vhost_iotlb *resv_iotlb)
+{
+	struct list_head dev_resv_regions;
+	phys_addr_t resv_msi_base = 0;
+	struct iommu_resv_region *region;
+	int ret = 0;
+	bool with_sw_msi = false;
+	bool with_hw_msi = false;
+
+	INIT_LIST_HEAD(&dev_resv_regions);
+	iommu_get_resv_regions(dma_dev, &dev_resv_regions);
+
+	list_for_each_entry(region, &dev_resv_regions, list) {
+		ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
+				region->start + region->length - 1,
+				0, 0, NULL);
+		if (ret) {
+			vhost_iotlb_reset(resv_iotlb);
+			break;
+		}
+
+		if (region->type == IOMMU_RESV_MSI)
+			with_hw_msi = true;
+
+		if (region->type == IOMMU_RESV_SW_MSI) {
+			resv_msi_base = region->start;
+			with_sw_msi = true;
+		}
+	}
+
+	if (!ret && !with_hw_msi && with_sw_msi)
+		ret = iommu_get_msi_cookie(domain, resv_msi_base);
+
+	iommu_put_resv_regions(dma_dev, &dev_resv_regions);
+
+	return ret;
+}
+
 static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
 {
 	struct vdpa_device *vdpa = v->vdpa;
@@ -1335,11 +1381,16 @@  static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
 
 	ret = iommu_attach_device(v->domain, dma_dev);
 	if (ret)
-		goto err_attach;
+		goto err_alloc_domain;
 
-	return 0;
+	ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
+	if (ret)
+		goto err_attach_device;
 
-err_attach:
+	return 0;
+err_attach_device:
+	iommu_detach_device(v->domain, dma_dev);
+err_alloc_domain:
 	iommu_domain_free(v->domain);
 	v->domain = NULL;
 	return ret;
@@ -1595,6 +1646,8 @@  static int vhost_vdpa_probe(struct vdpa_device *vdpa)
 		goto err;
 	}
 
+	vhost_iotlb_init(&v->resv_iotlb, 0, 0);
+
 	r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
 	if (r)
 		goto err;