
[v2,1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

Message ID 20200115035303.12362-1-yan.y.zhao@intel.com (mailing list archive)
State New, archived
Series: use vfio_dma_rw to read/write IOVAs from CPU side

Commit Message

Yan Zhao Jan. 15, 2020, 3:53 a.m. UTC
vfio_dma_rw will read/write a range of user space memory pointed to by
IOVA into/from a kernel buffer without pinning the user space memory.

TODO: mark the user space memory ranges pointed to by the IOVAs dirty if
they are written in vfio_dma_rw().

Cc: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 drivers/vfio/vfio.c             | 45 +++++++++++++++++++
 drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
 include/linux/vfio.h            |  5 +++
 3 files changed, 126 insertions(+)
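
As a rough usage sketch (hypothetical caller code, not part of this series),
a vendor driver that knows the IOVA of a guest buffer can read it into a
kernel buffer via the CPU:

#include <linux/device.h>
#include <linux/types.h>
#include <linux/vfio.h>

/* Hypothetical mdev vendor-driver helper: fetch an 8-byte guest
 * descriptor at desc_iova into a kernel buffer.  write=false selects
 * a read (guest memory -> kernel buffer); nothing is pinned.
 */
static int fetch_guest_desc(struct device *dev, dma_addr_t desc_iova,
			    u64 *desc)
{
	return vfio_dma_rw(dev, desc_iova, desc, sizeof(*desc), false);
}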

Comments

Alex Williamson Jan. 15, 2020, 8:06 p.m. UTC | #1
On Tue, 14 Jan 2020 22:53:03 -0500
Yan Zhao <yan.y.zhao@intel.com> wrote:

> vfio_dma_rw will read/write a range of user space memory pointed to by
> IOVA into/from a kernel buffer without pinning the user space memory.
> 
> TODO: mark the user space memory ranges pointed to by the IOVAs dirty if
> they are written in vfio_dma_rw().
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
> [...]
> +static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
> +					    dma_addr_t iova, void *data,
> +					    size_t count, bool write)

"_nopin"?  It might be pinned, but that's irrelevant to this interface.
Maybe "_chunk" as we're only trying to operate on the chunk of the whole
that fits within the next vfio_dma?  Also swapping rw_dma vs dma_rw,
pick one.

> +{
> +	struct mm_struct *mm;
> +	unsigned long vaddr;
> +	struct vfio_dma *dma;
> +	bool kthread = current->mm == NULL;
> +	size_t done = 0;
> +	size_t offset;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +	if (!dma)
> +		return 0;
> +
> +	if (write && !(dma->prot & IOMMU_WRITE))
> +		return 0;

Good catch, but users can also designate a mapping without read
permissions, in which case this interface should not allow read either.
Thanks,

Alex
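
For illustration only, a minimal sketch of the combined check being
suggested here, keeping the patch's return-0-on-denial convention (the
exact form is up to the author):

	if ((write && !(dma->prot & IOMMU_WRITE)) ||
	    (!write && !(dma->prot & IOMMU_READ)))
		return 0;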

Mika Penttilä Jan. 16, 2020, 2:30 a.m. UTC | #2
On 15.1.2020 22.06, Alex Williamson wrote:
> On Tue, 14 Jan 2020 22:53:03 -0500
> Yan Zhao <yan.y.zhao@intel.com> wrote:
>
>> vfio_dma_rw will read/write a range of user space memory pointed to by
>> IOVA into/from a kernel buffer without pinning the user space memory.
>> [...]

Do you know that the iova given to vfio_dma_rw() is indeed a GPA and not
an iova from an iommu mapping? So aren't you actually assuming that all of
the guest memory is pinned, like from device assignment?

Or who adds the vfio mapping, and how, before the vfio_dma_rw()?

Thanks,
Mika
Alex Williamson Jan. 16, 2020, 2:59 a.m. UTC | #3
On Thu, 16 Jan 2020 02:30:52 +0000
Mika Penttilä <mika.penttila@nextfour.com> wrote:

> On 15.1.2020 22.06, Alex Williamson wrote:
> > [...]
> 
> Do you know that the iova given to vfio_dma_rw() is indeed a GPA and not
> an iova from an iommu mapping? So aren't you actually assuming that all of
> the guest memory is pinned, like from device assignment?
> 
> Or who adds the vfio mapping, and how, before the vfio_dma_rw()?

vfio only knows about IOVAs, not GPAs.  It's possible that IOVAs are
identity mapped to the GPA space, but a VM with a vIOMMU would quickly
break any such assumption.  Pinning is also not required.  This access
is via the CPU, not the I/O device, so we don't require the memory to
be pinned and it potentially won't be for a non-IOMMU backed mediated
device.  The intention here is that via the mediation of an mdev
device, a vendor driver would already know IOVA ranges for the device
to access via the guest driver programming of the device.  Thanks,

Alex
Mika Penttilä Jan. 16, 2020, 3:15 a.m. UTC | #4
On 16.1.2020 4.59, Alex Williamson wrote:
> On Thu, 16 Jan 2020 02:30:52 +0000
> Mika Penttilä <mika.penttila@nextfour.com> wrote:
>
>> [...]
>> Do you know that the iova given to vfio_dma_rw() is indeed a GPA and not
>> an iova from an iommu mapping? So aren't you actually assuming that all of
>> the guest memory is pinned, like from device assignment?
>>
>> Or who adds the vfio mapping, and how, before the vfio_dma_rw()?
> vfio only knows about IOVAs, not GPAs.  It's possible that IOVAs are
> identity mapped to the GPA space, but a VM with a vIOMMU would quickly
> break any such assumption.  Pinning is also not required.  This access
> is via the CPU, not the I/O device, so we don't require the memory to
> be pinned and it potentially won't be for a non-IOMMU backed mediated
> device.  The intention here is that via the mediation of an mdev
> device, a vendor driver would already know IOVA ranges for the device
> to access via the guest driver programming of the device.  Thanks,
>
> Alex

Thanks Alex... you mean the IOVA is, in the iommu case, already an
iommu-translated address mapping to a user space VA in VM host space?
How does it get hold of that? What piece of mediation is responsible
for this?

--Mika
Alex Williamson Jan. 16, 2020, 3:58 a.m. UTC | #5
On Thu, 16 Jan 2020 03:15:58 +0000
Mika Penttilä <mika.penttila@nextfour.com> wrote:

> On 16.1.2020 4.59, Alex Williamson wrote:
> > [...]
> 
> Thanks Alex... you mean the IOVA is, in the iommu case, already an
> iommu-translated address mapping to a user space VA in VM host space?

The user (QEMU in the case of device assignment) performs ioctls to map
user VAs to IOVAs for the device.  With IOMMU backing, the VAs are
pinned to get HPAs and the IOVA-to-HPA mappings are programmed into the
IOMMU.  Thus the device accesses the IOVA to get to the HPA, which is
the backing for the VA.  In this case we're simply using the IOVA to
look up the VA and access it with the CPU directly.  The IOMMU isn't
involved, but we're still performing an access as if we were the device
doing a DMA.  Let me know if that doesn't answer your question.
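
For a concrete (and purely illustrative) picture, the userspace side that
establishes such an IOVA-to-VA mapping through the type1 ioctl looks
roughly like this; the IOVA and size values are placeholders and error
handling is omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void map_guest_buffer(int container_fd, void *buf)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(uintptr_t)buf,	/* process VA backing the range */
		.iova  = 0x100000,		/* device-visible IOVA (placeholder) */
		.size  = 0x10000,		/* placeholder length */
	};

	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}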

> How does it get hold of that? What piece of mediation is responsible
> for this?

It's device specific.  The mdev vendor driver is mediating a specific
hardware device where user accesses to MMIO on the device configure
DMA targets.  The mediation needs to trap those accesses in order to
pin pages and program the real hardware with real physical addresses (be
they HPAs or host IOVAs depending on the host IOMMU config) to perform
those DMAs.  For cases where the CPU might choose to perform some sort
of virtual DMA on behalf of the device itself, this interface would be
used.  Thanks,

Alex
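
As an illustrative sketch of that last case (hypothetical vendor-driver
code; the type and function names are made up), a mediated device that
needs to post a completion record into a guest buffer can perform the
"DMA" with the CPU through the new interface:

#include <linux/device.h>
#include <linux/types.h>
#include <linux/vfio.h>

struct my_completion {		/* made-up device-specific record */
	u32 status;
	u32 flags;
};

/* Hypothetical: write a completion record into the guest buffer at
 * status_iova on behalf of the emulated device.  write=true copies
 * from the kernel buffer into the IOVA-mapped guest memory.
 */
static int post_completion(struct device *dev, dma_addr_t status_iova,
			   struct my_completion *c)
{
	return vfio_dma_rw(dev, status_iova, c, sizeof(*c), true);
}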
Yan Zhao Jan. 16, 2020, 5:32 a.m. UTC | #6
On Thu, Jan 16, 2020 at 04:06:38AM +0800, Alex Williamson wrote:
> On Tue, 14 Jan 2020 22:53:03 -0500
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > [...]
> > +static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
> > +					    dma_addr_t iova, void *data,
> > +					    size_t count, bool write)
> 
> "_nopin"?  It might be pinned, but that's irrelevant to this interface.
> Maybe "_chunk" as we're only trying to operate on the chunk of the whole
> that fits within the next vfio_dma?  Also swapping rw_dma vs dma_rw,
> pick one.
>
got it!

> > +{
> > +	struct mm_struct *mm;
> > +	unsigned long vaddr;
> > +	struct vfio_dma *dma;
> > +	bool kthread = current->mm == NULL;
> > +	size_t done = 0;
> > +	size_t offset;
> > +
> > +	dma = vfio_find_dma(iommu, iova, 1);
> > +	if (!dma)
> > +		return 0;
> > +
> > +	if (write && !(dma->prot & IOMMU_WRITE))
> > +		return 0;
> 
> Good catch, but users can also designate a mapping without read
> permissions, in which case this interface should not allow read either.
> Thanks,
>
yes, will add it too. thanks :)

Yan


Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c8482624ca34..8bd52bc841cf 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
+/*
+ * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
+ * buffer without pinning the user space memory
+ * @dev [in]  : device
+ * @iova [in] : base IOVA of a user space buffer
+ * @data [in] : pointer to kernel buffer
+ * @len [in]  : kernel buffer length
+ * @write     : indicate read or write
+ * Return error code on failure or 0 on success.
+ */
+int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
+		   size_t len, bool write)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	int ret = 0;
+
+	if (!dev || !data || len <= 0)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (!group)
+		return -ENODEV;
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto out;
+
+	container = group->container;
+	driver = container->iommu_driver;
+
+	if (likely(driver && driver->ops->dma_rw))
+		ret = driver->ops->dma_rw(container->iommu_data,
+					   iova, data, len, write);
+	else
+		ret = -ENOTTY;
+
+	vfio_group_try_dissolve_container(group);
+out:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_dma_rw);
+
 static int vfio_register_iommu_notifier(struct vfio_group *group,
 					unsigned long *events,
 					struct notifier_block *nb)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ada8e6cdb88..a2d850b97ae6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -27,6 +27,7 @@ 
 #include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/mmu_context.h>
 #include <linux/rbtree.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
@@ -2326,6 +2327,80 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
 	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
 }
 
+static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
+					    dma_addr_t iova, void *data,
+					    size_t count, bool write)
+{
+	struct mm_struct *mm;
+	unsigned long vaddr;
+	struct vfio_dma *dma;
+	bool kthread = current->mm == NULL;
+	size_t done = 0;
+	size_t offset;
+
+	dma = vfio_find_dma(iommu, iova, 1);
+	if (!dma)
+		return 0;
+
+	if (write && !(dma->prot & IOMMU_WRITE))
+		return 0;
+
+	mm = get_task_mm(dma->task);
+
+	if (!mm)
+		return 0;
+
+	if (kthread)
+		use_mm(mm);
+	else if (current->mm != mm)
+		goto out;
+
+	offset = iova - dma->iova;
+
+	if (count > dma->size - offset)
+		count = dma->size - offset;
+
+	vaddr = dma->vaddr + offset;
+
+	if (write)
+		done = __copy_to_user((void __user *)vaddr, data, count) ?
+				       0 : count;
+	else
+		done = __copy_from_user(data, (void __user *)vaddr, count) ?
+					0 : count;
+
+	if (kthread)
+		unuse_mm(mm);
+out:
+	mmput(mm);
+	return done;
+}
+
+static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t iova,
+				   void *data, size_t count, bool write)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int ret = 0;
+	size_t done = 0;
+
+	mutex_lock(&iommu->lock);
+	while (count > 0) {
+		done = vfio_iommu_type1_rw_dma_nopin(iommu, iova, data,
+						   count, write);
+		if (!done) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= done;
+		data += done;
+		iova += done;
+	}
+
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.name			= "vfio-iommu-type1",
 	.owner			= THIS_MODULE,
@@ -2338,6 +2413,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
 	.register_notifier	= vfio_iommu_type1_register_notifier,
 	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
+	.dma_rw			= vfio_iommu_type1_dma_rw,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e42a711a2800..962f76a2d668 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -82,6 +82,8 @@ struct vfio_iommu_driver_ops {
 					     struct notifier_block *nb);
 	int		(*unregister_notifier)(void *iommu_data,
 					       struct notifier_block *nb);
+	int		(*dma_rw)(void *iommu_data, dma_addr_t iova,
+				   void *data, size_t count, bool write);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -107,6 +109,9 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
 extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
 			    int npage);
 
+extern int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
+		       size_t len, bool write);
+
 /* each type has independent events */
 enum vfio_notify_type {
 	VFIO_IOMMU_NOTIFY = 0,