diff mbox

[RFC,30/30] vfio: Allow to bind foreign task

Message ID 20170227195441.5170-31-jean-philippe.brucker@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jean-Philippe Brucker Feb. 27, 2017, 7:54 p.m. UTC
Let the process that owns the device create an address space bond on
behalf of another process. We add a pid argument to the BIND_TASK ioctl,
allowing the caller to bind a foreign task. The expected program flow in
this case is:

* Process A creates the VFIO context and initializes the device.
* Process B asks A to bind its address space.
* Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
  It may communicate the given PASID back to process B or keep track of it
  internally.
* Process B asks A to perform transactions on its virtual address.
* Process A launches transaction tagged with the given PASID.

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
 drivers/vfio/vfio.c       | 35 +++++++++++++++++++++++++++++++++--
 include/uapi/linux/vfio.h | 15 +++++++++++++++
 2 files changed, 48 insertions(+), 2 deletions(-)

Comments

Alex Williamson Feb. 28, 2017, 3:54 a.m. UTC | #1
On Mon, 27 Feb 2017 19:54:41 +0000
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:

> Let the process that owns the device create an address space bond on
> behalf of another process. We add a pid argument to the BIND_TASK ioctl,
> allowing the caller to bind a foreign task. The expected program flow in
> this case is:
> 
> * Process A creates the VFIO context and initializes the device.
> * Process B asks A to bind its address space.
> * Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
>   It may communicate the given PASID back to process B or keep track of it
>   internally.
> * Process B asks A to perform transactions on its virtual address.
> * Process A launches transaction tagged with the given PASID.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
>  drivers/vfio/vfio.c       | 35 +++++++++++++++++++++++++++++++++--
>  include/uapi/linux/vfio.h | 15 +++++++++++++++
>  2 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c4505d8f4c61..ecc5d07e3dbb 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -26,6 +26,7 @@
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/pci.h>
> +#include <linux/ptrace.h>
>  #include <linux/rwsem.h>
>  #include <linux/sched.h>
>  #include <linux/slab.h>
> @@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
>  	struct vfio_device_svm svm;
>  	struct vfio_task *vfio_task;
>  
> -	minsz = offsetofend(struct vfio_device_svm, pasid);
> +	minsz = offsetofend(struct vfio_device_svm, pid);


This is only the minsz if flags includes VFIO_SVM_PID, right?
Otherwise this isn't a backward compatible change (granted you're
proposing both in the same series), userspace built against 29/30
won't work against 30/30.

>  
>  	if (copy_from_user(&svm, (void __user *)arg, minsz))
>  		return -EFAULT;
> @@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
>  		return -EINVAL;
>  
>  	if (cmd == VFIO_DEVICE_BIND_TASK) {
> -		struct task_struct *task = current;
> +		struct mm_struct *mm;
> +		struct task_struct *task;
> +
> +		if (svm.flags & ~VFIO_SVM_PID)
> +			return -EINVAL;

29/30 never validated flags, so theoretically userspace compiled
against 29/30 could have put anything in flags and it would have
worked, no longer the case here.

> +
> +		if (svm.flags & VFIO_SVM_PID) {
> +			rcu_read_lock();
> +			task = find_task_by_vpid(svm.pid);
> +			if (task)
> +				get_task_struct(task);
> +			rcu_read_unlock();
> +			if (!task)
> +				return -ESRCH;
> +
> +			/*
> +			 * Ensure process has RW access on the task's mm
> +			 * FIXME:
> +			 * - I think this ought to be in the IOMMU API
> +			 * - I'm assuming permission is never revoked during the
> +			 *   task's lifetime. Might be mistaken.
> +			 */
> +			mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> +			if (!mm || IS_ERR(mm))
> +				return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> +			mmput(mm);
> +		} else {
> +			get_task_struct(current);
> +			task = current;
> +		}
>  
>  		ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> +		put_task_struct(task);
>  		if (ret)
>  			return ret;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 3fe4197a5ea0..41ae8a231d42 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -415,7 +415,9 @@ struct vfio_device_svm {
>  	__u32	flags;
>  #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
>  #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
> +#define VFIO_SVM_PID			(1 << 2)
>  	__u32	pasid;
> +	__u32	pid;
>  };
>  /*
>   * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> @@ -432,6 +434,19 @@ struct vfio_device_svm {
>   * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
>   * ID is unique to a device.
>   *
> + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> + *        space identified by @pasid is that of task identified by @pid.
> + *
> + *        Given that the caller owns the device, setting this flag grants the
> + *        caller read and write permissions on the entire address space of
> + *        foreign task described by @pid. Therefore, permission to perform the
> + *        bind operation on a foreign process is governed by the ptrace access
> + *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for more
> + *        information.
> + *
> + *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> + *        current task that is bound to the device.
> + *
>   * The bond between device and process must be removed with
>   * VFIO_DEVICE_UNBIND_TASK before exiting.
>   *

BTW, nice commit logs throughout this series, I probably need to read
through them a few more times to really digest it all.  AIUI, the VFIO
support here is really only useful for basic userspace drivers, I don't
see how we could take advantage of it for a VM use case where the guest
manages the PASID space for a domain.  Perhaps it hasn't spent enough
cycles bouncing around in my head yet.  Thanks,

Alex
Tian, Kevin Feb. 28, 2017, 6:43 a.m. UTC | #2
> From: Alex Williamson
> Sent: Tuesday, February 28, 2017 11:54 AM
> 
> On Mon, 27 Feb 2017 19:54:41 +0000
> Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> 
[...]
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 3fe4197a5ea0..41ae8a231d42 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -415,7 +415,9 @@ struct vfio_device_svm {
> >  	__u32	flags;
> >  #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
> >  #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
> > +#define VFIO_SVM_PID			(1 << 2)
> >  	__u32	pasid;
> > +	__u32	pid;
> >  };
> >  /*
> >   * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> > @@ -432,6 +434,19 @@ struct vfio_device_svm {
> >   * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> >   * ID is unique to a device.
> >   *
> > + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> > + *        space identified by @pasid is that of task identified by @pid.
> > + *
> > + *        Given that the caller owns the device, setting this flag grants the
> > + *        caller read and write permissions on the entire address space of
> > + *        foreign task described by @pid. Therefore, permission to perform the
> > + *        bind operation on a foreign process is governed by the ptrace access
> > + *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for
> more
> > + *        information.
> > + *
> > + *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> > + *        current task that is bound to the device.
> > + *
> >   * The bond between device and process must be removed with
> >   * VFIO_DEVICE_UNBIND_TASK before exiting.
> >   *
> 
> BTW, nice commit logs throughout this series, I probably need to read
> through them a few more times to really digest it all.  AIUI, the VFIO
> support here is really only useful for basic userspace drivers, I don't
> see how we could take advantage of it for a VM use case where the guest
> manages the PASID space for a domain.  Perhaps it hasn't spent enough
> cycles bouncing around in my head yet.  Thanks,
> 

Current definition doesn't work with virtualization usage, at least on Intel
VT-d. To enable virtualized SVM within a VM, architecturally VT-d needs
be in a nested mode - go through guest PASID table to find guest CR3, 
use guest CR3 as 1st level translation for GVA->GPA and then use 2nd 
level translation for GPA->HPA. PASID table is fully allocated/managed
by VM. Within the translation process each guest pointer (PASID or 1st 
level paging structures) is treated as GPA which also goes through 2nd 
level translation. I didn't read ARM SMMU spec yet, but hope the basic 
mechanism stays similar.

Here we need an API which allows Qemu vIOMMU to bind guest PASID
table pointer and enable nested mode for target device in underlying 
IOMMU hardware, while proposed API is only for user space driver 
regarding to binding a specific host address space.

Based on above requirement difference, Alex, do you prefer to 
introducing one API covering both usages or separate APIs for their
own purposes?

btw Yi is working on a SVM virtualization prototype based on Intel 
VT-d. I hope soon he will send out a RFC so we can align the high
level API requirement better. :-)

Thanks
Kevin
Jean-Philippe Brucker Feb. 28, 2017, 3:22 p.m. UTC | #3
Hi Kevin,

On Tue, Feb 28, 2017 at 06:43:31AM +0000, Tian, Kevin wrote:
> > From: Alex Williamson
> > Sent: Tuesday, February 28, 2017 11:54 AM
> > 
> > On Mon, 27 Feb 2017 19:54:41 +0000
> > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> > 
> [...]
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index 3fe4197a5ea0..41ae8a231d42 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -415,7 +415,9 @@ struct vfio_device_svm {
> > >  	__u32	flags;
> > >  #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
> > >  #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
> > > +#define VFIO_SVM_PID			(1 << 2)
> > >  	__u32	pasid;
> > > +	__u32	pid;
> > >  };
> > >  /*
> > >   * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> > > @@ -432,6 +434,19 @@ struct vfio_device_svm {
> > >   * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> > >   * ID is unique to a device.
> > >   *
> > > + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> > > + *        space identified by @pasid is that of task identified by @pid.
> > > + *
> > > + *        Given that the caller owns the device, setting this flag grants the
> > > + *        caller read and write permissions on the entire address space of
> > > + *        foreign task described by @pid. Therefore, permission to perform the
> > > + *        bind operation on a foreign process is governed by the ptrace access
> > > + *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for
> > more
> > > + *        information.
> > > + *
> > > + *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> > > + *        current task that is bound to the device.
> > > + *
> > >   * The bond between device and process must be removed with
> > >   * VFIO_DEVICE_UNBIND_TASK before exiting.
> > >   *
> > 
> > BTW, nice commit logs throughout this series, I probably need to read
> > through them a few more times to really digest it all.  AIUI, the VFIO
> > support here is really only useful for basic userspace drivers, I don't
> > see how we could take advantage of it for a VM use case where the guest
> > manages the PASID space for a domain.  Perhaps it hasn't spent enough
> > cycles bouncing around in my head yet.  Thanks,
> > 
> 
> Current definition doesn't work with virtualization usage, at least on Intel
> VT-d. To enable virtualized SVM within a VM, architecturally VT-d needs
> be in a nested mode - go through guest PASID table to find guest CR3, 
> use guest CR3 as 1st level translation for GVA->GPA and then use 2nd 
> level translation for GPA->HPA. PASID table is fully allocated/managed
> by VM. Within the translation process each guest pointer (PASID or 1st 
> level paging structures) is treated as GPA which also goes through 2nd 
> level translation. I didn't read ARM SMMU spec yet, but hope the basic 
> mechanism stays similar.

If I understand correctly, it is very similar on ARM SMMU, where we have
two stages of translation. Stage-1 is GVA->GPA and stage-2 is GPA->HPA,
with all intermediate tables of stage-1 translation obtained via stage-2
as well. The SMMU holds stage-1 paging structure in the PASID tables.
 
> Here we need an API which allows Qemu vIOMMU to bind guest PASID
> table pointer and enable nested mode for target device in underlying 
> IOMMU hardware, while proposed API is only for user space driver 
> regarding to binding a specific host address space.
> 
> Based on above requirement difference, Alex, do you prefer to 
> introducing one API covering both usages or separate APIs for their
> own purposes?
> 
> btw Yi is working on a SVM virtualization prototype based on Intel 
> VT-d. I hope soon he will send out a RFC so we can align the high
> level API requirement better. :-)

For IO virtualization on ARM, I'm currently working on a generic
para-virtualized IOMMU, where the IOMMU presented to the VM is different
from the hardware SMMU (I'll try not to go into details here, to avoid
derailing the discussion too much). For virtual SVM, the PASID table
format would be different between vIOMMU and pIOMMU, but the page table
formats would be the same as the MMU.

The VFIO interface for this would therefore have to be more fine-grained
than passing the whole PASID table. And could be implemented by
extending the interface proposed here.

User passes an opaque architecture-specific structure containing page
table format and pgd via the BIND_TASK VFIO ioctl. And the pIOMMU can
manage its own PASID tables, pointing to VM page tables. I was thinking
of letting the physical IOMMU handle PASID allocation and return it to
the VM via BIND_TASK instead of letting the guest do it, but that's more
of an implementation detail.

When talking about SVM virtualization, there also is the case where the
VMM wants to avoid pinning all of the guest RAM prior to assigning
devices to a VM. In short, stage-2 SVM, where a device fault is handled
by KVM to map GPA->HPA. I think the interface presented in this patch
could also be reused, but there wouldn't be a lot of overlapping. The
PASID wouldn't be used, and we'd need to pass an eventfd or another
mechanism that allows KVM or the VMM to handle faults. This makes me
more confident that the name "VFIO_IOMMU_SVM_BIND" might be more
suitable than "VFIO_IOMMU_BIND_TASK".

To summarize, I think that this API can be reused when implementing a
para-virtualized IOMMU. But for the "full" virtualization case, a
somewhat orthogonal API would be needed. The fault reporting
infrastructure would most likely be common. So I don't think that this
proposal will collide with the SVM virtualization work for VT-d.

Thanks,
Jean-Philippe
Tian, Kevin March 1, 2017, 8:02 a.m. UTC | #4
> From: Jean-Philippe Brucker [mailto:Jean-Philippe.Brucker@arm.com]
> Sent: Tuesday, February 28, 2017 11:23 PM
> 
> Hi Kevin,
> 
> On Tue, Feb 28, 2017 at 06:43:31AM +0000, Tian, Kevin wrote:
> > > From: Alex Williamson
> > > Sent: Tuesday, February 28, 2017 11:54 AM
> > >
> > > On Mon, 27 Feb 2017 19:54:41 +0000
> > > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> > >
> > [...]
> > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > index 3fe4197a5ea0..41ae8a231d42 100644
> > > > --- a/include/uapi/linux/vfio.h
> > > > +++ b/include/uapi/linux/vfio.h
> > > > @@ -415,7 +415,9 @@ struct vfio_device_svm {
> > > >  	__u32	flags;
> > > >  #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
> > > >  #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
> > > > +#define VFIO_SVM_PID			(1 << 2)
> > > >  	__u32	pasid;
> > > > +	__u32	pid;
> > > >  };
> > > >  /*
> > > >   * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> > > > @@ -432,6 +434,19 @@ struct vfio_device_svm {
> > > >   * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> > > >   * ID is unique to a device.
> > > >   *
> > > > + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> > > > + *        space identified by @pasid is that of task identified by @pid.
> > > > + *
> > > > + *        Given that the caller owns the device, setting this flag grants the
> > > > + *        caller read and write permissions on the entire address space of
> > > > + *        foreign task described by @pid. Therefore, permission to perform the
> > > > + *        bind operation on a foreign process is governed by the ptrace access
> > > > + *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
> for
> > > more
> > > > + *        information.
> > > > + *
> > > > + *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> > > > + *        current task that is bound to the device.
> > > > + *
> > > >   * The bond between device and process must be removed with
> > > >   * VFIO_DEVICE_UNBIND_TASK before exiting.
> > > >   *
> > >
> > > BTW, nice commit logs throughout this series, I probably need to read
> > > through them a few more times to really digest it all.  AIUI, the VFIO
> > > support here is really only useful for basic userspace drivers, I don't
> > > see how we could take advantage of it for a VM use case where the guest
> > > manages the PASID space for a domain.  Perhaps it hasn't spent enough
> > > cycles bouncing around in my head yet.  Thanks,
> > >
> >
> > Current definition doesn't work with virtualization usage, at least on Intel
> > VT-d. To enable virtualized SVM within a VM, architecturally VT-d needs
> > be in a nested mode - go through guest PASID table to find guest CR3,
> > use guest CR3 as 1st level translation for GVA->GPA and then use 2nd
> > level translation for GPA->HPA. PASID table is fully allocated/managed
> > by VM. Within the translation process each guest pointer (PASID or 1st
> > level paging structures) is treated as GPA which also goes through 2nd
> > level translation. I didn't read ARM SMMU spec yet, but hope the basic
> > mechanism stays similar.
> 
> If I understand correctly, it is very similar on ARM SMMU, where we have
> two stages of translation. Stage-1 is GVA->GPA and stage-2 is GPA->HPA,
> with all intermediate tables of stage-1 translation obtained via stage-2
> as well. The SMMU holds stage-1 paging structure in the PASID tables.

Good to know. :-)

> 
> > Here we need an API which allows Qemu vIOMMU to bind guest PASID
> > table pointer and enable nested mode for target device in underlying
> > IOMMU hardware, while proposed API is only for user space driver
> > regarding to binding a specific host address space.
> >
> > Based on above requirement difference, Alex, do you prefer to
> > introducing one API covering both usages or separate APIs for their
> > own purposes?
> >
> > btw Yi is working on a SVM virtualization prototype based on Intel
> > VT-d. I hope soon he will send out a RFC so we can align the high
> > level API requirement better. :-)
> 
> For IO virtualization on ARM, I'm currently working on a generic
> para-virtualized IOMMU, where the IOMMU presented to the VM is different
> from the hardware SMMU (I'll try not to go into details here, to avoid
> derailing the discussion too much). For virtual SVM, the PASID table
> format would be different between vIOMMU and pIOMMU, but the page table
> formats would be the same as the MMU.

When you say 'generic para-virtualized IOMMU', does 'generic' apply
to ARM only (cross different ARM SMMU versions), or apply to other
vendors (e.g. Intel, AMD, etc.)? Just want to touch base your high
level idea here. 

> 
> The VFIO interface for this would therefore have to be more fine-grained
> than passing the whole PASID table. And could be implemented by
> extending the interface proposed here.
> 
> User passes an opaque architecture-specific structure containing page
> table format and pgd via the BIND_TASK VFIO ioctl. And the pIOMMU can
> manage its own PASID tables, pointing to VM page tables. I was thinking
> of letting the physical IOMMU handle PASID allocation and return it to
> the VM via BIND_TASK instead of letting the guest do it, but that's more
> of an implementation detail.

I can see some value of doing this way... anyway not distract this thread.
Let's discuss detail when you send out that RFC in future thread.

> 
> When talking about SVM virtualization, there also is the case where the
> VMM wants to avoid pinning all of the guest RAM prior to assigning
> devices to a VM. In short, stage-2 SVM, where a device fault is handled
> by KVM to map GPA->HPA. I think the interface presented in this patch
> could also be reused, but there wouldn't be a lot of overlapping. The
> PASID wouldn't be used, and we'd need to pass an eventfd or another
> mechanism that allows KVM or the VMM to handle faults. This makes me
> more confident that the name "VFIO_IOMMU_SVM_BIND" might be more
> suitable than "VFIO_IOMMU_BIND_TASK".

yes SVM_BIND sounds more general.

> 
> To summarize, I think that this API can be reused when implementing a
> para-virtualized IOMMU. But for the "full" virtualization case, a
> somewhat orthogonal API would be needed. The fault reporting
> infrastructure would most likely be common. So I don't think that this
> proposal will collide with the SVM virtualization work for VT-d.
> 

Thanks for sharing your thought. Even for 'full' virtualization e.g. in 
our case, we may also reuse the same API if you would like to go
with new name, which is generic enough to cover all potential usages
with sub-ops defined to differentiate (bind to host process, bind to 
guest process, bind to guest PASID table, etc).

Thanks
Kevin
Jean-Philippe Brucker March 2, 2017, 10:50 a.m. UTC | #5
On Wed, Mar 01, 2017 at 08:02:09AM +0000, Tian, Kevin wrote:
> > From: Jean-Philippe Brucker [mailto:Jean-Philippe.Brucker@arm.com]
> > Sent: Tuesday, February 28, 2017 11:23 PM
> > 
> > Hi Kevin,
> > 
> > On Tue, Feb 28, 2017 at 06:43:31AM +0000, Tian, Kevin wrote:
> > > > From: Alex Williamson
> > > > Sent: Tuesday, February 28, 2017 11:54 AM
> > > >
> > > > On Mon, 27 Feb 2017 19:54:41 +0000
> > > > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> > > >
> > > [...]
> > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > > index 3fe4197a5ea0..41ae8a231d42 100644
> > > > > --- a/include/uapi/linux/vfio.h
> > > > > +++ b/include/uapi/linux/vfio.h
> > > > > @@ -415,7 +415,9 @@ struct vfio_device_svm {
> > > > >  	__u32	flags;
> > > > >  #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
> > > > >  #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
> > > > > +#define VFIO_SVM_PID			(1 << 2)
> > > > >  	__u32	pasid;
> > > > > +	__u32	pid;
> > > > >  };
> > > > >  /*
> > > > >   * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> > > > > @@ -432,6 +434,19 @@ struct vfio_device_svm {
> > > > >   * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> > > > >   * ID is unique to a device.
> > > > >   *
> > > > > + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> > > > > + *        space identified by @pasid is that of task identified by @pid.
> > > > > + *
> > > > > + *        Given that the caller owns the device, setting this flag grants the
> > > > > + *        caller read and write permissions on the entire address space of
> > > > > + *        foreign task described by @pid. Therefore, permission to perform the
> > > > > + *        bind operation on a foreign process is governed by the ptrace access
> > > > > + *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
> > for
> > > > more
> > > > > + *        information.
> > > > > + *
> > > > > + *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> > > > > + *        current task that is bound to the device.
> > > > > + *
> > > > >   * The bond between device and process must be removed with
> > > > >   * VFIO_DEVICE_UNBIND_TASK before exiting.
> > > > >   *
> > > >
> > > > BTW, nice commit logs throughout this series, I probably need to read
> > > > through them a few more times to really digest it all.  AIUI, the VFIO
> > > > support here is really only useful for basic userspace drivers, I don't
> > > > see how we could take advantage of it for a VM use case where the guest
> > > > manages the PASID space for a domain.  Perhaps it hasn't spent enough
> > > > cycles bouncing around in my head yet.  Thanks,
> > > >
> > >
> > > Current definition doesn't work with virtualization usage, at least on Intel
> > > VT-d. To enable virtualized SVM within a VM, architecturally VT-d needs
> > > be in a nested mode - go through guest PASID table to find guest CR3,
> > > use guest CR3 as 1st level translation for GVA->GPA and then use 2nd
> > > level translation for GPA->HPA. PASID table is fully allocated/managed
> > > by VM. Within the translation process each guest pointer (PASID or 1st
> > > level paging structures) is treated as GPA which also goes through 2nd
> > > level translation. I didn't read ARM SMMU spec yet, but hope the basic
> > > mechanism stays similar.
> > 
> > If I understand correctly, it is very similar on ARM SMMU, where we have
> > two stages of translation. Stage-1 is GVA->GPA and stage-2 is GPA->HPA,
> > with all intermediate tables of stage-1 translation obtained via stage-2
> > as well. The SMMU holds stage-1 paging structure in the PASID tables.
> 
> Good to know. :-)
> 
> > 
> > > Here we need an API which allows Qemu vIOMMU to bind guest PASID
> > > table pointer and enable nested mode for target device in underlying
> > > IOMMU hardware, while proposed API is only for user space driver
> > > regarding to binding a specific host address space.
> > >
> > > Based on above requirement difference, Alex, do you prefer to
> > > introducing one API covering both usages or separate APIs for their
> > > own purposes?
> > >
> > > btw Yi is working on a SVM virtualization prototype based on Intel
> > > VT-d. I hope soon he will send out a RFC so we can align the high
> > > level API requirement better. :-)
> > 
> > For IO virtualization on ARM, I'm currently working on a generic
> > para-virtualized IOMMU, where the IOMMU presented to the VM is different
> > from the hardware SMMU (I'll try not to go into details here, to avoid
> > derailing the discussion too much). For virtual SVM, the PASID table
> > format would be different between vIOMMU and pIOMMU, but the page table
> > formats would be the same as the MMU.
> 
> When you say 'generic para-virtualized IOMMU', does 'generic' apply
> to ARM only (cross different ARM SMMU versions), or apply to other
> vendors (e.g. Intel, AMD, etc.)? Just want to touch base your high
> level idea here. 

It wouldn't apply to ARM only, we're trying to avoid any dependency on
architecture or vendor.

> > 
> > The VFIO interface for this would therefore have to be more fine-grained
> > than passing the whole PASID table. And could be implemented by
> > extending the interface proposed here.
> > 
> > User passes an opaque architecture-specific structure containing page
> > table format and pgd via the BIND_TASK VFIO ioctl. And the pIOMMU can
> > manage its own PASID tables, pointing to VM page tables. I was thinking
> > of letting the physical IOMMU handle PASID allocation and return it to
> > the VM via BIND_TASK instead of letting the guest do it, but that's more
> > of an implementation detail.
> 
> I can see some value of doing this way... anyway not distract this thread.
> Let's discuss detail when you send out that RFC in future thread.
> 
> > 
> > When talking about SVM virtualization, there also is the case where the
> > VMM wants to avoid pinning all of the guest RAM prior to assigning
> > devices to a VM. In short, stage-2 SVM, where a device fault is handled
> > by KVM to map GPA->HPA. I think the interface presented in this patch
> > could also be reused, but there wouldn't be a lot of overlapping. The
> > PASID wouldn't be used, and we'd need to pass an eventfd or another
> > mechanism that allows KVM or the VMM to handle faults. This makes me
> > more confident that the name "VFIO_IOMMU_SVM_BIND" might be more
> > suitable than "VFIO_IOMMU_BIND_TASK".
> 
> yes SVM_BIND sounds more general.
> 
> > 
> > To summarize, I think that this API can be reused when implementing a
> > para-virtualized IOMMU. But for the "full" virtualization case, a
> > somewhat orthogonal API would be needed. The fault reporting
> > infrastructure would most likely be common. So I don't think that this
> > proposal will collide with the SVM virtualization work for VT-d.
> > 
> 
> Thanks for sharing your thought. Even for 'full' virtualization e.g. in 
> our case, we may also reuse the same API if you would like to go
> with new name, which is generic enough to cover all potential usages
> with sub-ops defined to differentiate (bind to host process, bind to 
> guest process, bind to guest PASID table, etc).

Yes I am keen on using a common API, so I'm looking forward to your
SVM virtualization RFC as well.

Thanks,
Jean-Philippe
Tomasz Nowicki April 26, 2017, 7:25 a.m. UTC | #6
On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
> Let the process that owns the device create an address space bond on
> behalf of another process. We add a pid argument to the BIND_TASK ioctl,
> allowing the caller to bind a foreign task. The expected program flow in
> this case is:
>
> * Process A creates the VFIO context and initializes the device.
> * Process B asks A to bind its address space.
> * Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
>   It may communicate the given PASID back to process B or keep track of it
>   internally.
> * Process B asks A to perform transactions on its virtual address.
> * Process A launches transaction tagged with the given PASID.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
>  drivers/vfio/vfio.c       | 35 +++++++++++++++++++++++++++++++++--
>  include/uapi/linux/vfio.h | 15 +++++++++++++++
>  2 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c4505d8f4c61..ecc5d07e3dbb 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -26,6 +26,7 @@
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/pci.h>
> +#include <linux/ptrace.h>
>  #include <linux/rwsem.h>
>  #include <linux/sched.h>
>  #include <linux/slab.h>
> @@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
>  	struct vfio_device_svm svm;
>  	struct vfio_task *vfio_task;
>
> -	minsz = offsetofend(struct vfio_device_svm, pasid);
> +	minsz = offsetofend(struct vfio_device_svm, pid);
>
>  	if (copy_from_user(&svm, (void __user *)arg, minsz))
>  		return -EFAULT;
> @@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
>  		return -EINVAL;
>
>  	if (cmd == VFIO_DEVICE_BIND_TASK) {
> -		struct task_struct *task = current;
> +		struct mm_struct *mm;
> +		struct task_struct *task;
> +
> +		if (svm.flags & ~VFIO_SVM_PID)
> +			return -EINVAL;
> +
> +		if (svm.flags & VFIO_SVM_PID) {
> +			rcu_read_lock();
> +			task = find_task_by_vpid(svm.pid);
> +			if (task)
> +				get_task_struct(task);
> +			rcu_read_unlock();
> +			if (!task)
> +				return -ESRCH;
> +
> +			/*
> +			 * Ensure process has RW access on the task's mm
> +			 * FIXME:
> +			 * - I think this ought to be in the IOMMU API
> +			 * - I'm assuming permission is never revoked during the
> +			 *   task's lifetime. Might be mistaken.
> +			 */
> +			mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> +			if (!mm || IS_ERR(mm))

I know this is RFC patch but considering we will keep this as is, we 
need here:
+put_task_struct(task);


> +				return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> +			mmput(mm);
> +		} else {
> +			get_task_struct(current);
> +			task = current;
> +		}
>
>  		ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> +		put_task_struct(task);
>  		if (ret)
>  			return ret;
>

Thanks,
Tomasz
Jean-Philippe Brucker April 26, 2017, 10:08 a.m. UTC | #7
Hi Tomasz,

Thanks for looking at this.

On 26/04/17 08:25, Tomasz Nowicki wrote:
> On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
>> Let the process that owns the device create an address space bond on
>> behalf of another process. We add a pid argument to the BIND_TASK ioctl,
>> allowing the caller to bind a foreign task. The expected program flow in
>> this case is:
>>
>> * Process A creates the VFIO context and initializes the device.
>> * Process B asks A to bind its address space.
>> * Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
>>   It may communicate the given PASID back to process B or keep track of it
>>   internally.
>> * Process B asks A to perform transactions on its virtual address.
>> * Process A launches transaction tagged with the given PASID.
>>
>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>> ---
>>  drivers/vfio/vfio.c       | 35 +++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/vfio.h | 15 +++++++++++++++
>>  2 files changed, 48 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index c4505d8f4c61..ecc5d07e3dbb 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -26,6 +26,7 @@
>>  #include <linux/module.h>
>>  #include <linux/mutex.h>
>>  #include <linux/pci.h>
>> +#include <linux/ptrace.h>
>>  #include <linux/rwsem.h>
>>  #include <linux/sched.h>
>>  #include <linux/slab.h>
>> @@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device
>> *device, unsigned int cmd,
>>      struct vfio_device_svm svm;
>>      struct vfio_task *vfio_task;
>>
>> -    minsz = offsetofend(struct vfio_device_svm, pasid);
>> +    minsz = offsetofend(struct vfio_device_svm, pid);
>>
>>      if (copy_from_user(&svm, (void __user *)arg, minsz))
>>          return -EFAULT;
>> @@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device
>> *device, unsigned int cmd,
>>          return -EINVAL;
>>
>>      if (cmd == VFIO_DEVICE_BIND_TASK) {
>> -        struct task_struct *task = current;
>> +        struct mm_struct *mm;
>> +        struct task_struct *task;
>> +
>> +        if (svm.flags & ~VFIO_SVM_PID)
>> +            return -EINVAL;
>> +
>> +        if (svm.flags & VFIO_SVM_PID) {
>> +            rcu_read_lock();
>> +            task = find_task_by_vpid(svm.pid);
>> +            if (task)
>> +                get_task_struct(task);
>> +            rcu_read_unlock();
>> +            if (!task)
>> +                return -ESRCH;
>> +
>> +            /*
>> +             * Ensure process has RW access on the task's mm
>> +             * FIXME:
>> +             * - I think this ought to be in the IOMMU API
>> +             * - I'm assuming permission is never revoked during the
>> +             *   task's lifetime. Might be mistaken.
>> +             */
>> +            mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
>> +            if (!mm || IS_ERR(mm))
> 
> I know this is RFC patch but considering we will keep this as is, we need
> here:
> +put_task_struct(task);

Indeed. I considerably reworked the VFIO patches for next version, but
this bug was still in there.

Thanks,
Jean-Philippe
diff mbox

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c4505d8f4c61..ecc5d07e3dbb 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -26,6 +26,7 @@ 
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/pci.h>
+#include <linux/ptrace.h>
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -1660,7 +1661,7 @@  static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
 	struct vfio_device_svm svm;
 	struct vfio_task *vfio_task;
 
-	minsz = offsetofend(struct vfio_device_svm, pasid);
+	minsz = offsetofend(struct vfio_device_svm, pid);
 
 	if (copy_from_user(&svm, (void __user *)arg, minsz))
 		return -EFAULT;
@@ -1669,9 +1670,39 @@  static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
 		return -EINVAL;
 
 	if (cmd == VFIO_DEVICE_BIND_TASK) {
-		struct task_struct *task = current;
+		struct mm_struct *mm;
+		struct task_struct *task;
+
+		if (svm.flags & ~VFIO_SVM_PID)
+			return -EINVAL;
+
+		if (svm.flags & VFIO_SVM_PID) {
+			rcu_read_lock();
+			task = find_task_by_vpid(svm.pid);
+			if (task)
+				get_task_struct(task);
+			rcu_read_unlock();
+			if (!task)
+				return -ESRCH;
+
+			/*
+			 * Ensure process has RW access on the task's mm
+			 * FIXME:
+			 * - I think this ought to be in the IOMMU API
+			 * - I'm assuming permission is never revoked during the
+			 *   task's lifetime. Might be mistaken.
+			 */
+			mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+			if (!mm || IS_ERR(mm))
+				return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+			mmput(mm);
+		} else {
+			get_task_struct(current);
+			task = current;
+		}
 
 		ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
+		put_task_struct(task);
 		if (ret)
 			return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 3fe4197a5ea0..41ae8a231d42 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -415,7 +415,9 @@  struct vfio_device_svm {
 	__u32	flags;
 #define VFIO_SVM_PASID_RELEASE_FLUSHED	(1 << 0)
 #define VFIO_SVM_PASID_RELEASE_CLEAN	(1 << 1)
+#define VFIO_SVM_PID			(1 << 2)
 	__u32	pasid;
+	__u32	pid;
 };
 /*
  * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
@@ -432,6 +434,19 @@  struct vfio_device_svm {
  * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
  * ID is unique to a device.
  *
+ * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
+ *        space identified by @pasid is that of task identified by @pid.
+ *
+ *        Given that the caller owns the device, setting this flag grants the
+ *        caller read and write permissions on the entire address space of
+ *        foreign task described by @pid. Therefore, permission to perform the
+ *        bind operation on a foreign process is governed by the ptrace access
+ *        mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for more
+ *        information.
+ *
+ *        If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
+ *        current task that is bound to the device.
+ *
  * The bond between device and process must be removed with
  * VFIO_DEVICE_UNBIND_TASK before exiting.
  *