
[RFC,10/10] vfio/type1: Register device notifier

Message ID 161401275279.16443.6350471385325897377.stgit@gimli.home (mailing list archive)
State New, archived
Series vfio: Device memory DMA mapping improvements

Commit Message

Alex Williamson Feb. 22, 2021, 4:52 p.m. UTC
Introduce a new default strict MMIO mapping mode where the vma for
a VM_PFNMAP mapping must be backed by a vfio device.  This allows
holding a reference to the device and registering a notifier for the
device, which additionally keeps the device in an IOMMU context for
the extent of the DMA mapping.  On notification of device release,
automatically drop the DMA mappings for it.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_type1.c |  124 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe Feb. 22, 2021, 5:55 p.m. UTC | #1
On Mon, Feb 22, 2021 at 09:52:32AM -0700, Alex Williamson wrote:
> Introduce a new default strict MMIO mapping mode where the vma for
> a VM_PFNMAP mapping must be backed by a vfio device.  This allows
> holding a reference to the device and registering a notifier for the
> device, which additionally keeps the device in an IOMMU context for
> the extent of the DMA mapping.  On notification of device release,
> automatically drop the DMA mappings for it.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>  drivers/vfio/vfio_iommu_type1.c |  124 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index b34ee4b96a4a..2a16257bd5b6 100644
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -61,6 +61,11 @@ module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
>  MODULE_PARM_DESC(dma_entry_limit,
>  		 "Maximum number of user DMA mappings per container (65535).");
>  
> +static bool strict_mmio_maps = true;
> +module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
> +MODULE_PARM_DESC(strict_mmio_maps,
> +		 "Restrict to safe DMA mappings of device memory (true).");

I think this should be a kconfig, historically we've required kconfig
to opt-in to unsafe things that could violate kernel security. Someone
building a secure boot trusted kernel system should not have an
option for userspace to just turn off protections.
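
For illustration, a minimal sketch of gating the opt-out behind a Kconfig
symbol; CONFIG_VFIO_IOMMU_TYPE1_UNSAFE_MMIO is a hypothetical name, not
something proposed in this series:

#ifdef CONFIG_VFIO_IOMMU_TYPE1_UNSAFE_MMIO
static bool strict_mmio_maps = true;
module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
MODULE_PARM_DESC(strict_mmio_maps,
		 "Restrict to safe DMA mappings of device memory (true).");
#else
/* Without the Kconfig opt-in, strict mode is unconditional. */
#define strict_mmio_maps true
#endif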

> +/* Req separate object for async removal from notifier vs dropping vfio_dma */
> +struct pfnmap_obj {
> +	struct notifier_block	nb;
> +	struct work_struct	work;
> +	struct vfio_iommu	*iommu;
> +	struct vfio_device	*device;
> +};

So this is basically the dmabuf, I think it would be simple enough to
go in here and change it down the road if someone had interest.

> +static void unregister_device_bg(struct work_struct *work)
> +{
> +	struct pfnmap_obj *pfnmap = container_of(work, struct pfnmap_obj, work);
> +
> +	vfio_device_unregister_notifier(pfnmap->device, &pfnmap->nb);
> +	vfio_device_put(pfnmap->device);

The device_put keeps the device from becoming unregistered, but what
happens during the hot reset case? Is this what the cover letter
was talking about? CPU access is revoked but P2P is still possible?

> +static int vfio_device_nb_cb(struct notifier_block *nb,
> +			     unsigned long action, void *unused)
> +{
> +	struct pfnmap_obj *pfnmap = container_of(nb, struct pfnmap_obj, nb);
> +
> +	switch (action) {
> +	case VFIO_DEVICE_RELEASE:
> +	{
> +		struct vfio_dma *dma, *dma_last = NULL;
> +		int retries = 0;
> +again:
> +		mutex_lock(&pfnmap->iommu->lock);
> +		dma = pfnmap_find_dma(pfnmap);

Feels a bit strange that the vfio_dma isn't linked to the pfnmap_obj
instead of searching the entire list?

> @@ -549,8 +625,48 @@ static int vaddr_get_pfn(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  		if (ret == -EAGAIN)
>  			goto retry;

I'd prefer this was written a bit differently, I would like it very
much if this doesn't mis-use follow_pte() by returning pfn outside
the lock.

vaddr_get_bar_pfn(..)
{
        vma = find_vma_intersection(mm, vaddr, vaddr + 1);
	if (!vma)
           return -ENOENT;
        if ((vma->vm_flags & VM_DENYWRITE) && (prot & PROT_WRITE)) // Check me
           return -EFAULT;
        device = vfio_device_get_from_vma(vma);
	if (!device)
           return -ENOENT;

	/*
         * Now do the same as vfio_pci_mmap_fault() - the vm_pgoff must
	 * be the physical pfn when using this mechanism. Delete follow_pte() entirely
         */
        pfn = (vaddr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff
	
        /* de-dup device and record that we are using device's pages in the
	   pfnmap */
        ...
}

This would be significantly better if it could do whole ranges instead
of page at a time.

Jason
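
For reference, a hedged sketch of the range-at-a-time variant suggested
above; the helper name and the vm_pgoff-is-PFN assumption are illustrative
only and not part of the posted patch:

/* Resolve a whole user range backed by a single VFIO VMA to a base PFN. */
static int vaddr_get_bar_pfns(struct mm_struct *mm, unsigned long vaddr,
			      unsigned long npages, int prot,
			      unsigned long *pfn_base,
			      struct vfio_device **devp)
{
	struct vm_area_struct *vma;
	struct vfio_device *device;

	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
	if (!vma || vma->vm_end < vaddr + (npages << PAGE_SHIFT))
		return -ENOENT;		/* range must fit within one VMA */

	if ((prot & IOMMU_WRITE) && !(vma->vm_flags & VM_WRITE))
		return -EFAULT;

	device = vfio_device_get_from_vma(vma);
	if (IS_ERR(device))
		return PTR_ERR(device);

	/* As in vfio_pci_mmap_fault(): vm_pgoff is assumed to be the PFN. */
	*pfn_base = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	*devp = device;			/* caller de-dups and records it */
	return 0;
}
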
Alex Williamson Feb. 24, 2021, 9:55 p.m. UTC | #2
On Mon, 22 Feb 2021 13:55:23 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Feb 22, 2021 at 09:52:32AM -0700, Alex Williamson wrote:
> > Introduce a new default strict MMIO mapping mode where the vma for
> > a VM_PFNMAP mapping must be backed by a vfio device.  This allows
> > holding a reference to the device and registering a notifier for the
> > device, which additionally keeps the device in an IOMMU context for
> > the extent of the DMA mapping.  On notification of device release,
> > automatically drop the DMA mappings for it.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> >  drivers/vfio/vfio_iommu_type1.c |  124 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 123 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index b34ee4b96a4a..2a16257bd5b6 100644
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -61,6 +61,11 @@ module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
> >  MODULE_PARM_DESC(dma_entry_limit,
> >  		 "Maximum number of user DMA mappings per container (65535).");
> >  
> > +static bool strict_mmio_maps = true;
> > +module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
> > +MODULE_PARM_DESC(strict_mmio_maps,
> > +		 "Restrict to safe DMA mappings of device memory (true).");  
> 
> I think this should be a kconfig, historically we've required kconfig
> to opt-in to unsafe things that could violate kernel security. Someone
> building a secure boot trusted kernel system should not have an
> option for userspace to just turn off protections.

It could certainly be further protected that this option might not
exist based on a Kconfig, but I think we're already risking breaking
some existing users and I'd rather allow it with an opt-in (like we
already do for lack of interrupt isolation), possibly even with a
kernel taint if used, if necessary.
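
A rough sketch of the taint idea, on top of the module parameter from the
patch (the choice of taint flag is illustrative):

	/* On first use of the unsafe mode, mark the kernel as tainted. */
	if (!strict_mmio_maps) {
		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
		pr_warn_once("vfio: unsafe MMIO DMA mappings enabled\n");
	}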

> > +/* Req separate object for async removal from notifier vs dropping vfio_dma */
> > +struct pfnmap_obj {
> > +	struct notifier_block	nb;
> > +	struct work_struct	work;
> > +	struct vfio_iommu	*iommu;
> > +	struct vfio_device	*device;
> > +};  
> 
> So this is basically the dmabuf, I think it would be simple enough to
> go in here and change it down the road if someone had interest.
> 
> > +static void unregister_device_bg(struct work_struct *work)
> > +{
> > +	struct pfnmap_obj *pfnmap = container_of(work, struct pfnmap_obj, work);
> > +
> > +	vfio_device_unregister_notifier(pfnmap->device, &pfnmap->nb);
> > +	vfio_device_put(pfnmap->device);  
> 
> The device_put keeps the device from becoming unregistered, but what
> happens during the hot reset case? Is this what the cover letter
> was talking about? CPU access is revoked but P2P is still possible?

Yes, this only addresses cutting off DMA mappings to a released device
where we can safely drop the DMA mapping entirely.  I think we can
consider complete removal of the mapping object safe in this case
because the user has no ongoing access to the device and after
re-opening the device it would be reasonable to expect mappings would
need to be re-established.

Doing the same around disabling device memory or a reset has a much
greater potential to break userspace.  In some of these cases QEMU will
do the right thing, but reset with dependent devices gets into
scenarios that I can't be sure about.  Other userspace drivers also
exist and I can't verify them all.  So ideally we'd want to temporarily
remove the IOMMU mapping, leaving the mapping object in place, and restore it
on the other side.  But I don't think we have a way to do that in the
IOMMU API currently, other than unmap and later remap.  So we might
need to support a zombie mode for the mapping object or enhance the
IOMMU API where we could leave the iotlb entry in place but drop r+w
permissions.

At this point we're also just trying to define which error we get,
perhaps an unsupported request if we do nothing or an IOMMU fault if we
invalidate or suspend the mapping.  It's not guaranteed that one of
these has better behavior on the system than the other.
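
A minimal sketch of the "unmap and later remap" fallback mentioned above;
the helper names are illustrative, and today's IOMMU API has no way to
drop r+w permissions on an existing mapping in place:

/* Suspend: pull the IOTLB entries but keep the vfio_dma object around. */
static void vfio_dma_suspend(struct iommu_domain *domain, struct vfio_dma *dma)
{
	iommu_unmap(domain, dma->iova, dma->size);
}

/* Resume: re-install the translation once the device memory is valid again. */
static int vfio_dma_resume(struct iommu_domain *domain, struct vfio_dma *dma,
			   phys_addr_t paddr)
{
	/* domain prot bits (e.g. IOMMU_CACHE) omitted for brevity */
	return iommu_map(domain, dma->iova, paddr, dma->size, dma->prot);
}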
 
> > +static int vfio_device_nb_cb(struct notifier_block *nb,
> > +			     unsigned long action, void *unused)
> > +{
> > +	struct pfnmap_obj *pfnmap = container_of(nb, struct pfnmap_obj, nb);
> > +
> > +	switch (action) {
> > +	case VFIO_DEVICE_RELEASE:
> > +	{
> > +		struct vfio_dma *dma, *dma_last = NULL;
> > +		int retries = 0;
> > +again:
> > +		mutex_lock(&pfnmap->iommu->lock);
> > +		dma = pfnmap_find_dma(pfnmap);  
> 
> Feels a bit strange that the vfio_dma isn't linked to the pfnmap_obj
> instead of searching the entire list?

It does, I've been paranoid about whether we can trust that the
vfio_dma is still valid in all cases.  I had myself convinced that if
our notifier actions expand we could get another callback before our
workqueue removes the notifier, but that might be more simply handled
by having a vfio_dma pointer that gets cleared once we remove the
vfio_dma.  I'll take another look.
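
A sketch of that simplification, assuming pfnmap_obj grows a back-pointer
that is cleared under iommu->lock when the vfio_dma goes away (the extra
field is hypothetical, not in the posted patch):

struct pfnmap_obj {
	struct notifier_block	nb;
	struct work_struct	work;
	struct vfio_iommu	*iommu;
	struct vfio_device	*device;
	struct vfio_dma		*dma;	/* valid while the mapping exists */
};

The release notifier then no longer needs to walk dma_list:

	mutex_lock(&pfnmap->iommu->lock);
	if (pfnmap->dma) {
		/* pinned-page retry handling (unmap_dma_pfn_list) omitted */
		pfnmap->dma->pfnmap = NULL;
		vfio_remove_dma(pfnmap->iommu, pfnmap->dma);
		pfnmap->dma = NULL;
	}
	mutex_unlock(&pfnmap->iommu->lock);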


> > @@ -549,8 +625,48 @@ static int vaddr_get_pfn(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >  		if (ret == -EAGAIN)
> >  			goto retry;  
> 
> I'd prefer this was written a bit differently, I would like it very
> much if this doesn't mis-use follow_pte() by returning pfn outside
> the lock.
> 
> vaddr_get_bar_pfn(..)
> {
>         vma = find_vma_intersection(mm, vaddr, vaddr + 1);
> 	if (!vma)
>            return -ENOENT;
>         if ((vma->vm_flags & VM_DENYWRITE) && (prot & PROT_WRITE)) // Check me
>            return -EFAULT;
>         device = vfio_device_get_from_vma(vma);
> 	if (!device)
>            return -ENOENT;
> 
> 	/*
>          * Now do the same as vfio_pci_mmap_fault() - the vm_pgoff must
> 	 * be the physical pfn when using this mechanism. Delete follow_pte() entirely
>          */
>         pfn = (vaddr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff
> 	
>         /* de-dup device and record that we are using device's pages in the
> 	   pfnmap */
>         ...
> }


This seems to undo both:

5cbf3264bc71 ("vfio/type1: Fix VA->PA translation for PFNMAP VMAs in vaddr_get_pfn()")

(which also suggests we are going to break users without the module
option opt-in above)

And:

41311242221e ("vfio/type1: Support faulting PFNMAP vmas")

So we'd have an alternate path in the un-safe mode and we'd lose the
ability to fault in mappings.

> This would be significantly better if it could do whole ranges instead
> of page at a time.

There are some patches I just put in from Daniel Jordan that use
batching.  Thanks,

Alex
Jason Gunthorpe Feb. 25, 2021, 12:22 a.m. UTC | #3
On Wed, Feb 24, 2021 at 02:55:08PM -0700, Alex Williamson wrote:

> > > +static bool strict_mmio_maps = true;
> > > +module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
> > > +MODULE_PARM_DESC(strict_mmio_maps,
> > > +		 "Restrict to safe DMA mappings of device memory (true).");  
> > 
> > I think this should be a kconfig, historically we've required kconfig
> > to opt-in to unsafe things that could violate kernel security. Someone
> > building a secure boot trusted kernel system should not have an
> > option for userspace to just turn off protections.
> 
> It could certainly be further protected that this option might not
> exist based on a Kconfig, but I think we're already risking breaking
> some existing users and I'd rather allow it with an opt-in (like we
> already do for lack of interrupt isolation), possibly even with a
> kernel taint if used, if necessary.

Makes me nervous, security should not be optional.

> > I'd prefer this was written a bit differently, I would like it very
> > much if this doesn't mis-use follow_pte() by returning pfn outside
> > the lock.
> > 
> > vaddr_get_bar_pfn(..)
> > {
> >         vma = find_vma_intersection(mm, vaddr, vaddr + 1);
> > 	if (!vma)
> >            return -ENOENT;
> >         if ((vma->vm_flags & VM_DENYWRITE) && (prot & PROT_WRITE)) // Check me
> >            return -EFAULT;
> >         device = vfio_device_get_from_vma(vma);
> > 	if (!device)
> >            return -ENOENT;
> > 
> > 	/*
> >          * Now do the same as vfio_pci_mmap_fault() - the vm_pgoff must
> > 	 * be the physical pfn when using this mechanism. Delete follow_pte() entirely
> >          */
> >         pfn = (vaddr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff
> > 	
> >         /* de-dup device and record that we are using device's pages in the
> > 	   pfnmap */
> >         ...
> > }
> 
> 
> This seems to undo both:
> 
> 5cbf3264bc71 ("vfio/type1: Fix VA->PA translation for PFNMAP VMAs in vaddr_get_pfn()")

No, the bug this commit described is fixed by calling
vfio_device_get_from_vma() which excludes all non-VFIO VMAs already.

We can assert that the vm_pgoff is in a specific format because it is
a VFIO owned VMA and must follow the rules to be part of the address
space. See my last email

Here I was suggesting to use the vm_pgoff == PFN rule, but since
you've clarified that doesn't work we'd have to determine the PFN from
the region number through the vfio_device instead.
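
For reference, within vfio-pci the PFN behind a given mmap offset is simple
arithmetic on the region index; type1 would need the vfio_device to expose
something along these lines (sketch only, pdev is not visible to type1):

	/* region index/offset recovered from the vfio mmap offset layout */
	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	/* BAR start in PFNs plus the page offset into the region */
	pfn = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;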

> (which also suggests we are going to break users without the module
> option opt-in above)

Not necessarily, this is complaining that vfio crashes; it doesn't say they
actually needed the IOMMU to work on those VMAs because they are doing
P2P DMA.

I think, if this does break someone, they are on a real fringe and
must have already modified their kernel, so a kconfig is the right
approach. It is pretty hard to get non-GUP'able DMA'able memory into a
process with the stock kernel.

Generally speaking, I think Linus has felt security bug fixes like
this are more on the OK side of things to break fringe users.

> And:
> 
> 41311242221e ("vfio/type1: Support faulting PFNMAP vmas")
> 
> So we'd have an alternate path in the un-safe mode and we'd lose the
> ability to fault in mappings.

As above, we already exclude VMAs that are not from VFIO, and VFIO-sourced
VMAs do not meaningfully implement fault for this use case.  So calling
fixup_user_fault() is pointless.

Peter just did this, so we could ask him what it was for...

I feel pretty strongly that removing the call to follow_pte is
important here. Even if we do cover all the issues with mis-using the
API it just makes a maintenance problem to leave it in.

Jason
Peter Xu Feb. 25, 2021, 5:54 p.m. UTC | #4
On Wed, Feb 24, 2021 at 08:22:16PM -0400, Jason Gunthorpe wrote:
> On Wed, Feb 24, 2021 at 02:55:08PM -0700, Alex Williamson wrote:
> 
> > > > +static bool strict_mmio_maps = true;
> > > > +module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
> > > > +MODULE_PARM_DESC(strict_mmio_maps,
> > > > +		 "Restrict to safe DMA mappings of device memory (true).");  
> > > 
> > > I think this should be a kconfig, historically we've required kconfig
> > > to opt-in to unsafe things that could violate kernel security. Someone
> > > building a secure boot trusted kernel system should not have an
> > > option for userspace to just turn off protections.
> > 
> > It could certainly be further protected that this option might not
> > exist based on a Kconfig, but I think we're already risking breaking
> > some existing users and I'd rather allow it with an opt-in (like we
> > already do for lack of interrupt isolation), possibly even with a
> > kernel taint if used, if necessary.
> 
> Makes me nervous, security should not be optional.
> 
> > > I'd prefer this was written a bit differently, I would like it very
> > > much if this doesn't mis-use follow_pte() by returning pfn outside
> > > the lock.
> > > 
> > > vaddr_get_bar_pfn(..)
> > > {
> > >         vma = find_vma_intersection(mm, vaddr, vaddr + 1);
> > > 	if (!vma)
> > >            return -ENOENT;
> > >         if ((vma->vm_flags & VM_DENYWRITE) && (prot & PROT_WRITE)) // Check me
> > >            return -EFAULT;
> > >         device = vfio_device_get_from_vma(vma);
> > > 	if (!device)
> > >            return -ENOENT;
> > > 
> > > 	/*
> > >          * Now do the same as vfio_pci_mmap_fault() - the vm_pgoff must
> > > 	 * be the physical pfn when using this mechanism. Delete follow_pte() entirely
> > >          */
> > >         pfn = (vaddr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff
> > > 	
> > >         /* de-dup device and record that we are using device's pages in the
> > > 	   pfnmap */
> > >         ...
> > > }
> > 
> > 
> > This seems to undo both:
> > 
> > 5cbf3264bc71 ("vfio/type1: Fix VA->PA translation for PFNMAP VMAs in vaddr_get_pfn()")
> 
> No, the bug this commit described is fixed by calling
> vfio_device_get_from_vma() which excludes all non-VFIO VMAs already.
> 
> We can assert that the vm_pgoff is in a specific format because it is
> a VFIO owned VMA and must follow the rules to be part of the address
> space. See my last email
> 
> Here I was suggesting to use the vm_pgoff == PFN rule, but since
> you've clarified that doesn't work we'd have to determine the PFN from
> the region number through the vfio_device instead.
> 
> > (which also suggests we are going to break users without the module
> > option opt-in above)
> 
> Not necessarily, this is complaining that vfio crashes; it doesn't say they
> actually needed the IOMMU to work on those VMAs because they are doing
> P2P DMA.
> 
> I think, if this does break someone, they are on a real fringe and
> must have already modified their kernel, so a kconfig is the right
> approach. It is pretty hard to get non-GUP'able DMA'able memory into a
> process with the stock kernel.
> 
> Generally speaking, I think Linus has felt security bug fixes like
> this are more on the OK side of things to break fringe users.
> 
> > And:
> > 
> > 41311242221e ("vfio/type1: Support faulting PFNMAP vmas")
> > 
> > So we'd have an alternate path in the un-safe mode and we'd lose the
> > ability to fault in mappings.
> 
> As above, we already exclude VMAs that are not from VFIO, and VFIO-sourced
> VMAs do not meaningfully implement fault for this use case.  So calling
> fixup_user_fault() is pointless.
> 
> Peter just did this, so we could ask him what it was for...
> 
> I feel pretty strongly that removing the call to follow_pte is
> important here. Even if we do cover all the issues with mis-using the
> API it just makes a maintenance problem to leave it in.

I can't say I fully understand the whole rationale behind 5cbf3264bc71, but that
commit still sounds reasonable to me, since I don't see why VFIO cannot do
VFIO_IOMMU_MAP_DMA upon another memory range that's neither anonymous memory
nor vfio mapped MMIO range.  In those cases, vm_pgoff namespace defined by vfio
may not be true anymore, iiuc.

Then, if we keep that follow_pfn() for non-vfio mappings, it also seems very
reasonable to have 41311242221e or something similar, as proposed by Alex, to
make sure a pte is installed before calling it, for either vfio or other vma
providers.

Or does it mean that we don't want to allow VFIO dma to those unknown memory
backends, for some reason?

Thanks,
Jason Gunthorpe Feb. 25, 2021, 6:19 p.m. UTC | #5
On Thu, Feb 25, 2021 at 12:54:57PM -0500, Peter Xu wrote:
 
> I can't say I fully understand the whole rationale behind 5cbf3264bc71, but that
> commit still sounds reasonable to me, since I don't see why VFIO cannot do
> VFIO_IOMMU_MAP_DMA upon another memory range that's neither anonymous memory
> nor vfio mapped MMIO range.

It is not so much it can't, more that it doesn't and doesn't need to.

> In those cases, vm_pgoff namespace defined by vfio may not be true
> anymore, iiuc.

Since this series is proposing linking the VMA to an address_space all
the vm_pgoffs must be in the same namespace

> Or does it mean that we don't want to allow VFIO dma to those unknown memory
> backends, for some reason?

Correct. VFIO can map into the IOMMU PFNs it can get a reference
to. pin_user_pages() works for the majority, special VFIO VMAs cover
the rest, and everything else must be blocked for security.

Jason
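
Sketched as a policy check (illustrative only; in the patch this logic lives
in vaddr_get_pfn(), and the surrounding declarations are assumed):

	mmap_read_lock(mm);

	/* 1) Ordinary memory: pin_user_pages() takes a page reference. */
	if (pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
				  &page, NULL, NULL) == 1) {
		*pfn = page_to_pfn(page);
		ret = 0;
	} else {
		/* 2) VFIO device MMIO: the vfio_device reference is the pin. */
		vma = find_vma_intersection(mm, vaddr, vaddr + 1);
		device = vma ? vfio_device_get_from_vma(vma) : ERR_PTR(-ENOENT);
		if (!IS_ERR(device))
			ret = 0;	/* PFN then derived from the device/vma */
		else
			ret = -EPERM;	/* 3) nothing to reference: block it */
	}

	mmap_read_unlock(mm);
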
Peter Xu Feb. 25, 2021, 7:06 p.m. UTC | #6
On Thu, Feb 25, 2021 at 02:19:45PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 25, 2021 at 12:54:57PM -0500, Peter Xu wrote:
>  
> > I can't say I fully understand the whole rationale behind 5cbf3264bc71, but that
> > commit still sounds reasonable to me, since I don't see why VFIO cannot do
> > VFIO_IOMMU_MAP_DMA upon another memory range that's neither anonymous memory
> > nor vfio mapped MMIO range.
> 
> It is not so much it can't, more that it doesn't and doesn't need to.

OK.

> 
> > In those cases, vm_pgoff namespace defined by vfio may not be true
> > anymore, iiuc.
> 
> Since this series is proposing linking the VMA to an address_space all
> the vm_pgoffs must be in the same namespace

Agreed.  I saw discussions around redefining the vm_pgoff namespace; I can't
say I followed that closely either, but yes, it definitely makes sense to always
use a unified namespace.  Maybe we should even document somewhere how
vm_pgoff is encoded?

> 
> > Or does it mean that we don't want to allow VFIO dma to those unknown memory
> > backends, for some reason?
> 
> Correct. VFIO can map into the IOMMU PFNs it can get a reference
> to. pin_user_pages() works for the majority, special VFIO VMAs cover
> the rest, and everything else must be blocked for security.

If we all agree that the current follow_pfn() should only apply to vfio
internal vmas, then it seems we can drop it indeed, as long as the crash
reported in 5cbf3264b would fail gracefully at e.g. VFIO_IOMMU_MAP_DMA rather
than triggering a kernel warning somehow.

However I'm still confused about why it's more secure - the current process doing
VFIO_IOMMU_MAP_DMA should at least have proper permission for everything to be
set up, including the special vma, right?  Say, if the process can write to
that memory, then shouldn't we also allow it to grant this write permission
to other devices too?

Thanks,
Jason Gunthorpe Feb. 25, 2021, 7:17 p.m. UTC | #7
On Thu, Feb 25, 2021 at 02:06:46PM -0500, Peter Xu wrote:

> Agreed.  I saw discussions around redefining the vm_pgoff namespace; I can't
> say I followed that closely either, but yes, it definitely makes sense to always
> use a unified namespace.  Maybe we should even document somewhere how
> vm_pgoff is encoded?

Yes, it should be described, it is subtle
 
> > Correct. VFIO can map into the IOMMU PFNs it can get a reference
> > to. pin_user_pages() works for the majority, special VFIO VMAs cover
> > the rest, and everything else must be blocked for security.
> 
> If we all agree that the current follow_pfn() should only apply to vfio
> internal vmas, 

I want to remove follow_pfn().  Internal VMAs can deduce the PFN from
the vm_pgoff; they don't need to do a follow.

> then it seems we can drop it indeed, as long as the crash reported
> in 5cbf3264b would fail gracefully at e.g. VFIO_IOMMU_MAP_DMA rather
> than triggering a kernel warning somehow.

Yes, this will just fail the ioctl because pin_user_pages() failed and
the VMA was not VFIO.

> However I'm still confused about why it's more secure - the current process doing
> VFIO_IOMMU_MAP_DMA should at least have proper permission for everything to be
> set up, including the special vma, right?  Say, if the process can write to
> that memory, then shouldn't we also allow it to grant this write permission
> to other devices too?

It is a use-after-free. Once the PFN is programmed into the IOMMU it
becomes completely divorced from the VMA. Remember there is no
pin_user_pages() here, so the PFN has no reference count.

If the owner of the VMA decides to zap it or otherwise reclaim it, the IOMMU
access keeps going - but now the owner thinks the PFN is freed and
nobody is referencing it. Goes bad.

Jason
Peter Xu Feb. 25, 2021, 7:54 p.m. UTC | #8
On Thu, Feb 25, 2021 at 03:17:14PM -0400, Jason Gunthorpe wrote:
> It is a use-after-free. Once the PFN is programmed into the IOMMU it
> becomes completely divorced from the VMA. Remember there is no
> pin_user_pages() here, so the PFN has no reference count.
> 
> If the owner of the VMA decides to zap it or otherwise reclaim it, the IOMMU
> access keeps going - but now the owner thinks the PFN is freed and
> nobody is referencing it. Goes bad.

Sounds reasonable.  Thanks,
Christoph Hellwig Feb. 26, 2021, 5:47 a.m. UTC | #9
On Mon, Feb 22, 2021 at 01:55:23PM -0400, Jason Gunthorpe wrote:
> > +static bool strict_mmio_maps = true;
> > +module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
> > +MODULE_PARM_DESC(strict_mmio_maps,
> > +		 "Restrict to safe DMA mappings of device memory (true).");
> 
> I think this should be a kconfig, historically we've required kconfig
> to opt-in to unsafe things that could violate kernel security. Someone
> building a secure boot trusted kernel system should not have an
> option for userspace to just turn off protections.

Agreed, but I'd go one step further:  Why should we allow the unsafe
mode at all?

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b34ee4b96a4a..2a16257bd5b6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -61,6 +61,11 @@  module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
 MODULE_PARM_DESC(dma_entry_limit,
 		 "Maximum number of user DMA mappings per container (65535).");
 
+static bool strict_mmio_maps = true;
+module_param_named(strict_mmio_maps, strict_mmio_maps, bool, 0644);
+MODULE_PARM_DESC(strict_mmio_maps,
+		 "Restrict to safe DMA mappings of device memory (true).");
+
 struct vfio_iommu {
 	struct list_head	domain_list;
 	struct list_head	iova_list;
@@ -88,6 +93,14 @@  struct vfio_domain {
 	bool			fgsp;		/* Fine-grained super pages */
 };
 
+/* Req separate object for async removal from notifier vs dropping vfio_dma */
+struct pfnmap_obj {
+	struct notifier_block	nb;
+	struct work_struct	work;
+	struct vfio_iommu	*iommu;
+	struct vfio_device	*device;
+};
+
 struct vfio_dma {
 	struct rb_node		node;
 	dma_addr_t		iova;		/* Device address */
@@ -100,6 +113,7 @@  struct vfio_dma {
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 	unsigned long		*bitmap;
+	struct pfnmap_obj	*pfnmap;
 };
 
 struct vfio_group {
@@ -517,6 +531,68 @@  static int unmap_dma_pfn_list(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	return 0;
 }
 
+static void unregister_device_bg(struct work_struct *work)
+{
+	struct pfnmap_obj *pfnmap = container_of(work, struct pfnmap_obj, work);
+
+	vfio_device_unregister_notifier(pfnmap->device, &pfnmap->nb);
+	vfio_device_put(pfnmap->device);
+	kfree(pfnmap);
+}
+
+/*
+ * pfnmap object can exist beyond the dma mapping referencing it, but it holds
+ * a container reference assuring the iommu exists.  Find the dma, if exists.
+ */
+struct vfio_dma *pfnmap_find_dma(struct pfnmap_obj *pfnmap)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&pfnmap->iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		if (dma->pfnmap == pfnmap)
+			return dma;
+	}
+
+	return NULL;
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma);
+
+static int vfio_device_nb_cb(struct notifier_block *nb,
+			     unsigned long action, void *unused)
+{
+	struct pfnmap_obj *pfnmap = container_of(nb, struct pfnmap_obj, nb);
+
+	switch (action) {
+	case VFIO_DEVICE_RELEASE:
+	{
+		struct vfio_dma *dma, *dma_last = NULL;
+		int retries = 0;
+again:
+		mutex_lock(&pfnmap->iommu->lock);
+		dma = pfnmap_find_dma(pfnmap);
+		if (dma) {
+			if (unmap_dma_pfn_list(pfnmap->iommu, dma,
+					       &dma_last, &retries))
+				goto again;
+
+			dma->pfnmap = NULL;
+			vfio_remove_dma(pfnmap->iommu, dma);
+		}
+		mutex_unlock(&pfnmap->iommu->lock);
+
+		/* Cannot unregister notifier from callback chain */
+		INIT_WORK(&pfnmap->work, unregister_device_bg);
+		schedule_work(&pfnmap->work);
+		break;
+	}
+	}
+
+	return NOTIFY_OK;
+}
+
 static int vaddr_get_pfn(struct vfio_iommu *iommu, struct vfio_dma *dma,
 			 struct mm_struct *mm, unsigned long vaddr,
 			 unsigned long *pfn)
@@ -549,8 +625,48 @@  static int vaddr_get_pfn(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		if (ret == -EAGAIN)
 			goto retry;
 
-		if (!ret && !is_invalid_reserved_pfn(*pfn))
+		if (!ret && !is_invalid_reserved_pfn(*pfn)) {
 			ret = -EFAULT;
+			goto done;
+		}
+
+		if (!dma->pfnmap) {
+			struct pfnmap_obj *pfnmap;
+			struct vfio_device *device;
+
+			pfnmap = kzalloc(sizeof(*pfnmap), GFP_KERNEL);
+			if (!pfnmap) {
+				ret = -ENOMEM;
+				goto done;
+			}
+
+			pfnmap->iommu = iommu;
+			pfnmap->nb.notifier_call = vfio_device_nb_cb;
+
+			device = vfio_device_get_from_vma(vma);
+			if (IS_ERR(device)) {
+				kfree(pfnmap);
+				if (strict_mmio_maps)
+					ret = PTR_ERR(device);
+
+				goto done;
+			}
+
+			pfnmap->device = device;
+			ret = vfio_device_register_notifier(device,
+							    &pfnmap->nb);
+			if (ret) {
+				vfio_device_put(device);
+				kfree(pfnmap);
+				if (!strict_mmio_maps)
+					ret = 0;
+
+				goto done;
+			}
+
+			dma->pfnmap = pfnmap;
+		}
+
 	}
 done:
 	mmap_read_unlock(mm);
@@ -1097,6 +1213,12 @@  static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
+	if (dma->pfnmap) {
+		vfio_device_unregister_notifier(dma->pfnmap->device,
+						&dma->pfnmap->nb);
+		vfio_device_put(dma->pfnmap->device);
+		kfree(dma->pfnmap);
+	}
 	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);