
[v2] vfio/type1: Limit DMA mappings per container

Message ID 155422160029.16896.1992475589398080933.stgit@gimli.home (mailing list archive)
State New, archived

Commit Message

Alex Williamson April 2, 2019, 4:15 p.m. UTC
Memory backed DMA mappings are accounted against a user's locked
memory limit, including multiple mappings of the same memory.  This
accounting bounds the number of such mappings that a user can create.
However, DMA mappings that are not backed by memory, such as DMA
mappings of device MMIO via mmaps, do not make use of page pinning
and therefore do not count against the user's locked memory limit.
These mappings still consume memory, but the memory is not well
associated with the process for the purpose of oom killing a task.

To add bounding on this use case, we introduce a limit to the total
number of concurrent DMA mappings that a user is allowed to create.
This limit is exposed as a tunable module option where the default
value of 64K is expected to be well in excess of any reasonable use
case (a large virtual machine configuration would typically only make
use of tens of concurrent mappings).

This fixes CVE-2019-3882.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2: Remove unnecessary atomic; all runtime access occurs while
    holding vfio_iommu.lock.  Change to unsigned int since we're
    no longer bound by the atomic_t.

 drivers/vfio/vfio_iommu_type1.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)
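
For reference, a minimal userspace sketch of how the new limit surfaces
(illustration only, not part of the patch; container_fd is assumed to be
an open VFIO container fd with an IOMMU backend already set via
VFIO_SET_IOMMU, and map_one_page is a hypothetical helper):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Map a single 4K page at the given IOVA.  With this patch applied,
     * the ioctl fails with ENOSPC once the container already holds
     * dma_entry_limit outstanding mappings. */
    static int map_one_page(int container_fd, void *vaddr,
                            unsigned long long iova)
    {
            struct vfio_iommu_type1_dma_map map = {
                    .argsz = sizeof(map),
                    .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                    .vaddr = (uintptr_t)vaddr,
                    .iova  = iova,
                    .size  = 4096,
            };

            if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map) < 0) {
                    /* iommu->dma_avail exhausted */
                    if (errno == ENOSPC)
                            fprintf(stderr, "DMA mapping limit reached\n");
                    return -errno;
            }
            return 0;
    }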

Comments

Eric Auger April 2, 2019, 7:14 p.m. UTC | #1
Hi Alex,

On 4/2/19 6:15 PM, Alex Williamson wrote:
> Memory backed DMA mappings are accounted against a user's locked
> memory limit, including multiple mappings of the same memory.  This
> accounting bounds the number of such mappings that a user can create.
> However, DMA mappings that are not backed by memory, such as DMA
> mappings of device MMIO via mmaps, do not make use of page pinning
> and therefore do not count against the user's locked memory limit.
> These mappings still consume memory, but the memory is not well
> associated with the process for the purpose of oom killing a task.
> 
> To add bounding on this use case, we introduce a limit to the total
> number of concurrent DMA mappings that a user is allowed to create.
> This limit is exposed as a tunable module option where the default
> value of 64K is expected to be well in excess of any reasonable use
> case (a large virtual machine configuration would typically only make
> use of tens of concurrent mappings).
> 
> This fixes CVE-2019-3882.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>

Thanks

Eric

> ---
> 
> v2: Remove unnecessary atomic; all runtime access occurs while
>     holding vfio_iommu.lock.  Change to unsigned int since we're
>     no longer bound by the atomic_t.
> 
>  drivers/vfio/vfio_iommu_type1.c |   14 ++++++++++++++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 73652e21efec..d0f731c9920a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -58,12 +58,18 @@ module_param_named(disable_hugepages,
>  MODULE_PARM_DESC(disable_hugepages,
>  		 "Disable VFIO IOMMU support for IOMMU hugepages.");
>  
> +static unsigned int dma_entry_limit __read_mostly = U16_MAX;
> +module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
> +MODULE_PARM_DESC(dma_entry_limit,
> +		 "Maximum number of user DMA mappings per container (65535).");
> +
>  struct vfio_iommu {
>  	struct list_head	domain_list;
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	struct blocking_notifier_head notifier;
> +	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
>  };
> @@ -836,6 +842,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
>  	kfree(dma);
> +	iommu->dma_avail++;
>  }
>  
>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> @@ -1081,12 +1088,18 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		goto out_unlock;
>  	}
>  
> +	if (!iommu->dma_avail) {
> +		ret = -ENOSPC;
> +		goto out_unlock;
> +	}
> +
>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>  	if (!dma) {
>  		ret = -ENOMEM;
>  		goto out_unlock;
>  	}
>  
> +	iommu->dma_avail--;
>  	dma->iova = iova;
>  	dma->vaddr = vaddr;
>  	dma->prot = prot;
> @@ -1583,6 +1596,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  
>  	INIT_LIST_HEAD(&iommu->domain_list);
>  	iommu->dma_list = RB_ROOT;
> +	iommu->dma_avail = dma_entry_limit;
>  	mutex_init(&iommu->lock);
>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>  
>
Peter Xu April 3, 2019, 6:05 a.m. UTC | #2
On Tue, Apr 02, 2019 at 10:15:38AM -0600, Alex Williamson wrote:
> Memory backed DMA mappings are accounted against a user's locked
> memory limit, including multiple mappings of the same memory.  This
> accounting bounds the number of such mappings that a user can create.
> However, DMA mappings that are not backed by memory, such as DMA
> mappings of device MMIO via mmaps, do not make use of page pinning
> and therefore do not count against the user's locked memory limit.
> These mappings still consume memory, but the memory is not well
> associated with the process for the purpose of oom killing a task.
> 
> To add bounding on this use case, we introduce a limit to the total
> number of concurrent DMA mappings that a user is allowed to create.
> This limit is exposed as a tunable module option where the default
> value of 64K is expected to be well in excess of any reasonable use
> case (a large virtual machine configuration would typically only make
> use of tens of concurrent mappings).
> 
> This fixes CVE-2019-3882.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Reviewed-by: Peter Xu <peterx@redhat.com>

Thanks,
Cornelia Huck April 3, 2019, 7:18 a.m. UTC | #3
On Tue, 02 Apr 2019 10:15:38 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> Memory backed DMA mappings are accounted against a user's locked
> memory limit, including multiple mappings of the same memory.  This
> accounting bounds the number of such mappings that a user can create.
> However, DMA mappings that are not backed by memory, such as DMA
> mappings of device MMIO via mmaps, do not make use of page pinning
> and therefore do not count against the user's locked memory limit.
> These mappings still consume memory, but the memory is not well
> associated with the process for the purpose of oom killing a task.
> 
> To add bounding on this use case, we introduce a limit to the total
> number of concurrent DMA mappings that a user is allowed to create.
> This limit is exposed as a tunable module option where the default
> value of 64K is expected to be well in excess of any reasonable use
> case (a large virtual machine configuration would typically only make
> use of tens of concurrent mappings).
> 
> This fixes CVE-2019-3882.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> v2: Remove unnecessary atomic; all runtime access occurs while
>     holding vfio_iommu.lock.  Change to unsigned int since we're
>     no longer bound by the atomic_t.
> 
>  drivers/vfio/vfio_iommu_type1.c |   14 ++++++++++++++
>  1 file changed, 14 insertions(+)

Non-atomic seems fine.

Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Jerome Glisse April 3, 2019, 7:24 p.m. UTC | #4
On Tue, Apr 02, 2019 at 10:15:38AM -0600, Alex Williamson wrote:
> Memory backed DMA mappings are accounted against a user's locked
> memory limit, including multiple mappings of the same memory.  This
> accounting bounds the number of such mappings that a user can create.
> However, DMA mappings that are not backed by memory, such as DMA
> mappings of device MMIO via mmaps, do not make use of page pinning
> and therefore do not count against the user's locked memory limit.
> These mappings still consume memory, but the memory is not well
> associated with the process for the purpose of oom killing a task.
> 
> To add bounding on this use case, we introduce a limit to the total
> number of concurrent DMA mappings that a user is allowed to create.
> This limit is exposed as a tunable module option where the default
> value of 64K is expected to be well in excess of any reasonable use
> case (a large virtual machine configuration would typically only make
> use of tens of concurrent mappings).
> 
> This fixes CVE-2019-3882.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Have you tested with GPU passthrough?  GPUs have huge BARs, from
hundreds of megabytes to gigabytes (some drivers resize them to
cover the whole GPU memory), and drivers need to map those to work
properly.  I am not sure what path is taken by an mmap of an MMIO
BAR by a guest on the host, but I just thought I would point that
out.

Cheers,
Jérôme
Alex Williamson April 3, 2019, 8 p.m. UTC | #5
On Wed, 3 Apr 2019 15:24:26 -0400
Jerome Glisse <jglisse@redhat.com> wrote:

> On Tue, Apr 02, 2019 at 10:15:38AM -0600, Alex Williamson wrote:
> > Memory backed DMA mappings are accounted against a user's locked
> > memory limit, including multiple mappings of the same memory.  This
> > accounting bounds the number of such mappings that a user can create.
> > However, DMA mappings that are not backed by memory, such as DMA
> > mappings of device MMIO via mmaps, do not make use of page pinning
> > and therefore do not count against the user's locked memory limit.
> > These mappings still consume memory, but the memory is not well
> > associated with the process for the purpose of oom killing a task.
> > 
> > To add bounding on this use case, we introduce a limit to the total
> > number of concurrent DMA mappings that a user is allowed to create.
> > This limit is exposed as a tunable module option where the default
> > value of 64K is expected to be well in excess of any reasonable use
> > case (a large virtual machine configuration would typically only make
> > use of tens of concurrent mappings).
> > 
> > This fixes CVE-2019-3882.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>  
> 
> Have you tested with GPU passthrough?  GPUs have huge BARs, from
> hundreds of megabytes to gigabytes (some drivers resize them to
> cover the whole GPU memory), and drivers need to map those to work
> properly.  I am not sure what path is taken by an mmap of an MMIO
> BAR by a guest on the host, but I just thought I would point that
> out.

The limit introduced is the number of mappings that a user can have
outstanding, not the size of the mappings.  We don't try to estimate
the overhead of a mapping based on the mapping size since IOMMU
super-page support can make a 1GB mapping comparable in overhead to a
4KB mapping.  QEMU will generally try to map a BAR with a single
mapping, unless it's split by something like an MSI-X vector table or
quirks, which still results in a low single-digit number of mappings
per BAR.  This does not affect how the guest drivers use the device;
BARs cannot be partially enabled from a DMA address space perspective.

If a userspace driver were trying to map a large GPU BAR with separate
4K mappings, they could indeed hit the limit, but it's far from the
common or expected use case and the module tunable could be used to
provide this functionality if it were really necessary.

There's really no support for resizable BARs through vfio-pci right
now; we get the device in its base configuration, QEMU maps that and
exposes a rather fixed device to the VM.  If this is something we need
to address for GPU assignment, let's talk.  Thanks,

Alex
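
To make the single-mapping-per-BAR point concrete, here is a sketch of
the usual userspace flow (a hypothetical helper, error paths trimmed;
device_fd is assumed to be an open vfio-pci device fd whose BAR region
supports mmap, container_fd as in the sketch above):

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    /* mmap an entire BAR region and install it as a single DMA mapping:
     * one entry against dma_entry_limit regardless of the BAR's size. */
    static int map_whole_bar(int container_fd, int device_fd,
                             unsigned int index, unsigned long long iova)
    {
            struct vfio_region_info info = {
                    .argsz = sizeof(info),
                    .index = index,
            };
            struct vfio_iommu_type1_dma_map map = {
                    .argsz = sizeof(map),
                    .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                    .iova  = iova,
            };
            void *vaddr;

            if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) < 0)
                    return -errno;

            /* One mmap of the whole BAR... */
            vaddr = mmap(NULL, info.size, PROT_READ | PROT_WRITE,
                         MAP_SHARED, device_fd, info.offset);
            if (vaddr == MAP_FAILED)
                    return -errno;

            /* ...and one mapping entry for it in the container. */
            map.vaddr = (uintptr_t)vaddr;
            map.size  = info.size;
            return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map) ? -errno : 0;
    }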

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..d0f731c9920a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -58,12 +58,18 @@ module_param_named(disable_hugepages,
 MODULE_PARM_DESC(disable_hugepages,
 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
 
+static unsigned int dma_entry_limit __read_mostly = U16_MAX;
+module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
+MODULE_PARM_DESC(dma_entry_limit,
+		 "Maximum number of user DMA mappings per container (65535).");
+
 struct vfio_iommu {
 	struct list_head	domain_list;
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	struct blocking_notifier_head notifier;
+	unsigned int		dma_avail;
 	bool			v2;
 	bool			nesting;
 };
@@ -836,6 +842,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
 	kfree(dma);
+	iommu->dma_avail++;
 }
 
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
@@ -1081,12 +1088,18 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		goto out_unlock;
 	}
 
+	if (!iommu->dma_avail) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
 		ret = -ENOMEM;
 		goto out_unlock;
 	}
 
+	iommu->dma_avail--;
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
@@ -1583,6 +1596,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 
 	INIT_LIST_HEAD(&iommu->domain_list);
 	iommu->dma_list = RB_ROOT;
+	iommu->dma_avail = dma_entry_limit;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
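
A note on the tunable: since the parameter is registered with 0644
permissions, the current limit can be read (and, by root, adjusted) at
runtime through sysfs.  A small sketch of reading it from C, assuming
the type1 backend is loaded as the vfio_iommu_type1 module:

    #include <stdio.h>

    /* Read the current per-container DMA mapping limit from sysfs. */
    int main(void)
    {
            const char *path =
                    "/sys/module/vfio_iommu_type1/parameters/dma_entry_limit";
            unsigned int limit;
            FILE *f = fopen(path, "r");

            if (!f)
                    return 1;
            if (fscanf(f, "%u", &limit) != 1) {
                    fclose(f);
                    return 1;
            }
            fclose(f);
            printf("dma_entry_limit: %u\n", limit);
            return 0;
    }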