
[RFC,v3,2/8] vfio/type1: Add a page fault handler

Message ID 20210409034420.1799-3-lushenming@huawei.com (mailing list archive)
State New, archived
Series Add IOPF support for VFIO passthrough

Commit Message

Shenming Lu April 9, 2021, 3:44 a.m. UTC
VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
serve the reported page faults from the IOMMU driver.

Signed-off-by: Shenming Lu <lushenming@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

Comments

Alex Williamson May 18, 2021, 6:58 p.m. UTC | #1
On Fri, 9 Apr 2021 11:44:14 +0800
Shenming Lu <lushenming@huawei.com> wrote:

> VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
> for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
> serve the reported page faults from the IOMMU driver.
> 
> Signed-off-by: Shenming Lu <lushenming@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++
>  1 file changed, 114 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 45cbfd4879a5..ab0ff60ee207 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -101,6 +101,7 @@ struct vfio_dma {
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>  	unsigned long		*bitmap;
> +	unsigned long		*iopf_mapped_bitmap;
>  };
>  
>  struct vfio_batch {
> @@ -141,6 +142,16 @@ struct vfio_regions {
>  	size_t len;
>  };
>  
> +/* A global IOPF enabled group list */
> +static struct rb_root iopf_group_list = RB_ROOT;
> +static DEFINE_MUTEX(iopf_group_list_lock);
> +
> +struct vfio_iopf_group {
> +	struct rb_node		node;
> +	struct iommu_group	*iommu_group;
> +	struct vfio_iommu	*iommu;
> +};
> +
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> @@ -157,6 +168,10 @@ struct vfio_regions {
>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>  
> +#define IOPF_MAPPED_BITMAP_GET(dma, i)	\
> +			      ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
> +			       >> ((i) % BITS_PER_LONG)) & 0x1)


Can't we just use test_bit()?


> +
>  #define WAITED 1
>  
>  static int put_pfn(unsigned long pfn, int prot);
> @@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>  	return ret;
>  }
>  
> +/*
> + * Helper functions for iopf_group_list
> + */
> +static struct vfio_iopf_group *
> +vfio_find_iopf_group(struct iommu_group *iommu_group)
> +{
> +	struct vfio_iopf_group *iopf_group;
> +	struct rb_node *node;
> +
> +	mutex_lock(&iopf_group_list_lock);
> +
> +	node = iopf_group_list.rb_node;
> +
> +	while (node) {
> +		iopf_group = rb_entry(node, struct vfio_iopf_group, node);
> +
> +		if (iommu_group < iopf_group->iommu_group)
> +			node = node->rb_left;
> +		else if (iommu_group > iopf_group->iommu_group)
> +			node = node->rb_right;
> +		else
> +			break;
> +	}
> +
> +	mutex_unlock(&iopf_group_list_lock);
> +	return node ? iopf_group : NULL;
> +}

This looks like a pretty heavy weight operation per DMA fault.

I'm also suspicious of the validity of this iopf_group after we've
dropped the locking; the ordering of patches makes this very confusing.

> +
>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>  {
>  	struct mm_struct *mm;
> @@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>  	return -EINVAL;
>  }
>  
> +/* VFIO I/O Page Fault handler */
> +static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)

From the comment, this seems like the IOMMU fault handler (the
construction of this series makes this difficult to follow) and
eventually it handles more than DMA mapping, for example transferring
faults to the device driver.  "dma_map_iopf" seems like a poorly scoped
name.
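
For reference, a handler with this signature is the kind of callback taken by
iommu_register_device_fault_handler(); the registration presumably happens
elsewhere in this series, roughly along these lines (sketch only):

	/* sketch: register/unregister the per-device IOMMU fault handler */
	ret = iommu_register_device_fault_handler(dev,
				vfio_iommu_type1_dma_map_iopf, dev);
	...
	iommu_unregister_device_fault_handler(dev);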

> +{
> +	struct device *dev = (struct device *)data;
> +	struct iommu_group *iommu_group;
> +	struct vfio_iopf_group *iopf_group;
> +	struct vfio_iommu *iommu;
> +	struct vfio_dma *dma;
> +	dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
> +	int access_flags = 0;
> +	unsigned long bit_offset, vaddr, pfn;
> +	int ret;
> +	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
> +	struct iommu_page_response resp = {0};
> +
> +	if (fault->type != IOMMU_FAULT_PAGE_REQ)
> +		return -EOPNOTSUPP;
> +
> +	iommu_group = iommu_group_get(dev);
> +	if (!iommu_group)
> +		return -ENODEV;
> +
> +	iopf_group = vfio_find_iopf_group(iommu_group);
> +	iommu_group_put(iommu_group);
> +	if (!iopf_group)
> +		return -ENODEV;
> +
> +	iommu = iopf_group->iommu;
> +
> +	mutex_lock(&iommu->lock);

Again, I'm dubious of our ability to grab this lock from an object with
an unknown lifecycle and races we might have with that group being
detached or DMA unmapped.  Also, how effective is enabling IOMMU page
faulting if we're serializing all faults within a container context?

> +
> +	ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
> +	if (ret < 0)
> +		goto out_invalid;
> +
> +	if (fault->prm.perm & IOMMU_FAULT_PERM_READ)
> +		access_flags |= IOMMU_READ;
> +	if (fault->prm.perm & IOMMU_FAULT_PERM_WRITE)
> +		access_flags |= IOMMU_WRITE;
> +	if ((dma->prot & access_flags) != access_flags)
> +		goto out_invalid;
> +
> +	bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
> +	if (IOPF_MAPPED_BITMAP_GET(dma, bit_offset))
> +		goto out_success;

If the page is mapped, why did we get a fault?  Should we be returning
success for a fault we shouldn't have received and did nothing to
resolve?  We're also referencing a bitmap that has only been declared
and never allocated at this point in the patch series.

> +
> +	vaddr = iova - dma->iova + dma->vaddr;
> +
> +	if (vfio_pin_page_external(dma, vaddr, &pfn, true))
> +		goto out_invalid;
> +
> +	if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
> +		if (put_pfn(pfn, dma->prot))
> +			vfio_lock_acct(dma, -1, true);
> +		goto out_invalid;
> +	}
> +
> +	bitmap_set(dma->iopf_mapped_bitmap, bit_offset, 1);
> +
> +out_success:
> +	status = IOMMU_PAGE_RESP_SUCCESS;
> +
> +out_invalid:
> +	mutex_unlock(&iommu->lock);
> +	resp.version		= IOMMU_PAGE_RESP_VERSION_1;
> +	resp.grpid		= fault->prm.grpid;
> +	resp.code		= status;
> +	iommu_page_response(dev, &resp);
> +	return 0;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
Shenming Lu May 21, 2021, 6:38 a.m. UTC | #2
On 2021/5/19 2:58, Alex Williamson wrote:
> On Fri, 9 Apr 2021 11:44:14 +0800
> Shenming Lu <lushenming@huawei.com> wrote:
> 
>> VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
>> for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
>> serve the reported page faults from the IOMMU driver.
>>
>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++
>>  1 file changed, 114 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 45cbfd4879a5..ab0ff60ee207 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -101,6 +101,7 @@ struct vfio_dma {
>>  	struct task_struct	*task;
>>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>  	unsigned long		*bitmap;
>> +	unsigned long		*iopf_mapped_bitmap;
>>  };
>>  
>>  struct vfio_batch {
>> @@ -141,6 +142,16 @@ struct vfio_regions {
>>  	size_t len;
>>  };
>>  
>> +/* A global IOPF enabled group list */
>> +static struct rb_root iopf_group_list = RB_ROOT;
>> +static DEFINE_MUTEX(iopf_group_list_lock);
>> +
>> +struct vfio_iopf_group {
>> +	struct rb_node		node;
>> +	struct iommu_group	*iommu_group;
>> +	struct vfio_iommu	*iommu;
>> +};
>> +
>>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>  					(!list_empty(&iommu->domain_list))
>>  
>> @@ -157,6 +168,10 @@ struct vfio_regions {
>>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>  
>> +#define IOPF_MAPPED_BITMAP_GET(dma, i)	\
>> +			      ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
>> +			       >> ((i) % BITS_PER_LONG)) & 0x1)
> 
> 
> Can't we just use test_bit()?

Yeah, we can use it.
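
For example, the check in the fault handler could then become (just a sketch
against the code in this patch; the bitmap layout is unchanged):

	/* test_bit() from <linux/bitops.h> replaces IOPF_MAPPED_BITMAP_GET() */
	bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
	if (test_bit(bit_offset, dma->iopf_mapped_bitmap))
		goto out_success;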

> 
> 
>> +
>>  #define WAITED 1
>>  
>>  static int put_pfn(unsigned long pfn, int prot);
>> @@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>>  	return ret;
>>  }
>>  
>> +/*
>> + * Helper functions for iopf_group_list
>> + */
>> +static struct vfio_iopf_group *
>> +vfio_find_iopf_group(struct iommu_group *iommu_group)
>> +{
>> +	struct vfio_iopf_group *iopf_group;
>> +	struct rb_node *node;
>> +
>> +	mutex_lock(&iopf_group_list_lock);
>> +
>> +	node = iopf_group_list.rb_node;
>> +
>> +	while (node) {
>> +		iopf_group = rb_entry(node, struct vfio_iopf_group, node);
>> +
>> +		if (iommu_group < iopf_group->iommu_group)
>> +			node = node->rb_left;
>> +		else if (iommu_group > iopf_group->iommu_group)
>> +			node = node->rb_right;
>> +		else
>> +			break;
>> +	}
>> +
>> +	mutex_unlock(&iopf_group_list_lock);
>> +	return node ? iopf_group : NULL;
>> +}
> 
> This looks like a pretty heavy weight operation per DMA fault.
> 
> I'm also suspicious of the validity of this iopf_group after we've
> dropped the locking; the ordering of patches makes this very confusing.

My thought was to handle DMA faults entirely within the type1 backend by
introducing the vfio_iopf_group struct. But it seems that introducing a
struct with an unknown lifecycle causes more problems...
I will switch back to going through vfio-core as in v2, for simplicity and
validity.

Sorry for the confusion; I will restructure the series later. :-)

> 
>> +
>>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>>  {
>>  	struct mm_struct *mm;
>> @@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>  	return -EINVAL;
>>  }
>>  
>> +/* VFIO I/O Page Fault handler */
>> +static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
> 
> From the comment, this seems like the IOMMU fault handler (the
> construction of this series makes this difficult to follow) and
> eventually it handles more than DMA mapping, for example transferring
> faults to the device driver.  "dma_map_iopf" seems like a poorly scoped
> name.

Maybe just call it dev_fault_handler?

> 
>> +{
>> +	struct device *dev = (struct device *)data;
>> +	struct iommu_group *iommu_group;
>> +	struct vfio_iopf_group *iopf_group;
>> +	struct vfio_iommu *iommu;
>> +	struct vfio_dma *dma;
>> +	dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
>> +	int access_flags = 0;
>> +	unsigned long bit_offset, vaddr, pfn;
>> +	int ret;
>> +	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
>> +	struct iommu_page_response resp = {0};
>> +
>> +	if (fault->type != IOMMU_FAULT_PAGE_REQ)
>> +		return -EOPNOTSUPP;
>> +
>> +	iommu_group = iommu_group_get(dev);
>> +	if (!iommu_group)
>> +		return -ENODEV;
>> +
>> +	iopf_group = vfio_find_iopf_group(iommu_group);
>> +	iommu_group_put(iommu_group);
>> +	if (!iopf_group)
>> +		return -ENODEV;
>> +
>> +	iommu = iopf_group->iommu;
>> +
>> +	mutex_lock(&iommu->lock);
> 
> Again, I'm dubious of our ability to grab this lock from an object with
> an unknown lifecycle and races we might have with that group being
> detached or DMA unmapped.  Also, how effective is enabling IOMMU page
> faulting if we're serializing all faults within a container context?

Did you mean "efficient"?
I also worry about this, as the mapping and unmapping of the faulting pages
are all done under the same lock...
Is there a way to parallelize them? Or could we have more fine-grained lock
control?

> 
>> +
>> +	ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
>> +	if (ret < 0)
>> +		goto out_invalid;
>> +
>> +	if (fault->prm.perm & IOMMU_FAULT_PERM_READ)
>> +		access_flags |= IOMMU_READ;
>> +	if (fault->prm.perm & IOMMU_FAULT_PERM_WRITE)
>> +		access_flags |= IOMMU_WRITE;
>> +	if ((dma->prot & access_flags) != access_flags)
>> +		goto out_invalid;
>> +
>> +	bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
>> +	if (IOPF_MAPPED_BITMAP_GET(dma, bit_offset))
>> +		goto out_success;
> 
> If the page is mapped, why did we get a fault?  Should we be returning
> success for a fault we shouldn't have received and did nothing to
> resolve?  We're also referencing a bitmap that has only been declared
> and never allocated at this point in the patch series.

Imagine that we have two in-flight page faults which target the same iova:
shouldn't the later one just return SUCCESS, since the previous one has
already been handled and the mapping has been established?

I will allocate the bitmap earlier in the series.
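
For illustration, a minimal sketch of the allocation, assuming one bit per
PAGE_SIZE page of the mapping and reusing DIRTY_BITMAP_BYTES() (where exactly
it lands is up to the reordered series):

	/* sketch: allocate when the vfio_dma is created, e.g. in vfio_dma_do_map() */
	dma->iopf_mapped_bitmap = kvzalloc(DIRTY_BITMAP_BYTES(dma->size >> PAGE_SHIFT),
					   GFP_KERNEL);
	if (!dma->iopf_mapped_bitmap)
		return -ENOMEM;	/* hypothetical error path */
	/* ... and kvfree() it when the vfio_dma is removed */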

Thanks,
Shenming

> 
>> +
>> +	vaddr = iova - dma->iova + dma->vaddr;
>> +
>> +	if (vfio_pin_page_external(dma, vaddr, &pfn, true))
>> +		goto out_invalid;
>> +
>> +	if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
>> +		if (put_pfn(pfn, dma->prot))
>> +			vfio_lock_acct(dma, -1, true);
>> +		goto out_invalid;
>> +	}
>> +
>> +	bitmap_set(dma->iopf_mapped_bitmap, bit_offset, 1);
>> +
>> +out_success:
>> +	status = IOMMU_PAGE_RESP_SUCCESS;
>> +
>> +out_invalid:
>> +	mutex_unlock(&iommu->lock);
>> +	resp.version		= IOMMU_PAGE_RESP_VERSION_1;
>> +	resp.grpid		= fault->prm.grpid;
>> +	resp.code		= status;
>> +	iommu_page_response(dev, &resp);
>> +	return 0;
>> +}
>> +
>>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  				   unsigned int cmd, unsigned long arg)
>>  {
> 
> .
>
Alex Williamson May 24, 2021, 10:11 p.m. UTC | #3
On Fri, 21 May 2021 14:38:52 +0800
Shenming Lu <lushenming@huawei.com> wrote:

> On 2021/5/19 2:58, Alex Williamson wrote:
> > On Fri, 9 Apr 2021 11:44:14 +0800
> > Shenming Lu <lushenming@huawei.com> wrote:
> >   
> >> VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
> >> for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
> >> serve the reported page faults from the IOMMU driver.
> >>
> >> Signed-off-by: Shenming Lu <lushenming@huawei.com>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++
> >>  1 file changed, 114 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 45cbfd4879a5..ab0ff60ee207 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -101,6 +101,7 @@ struct vfio_dma {
> >>  	struct task_struct	*task;
> >>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >>  	unsigned long		*bitmap;
> >> +	unsigned long		*iopf_mapped_bitmap;
> >>  };
> >>  
> >>  struct vfio_batch {
> >> @@ -141,6 +142,16 @@ struct vfio_regions {
> >>  	size_t len;
> >>  };
> >>  
> >> +/* A global IOPF enabled group list */
> >> +static struct rb_root iopf_group_list = RB_ROOT;
> >> +static DEFINE_MUTEX(iopf_group_list_lock);
> >> +
> >> +struct vfio_iopf_group {
> >> +	struct rb_node		node;
> >> +	struct iommu_group	*iommu_group;
> >> +	struct vfio_iommu	*iommu;
> >> +};
> >> +
> >>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>  					(!list_empty(&iommu->domain_list))
> >>  
> >> @@ -157,6 +168,10 @@ struct vfio_regions {
> >>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
> >>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>  
> >> +#define IOPF_MAPPED_BITMAP_GET(dma, i)	\
> >> +			      ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
> >> +			       >> ((i) % BITS_PER_LONG)) & 0x1)  
> > 
> > 
> > Can't we just use test_bit()?  
> 
> Yeah, we can use it.
> 
> > 
> >   
> >> +
> >>  #define WAITED 1
> >>  
> >>  static int put_pfn(unsigned long pfn, int prot);
> >> @@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> >>  	return ret;
> >>  }
> >>  
> >> +/*
> >> + * Helper functions for iopf_group_list
> >> + */
> >> +static struct vfio_iopf_group *
> >> +vfio_find_iopf_group(struct iommu_group *iommu_group)
> >> +{
> >> +	struct vfio_iopf_group *iopf_group;
> >> +	struct rb_node *node;
> >> +
> >> +	mutex_lock(&iopf_group_list_lock);
> >> +
> >> +	node = iopf_group_list.rb_node;
> >> +
> >> +	while (node) {
> >> +		iopf_group = rb_entry(node, struct vfio_iopf_group, node);
> >> +
> >> +		if (iommu_group < iopf_group->iommu_group)
> >> +			node = node->rb_left;
> >> +		else if (iommu_group > iopf_group->iommu_group)
> >> +			node = node->rb_right;
> >> +		else
> >> +			break;
> >> +	}
> >> +
> >> +	mutex_unlock(&iopf_group_list_lock);
> >> +	return node ? iopf_group : NULL;
> >> +}  
> > 
> > This looks like a pretty heavy weight operation per DMA fault.
> > 
> > I'm also suspicious of the validity of this iopf_group after we've
> > dropped the locking; the ordering of patches makes this very confusing.  
> 
> My thought was to handle DMA faults entirely within the type1 backend by
> introducing the vfio_iopf_group struct. But it seems that introducing a
> struct with an unknown lifecycle causes more problems...
> I will switch back to going through vfio-core as in v2, for simplicity and
> validity.
> 
> Sorry for the confusion; I will restructure the series later. :-)
> 
> >   
> >> +
> >>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> >>  {
> >>  	struct mm_struct *mm;
> >> @@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
> >>  	return -EINVAL;
> >>  }
> >>  
> >> +/* VFIO I/O Page Fault handler */
> >> +static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)  
> >   
> > From the comment, this seems like the IOMMU fault handler (the  
> > construction of this series makes this difficult to follow) and
> > eventually it handles more than DMA mapping, for example transferring
> > faults to the device driver.  "dma_map_iopf" seems like a poorly scoped
> > name.  
> 
> Maybe just call it dev_fault_handler?

Better.

> >> +{
> >> +	struct device *dev = (struct device *)data;
> >> +	struct iommu_group *iommu_group;
> >> +	struct vfio_iopf_group *iopf_group;
> >> +	struct vfio_iommu *iommu;
> >> +	struct vfio_dma *dma;
> >> +	dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
> >> +	int access_flags = 0;
> >> +	unsigned long bit_offset, vaddr, pfn;
> >> +	int ret;
> >> +	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
> >> +	struct iommu_page_response resp = {0};
> >> +
> >> +	if (fault->type != IOMMU_FAULT_PAGE_REQ)
> >> +		return -EOPNOTSUPP;
> >> +
> >> +	iommu_group = iommu_group_get(dev);
> >> +	if (!iommu_group)
> >> +		return -ENODEV;
> >> +
> >> +	iopf_group = vfio_find_iopf_group(iommu_group);
> >> +	iommu_group_put(iommu_group);
> >> +	if (!iopf_group)
> >> +		return -ENODEV;
> >> +
> >> +	iommu = iopf_group->iommu;
> >> +
> >> +	mutex_lock(&iommu->lock);  
> > 
> > Again, I'm dubious of our ability to grab this lock from an object with
> > an unknown lifecycle and races we might have with that group being
> > detached or DMA unmapped.  Also, how effective is enabling IOMMU page
> > faulting if we're serializing all faults within a container context?  
> 
> Did you mean "efficient"?

Yes, that's more appropriate.

> I also worry about this, as the mapping and unmapping of the faulting pages
> are all done under the same lock...
> Is there a way to parallelize them? Or could we have more fine-grained lock
> control?

It seems we need it; the current locking is designed for static
mappings by the user; therefore, concurrency hasn't been a priority.
This again touches how far we want to extend type1 in the direction
we intend to go with SVA/PASID support in IOASID.  Thanks,

Alex
Shenming Lu May 27, 2021, 11:16 a.m. UTC | #4
On 2021/5/25 6:11, Alex Williamson wrote:
> On Fri, 21 May 2021 14:38:52 +0800
> Shenming Lu <lushenming@huawei.com> wrote:
> 
>> On 2021/5/19 2:58, Alex Williamson wrote:
>>> On Fri, 9 Apr 2021 11:44:14 +0800
>>> Shenming Lu <lushenming@huawei.com> wrote:
>>>   
>>>> VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
>>>> for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
>>>> serve the reported page faults from the IOMMU driver.
>>>>
>>>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++
>>>>  1 file changed, 114 insertions(+)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 45cbfd4879a5..ab0ff60ee207 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -101,6 +101,7 @@ struct vfio_dma {
>>>>  	struct task_struct	*task;
>>>>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>>>  	unsigned long		*bitmap;
>>>> +	unsigned long		*iopf_mapped_bitmap;
>>>>  };
>>>>  
>>>>  struct vfio_batch {
>>>> @@ -141,6 +142,16 @@ struct vfio_regions {
>>>>  	size_t len;
>>>>  };
>>>>  
>>>> +/* A global IOPF enabled group list */
>>>> +static struct rb_root iopf_group_list = RB_ROOT;
>>>> +static DEFINE_MUTEX(iopf_group_list_lock);
>>>> +
>>>> +struct vfio_iopf_group {
>>>> +	struct rb_node		node;
>>>> +	struct iommu_group	*iommu_group;
>>>> +	struct vfio_iommu	*iommu;
>>>> +};
>>>> +
>>>>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>>>  					(!list_empty(&iommu->domain_list))
>>>>  
>>>> @@ -157,6 +168,10 @@ struct vfio_regions {
>>>>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>>>>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>>>  
>>>> +#define IOPF_MAPPED_BITMAP_GET(dma, i)	\
>>>> +			      ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
>>>> +			       >> ((i) % BITS_PER_LONG)) & 0x1)  
>>>
>>>
>>> Can't we just use test_bit()?  
>>
>> Yeah, we can use it.
>>
>>>
>>>   
>>>> +
>>>>  #define WAITED 1
>>>>  
>>>>  static int put_pfn(unsigned long pfn, int prot);
>>>> @@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>>>>  	return ret;
>>>>  }
>>>>  
>>>> +/*
>>>> + * Helper functions for iopf_group_list
>>>> + */
>>>> +static struct vfio_iopf_group *
>>>> +vfio_find_iopf_group(struct iommu_group *iommu_group)
>>>> +{
>>>> +	struct vfio_iopf_group *iopf_group;
>>>> +	struct rb_node *node;
>>>> +
>>>> +	mutex_lock(&iopf_group_list_lock);
>>>> +
>>>> +	node = iopf_group_list.rb_node;
>>>> +
>>>> +	while (node) {
>>>> +		iopf_group = rb_entry(node, struct vfio_iopf_group, node);
>>>> +
>>>> +		if (iommu_group < iopf_group->iommu_group)
>>>> +			node = node->rb_left;
>>>> +		else if (iommu_group > iopf_group->iommu_group)
>>>> +			node = node->rb_right;
>>>> +		else
>>>> +			break;
>>>> +	}
>>>> +
>>>> +	mutex_unlock(&iopf_group_list_lock);
>>>> +	return node ? iopf_group : NULL;
>>>> +}  
>>>
>>> This looks like a pretty heavy weight operation per DMA fault.
>>>
>>> I'm also suspicious of the validity of this iopf_group after we've
>>> dropped the locking; the ordering of patches makes this very confusing.  
>>
>> My thought was to handle DMA faults entirely within the type1 backend by
>> introducing the vfio_iopf_group struct. But it seems that introducing a
>> struct with an unknown lifecycle causes more problems...
>> I will switch back to going through vfio-core as in v2, for simplicity and
>> validity.
>>
>> Sorry for the confusion; I will restructure the series later. :-)
>>
>>>   
>>>> +
>>>>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>>>>  {
>>>>  	struct mm_struct *mm;
>>>> @@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>>>  	return -EINVAL;
>>>>  }
>>>>  
>>>> +/* VFIO I/O Page Fault handler */
>>>> +static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)  
>>>   
>>> From the comment, this seems like the IOMMU fault handler (the  
>>> construction of this series makes this difficult to follow) and
>>> eventually it handles more than DMA mapping, for example transferring
>>> faults to the device driver.  "dma_map_iopf" seems like a poorly scoped
>>> name.  
>>
>> Maybe just call it dev_fault_handler?
> 
> Better.
> 
>>>> +{
>>>> +	struct device *dev = (struct device *)data;
>>>> +	struct iommu_group *iommu_group;
>>>> +	struct vfio_iopf_group *iopf_group;
>>>> +	struct vfio_iommu *iommu;
>>>> +	struct vfio_dma *dma;
>>>> +	dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
>>>> +	int access_flags = 0;
>>>> +	unsigned long bit_offset, vaddr, pfn;
>>>> +	int ret;
>>>> +	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
>>>> +	struct iommu_page_response resp = {0};
>>>> +
>>>> +	if (fault->type != IOMMU_FAULT_PAGE_REQ)
>>>> +		return -EOPNOTSUPP;
>>>> +
>>>> +	iommu_group = iommu_group_get(dev);
>>>> +	if (!iommu_group)
>>>> +		return -ENODEV;
>>>> +
>>>> +	iopf_group = vfio_find_iopf_group(iommu_group);
>>>> +	iommu_group_put(iommu_group);
>>>> +	if (!iopf_group)
>>>> +		return -ENODEV;
>>>> +
>>>> +	iommu = iopf_group->iommu;
>>>> +
>>>> +	mutex_lock(&iommu->lock);  
>>>
>>> Again, I'm dubious of our ability to grab this lock from an object with
>>> an unknown lifecycle and races we might have with that group being
>>> detached or DMA unmapped.  Also, how effective is enabling IOMMU page
>>> faulting if we're serializing all faults within a container context?  
>>
>> Did you mean "efficient"?
> 
> Yes, that's more appropriate.
> 
>> I also worry about this, as the mapping and unmapping of the faulting pages
>> are all done under the same lock...
>> Is there a way to parallelize them? Or could we have more fine-grained lock
>> control?
> 
> It seems we need it; the current locking is designed for static
> mappings by the user; therefore, concurrency hasn't been a priority.

I will try to implement it. :-)
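
One possible direction, purely as a sketch (none of this is in the series, and
the vfio_dma lifetime would still need protecting, e.g. with a reference count
or by blocking unmap while faults are in flight):

	/*
	 * Illustrative only: a hypothetical per-mapping mutex so that faults
	 * on different vfio_dma ranges no longer serialize on iommu->lock.
	 */
	mutex_lock(&iommu->lock);
	ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
	if (ret >= 0)
		mutex_lock(&dma->iopf_lock);	/* hypothetical new field */
	mutex_unlock(&iommu->lock);
	if (ret < 0)
		goto out_invalid;	/* the error paths would need reworking too */

	/* ... pin + vfio_iommu_map() + bitmap_set() under dma->iopf_lock ... */

	mutex_unlock(&dma->iopf_lock);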

> This again touches how far we want to extend type1 in the direction
> we intend to go with SVA/PASID support in IOASID.  Thanks,

Reply in the cover.

Thanks,
Shenming

> 
> Alex
> 
> .
>

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45cbfd4879a5..ab0ff60ee207 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -101,6 +101,7 @@  struct vfio_dma {
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 	unsigned long		*bitmap;
+	unsigned long		*iopf_mapped_bitmap;
 };
 
 struct vfio_batch {
@@ -141,6 +142,16 @@  struct vfio_regions {
 	size_t len;
 };
 
+/* A global IOPF enabled group list */
+static struct rb_root iopf_group_list = RB_ROOT;
+static DEFINE_MUTEX(iopf_group_list_lock);
+
+struct vfio_iopf_group {
+	struct rb_node		node;
+	struct iommu_group	*iommu_group;
+	struct vfio_iommu	*iommu;
+};
+
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
@@ -157,6 +168,10 @@  struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define IOPF_MAPPED_BITMAP_GET(dma, i)	\
+			      ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
+			       >> ((i) % BITS_PER_LONG)) & 0x1)
+
 #define WAITED 1
 
 static int put_pfn(unsigned long pfn, int prot);
@@ -416,6 +431,34 @@  static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 	return ret;
 }
 
+/*
+ * Helper functions for iopf_group_list
+ */
+static struct vfio_iopf_group *
+vfio_find_iopf_group(struct iommu_group *iommu_group)
+{
+	struct vfio_iopf_group *iopf_group;
+	struct rb_node *node;
+
+	mutex_lock(&iopf_group_list_lock);
+
+	node = iopf_group_list.rb_node;
+
+	while (node) {
+		iopf_group = rb_entry(node, struct vfio_iopf_group, node);
+
+		if (iommu_group < iopf_group->iommu_group)
+			node = node->rb_left;
+		else if (iommu_group > iopf_group->iommu_group)
+			node = node->rb_right;
+		else
+			break;
+	}
+
+	mutex_unlock(&iopf_group_list_lock);
+	return node ? iopf_group : NULL;
+}
+
 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
 	struct mm_struct *mm;
@@ -3106,6 +3149,77 @@  static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
 	return -EINVAL;
 }
 
+/* VFIO I/O Page Fault handler */
+static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
+{
+	struct device *dev = (struct device *)data;
+	struct iommu_group *iommu_group;
+	struct vfio_iopf_group *iopf_group;
+	struct vfio_iommu *iommu;
+	struct vfio_dma *dma;
+	dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
+	int access_flags = 0;
+	unsigned long bit_offset, vaddr, pfn;
+	int ret;
+	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+	struct iommu_page_response resp = {0};
+
+	if (fault->type != IOMMU_FAULT_PAGE_REQ)
+		return -EOPNOTSUPP;
+
+	iommu_group = iommu_group_get(dev);
+	if (!iommu_group)
+		return -ENODEV;
+
+	iopf_group = vfio_find_iopf_group(iommu_group);
+	iommu_group_put(iommu_group);
+	if (!iopf_group)
+		return -ENODEV;
+
+	iommu = iopf_group->iommu;
+
+	mutex_lock(&iommu->lock);
+
+	ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
+	if (ret < 0)
+		goto out_invalid;
+
+	if (fault->prm.perm & IOMMU_FAULT_PERM_READ)
+		access_flags |= IOMMU_READ;
+	if (fault->prm.perm & IOMMU_FAULT_PERM_WRITE)
+		access_flags |= IOMMU_WRITE;
+	if ((dma->prot & access_flags) != access_flags)
+		goto out_invalid;
+
+	bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
+	if (IOPF_MAPPED_BITMAP_GET(dma, bit_offset))
+		goto out_success;
+
+	vaddr = iova - dma->iova + dma->vaddr;
+
+	if (vfio_pin_page_external(dma, vaddr, &pfn, true))
+		goto out_invalid;
+
+	if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
+		if (put_pfn(pfn, dma->prot))
+			vfio_lock_acct(dma, -1, true);
+		goto out_invalid;
+	}
+
+	bitmap_set(dma->iopf_mapped_bitmap, bit_offset, 1);
+
+out_success:
+	status = IOMMU_PAGE_RESP_SUCCESS;
+
+out_invalid:
+	mutex_unlock(&iommu->lock);
+	resp.version		= IOMMU_PAGE_RESP_VERSION_1;
+	resp.grpid		= fault->prm.grpid;
+	resp.code		= status;
+	iommu_page_response(dev, &resp);
+	return 0;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {