Message ID: 1611078509-181959-10-git-send-email-steven.sistare@oracle.com (mailing list archive)
State: New, archived
Series: vfio virtual address update
On Tue, 19 Jan 2021 09:48:29 -0800 Steve Sistare <steven.sistare@oracle.com> wrote: > Block translation of host virtual address while an iova range has an > invalid vaddr. > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > --- > drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++---- > 1 file changed, 76 insertions(+), 7 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 0167996..c97573a 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -31,6 +31,7 @@ > #include <linux/rbtree.h> > #include <linux/sched/signal.h> > #include <linux/sched/mm.h> > +#include <linux/kthread.h> > #include <linux/slab.h> > #include <linux/uaccess.h> > #include <linux/vfio.h> > @@ -75,6 +76,7 @@ struct vfio_iommu { > bool dirty_page_tracking; > bool pinned_page_dirty_scope; > bool controlled; > + wait_queue_head_t vaddr_wait; > }; > > struct vfio_domain { > @@ -145,6 +147,8 @@ struct vfio_regions { > #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) > #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) > > +#define WAITED 1 > + > static int put_pfn(unsigned long pfn, int prot); > > static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, > @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > } > > /* > + * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task > + * waits, but is re-locked on return. If the task waits, then return an updated > + * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if s/1/WAITED/ > + * waited, and -errno on error. > + */ > +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p) > +{ > + struct vfio_dma *dma = *dma_p; > + unsigned long iova = dma->iova; > + size_t size = dma->size; > + int ret = 0; > + DEFINE_WAIT(wait); > + > + while (!dma->vaddr_valid) { > + ret = WAITED; > + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); > + mutex_unlock(&iommu->lock); > + schedule(); > + mutex_lock(&iommu->lock); > + finish_wait(&iommu->vaddr_wait, &wait); > + if (kthread_should_stop() || !iommu->controlled || > + fatal_signal_pending(current)) { > + return -EFAULT; > + } > + *dma_p = dma = vfio_find_dma(iommu, iova, size); > + if (!dma) > + return -EINVAL; > + } > + return ret; > +} > + > +/* > + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped > + * if the task waits, but is re-locked on return. Return result in *dma_p. > + * Return 0 on success, 1 on success if waited, and -errno on error. > + */ > +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start, > + size_t size, struct vfio_dma **dma_p) more of a vfio_find_dma_valid() > +{ > + *dma_p = vfio_find_dma(iommu, start, size); > + if (!*dma_p) > + return -EINVAL; > + return vfio_vaddr_wait(iommu, dma_p); > +} > + > +/* > * Attempt to pin pages. We really don't want to track all the pfns and > * the iommu can only map chunks of consecutive pfns anyway, so get the > * first page and all consecutive pages with the same locking. 
> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, > struct vfio_pfn *vpfn; > > iova = user_pfn[i] << PAGE_SHIFT; > - dma = vfio_find_dma(iommu, iova, PAGE_SIZE); > - if (!dma) { > - ret = -EINVAL; > + ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma); > + if (ret < 0) > goto pin_unwind; > - } > + else if (ret == WAITED) > + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); We're iterating through an array of pfns to provide translations, once we've released the lock it's not just the current one that could be invalid. I'm afraid we need to unwind and try again, but I think it's actually worse than that because if we've marked pages within a vfio_dma's pfn_list as pinned, then during the locking gap it gets unmapped, the unmap path will call the unmap notifier chain to release the page that the vendor driver doesn't have yet. Yikes! > > if ((dma->prot & prot) != prot) { > ret = -EPERM; > @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, > unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > int ret; > > + /* > + * Wait for all vaddr to be valid so they can be used in the main loop. > + * If we do wait, the lock was dropped and re-taken, so start over to > + * ensure the dma list is consistent. > + */ > +again: > + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { > + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); > + > + ret = vfio_vaddr_wait(iommu, &dma); > + if (ret < 0) > + return ret; > + else if (ret == WAITED) > + goto again; > + } This "do nothing until all the vaddrs are valid" approach could work above too, but gosh is that a lot of cache unfriendly work for a rare invalidation. Thanks, Alex > + > /* Arbitrarily pick the first domain in the list for lookups */ > if (!list_empty(&iommu->domain_list)) > d = list_first_entry(&iommu->domain_list, > @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) > iommu->controlled = true; > mutex_init(&iommu->lock); > BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); > + init_waitqueue_head(&iommu->vaddr_wait); > > return iommu; > } > @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, > struct vfio_dma *dma; > bool kthread = current->mm == NULL; > size_t offset; > + int ret; > > *copied = 0; > > - dma = vfio_find_dma(iommu, user_iova, 1); > - if (!dma) > - return -EINVAL; > + ret = vfio_find_vaddr(iommu, user_iova, 1, &dma); > + if (ret < 0) > + return ret; > > if ((write && !(dma->prot & IOMMU_WRITE)) || > !(dma->prot & IOMMU_READ)) > @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event, > mutex_lock(&iommu->lock); > iommu->controlled = false; > mutex_unlock(&iommu->lock); > + wake_up_all(&iommu->vaddr_wait); > } > > static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
On 1/22/2021 5:59 PM, Alex Williamson wrote: > On Tue, 19 Jan 2021 09:48:29 -0800 > Steve Sistare <steven.sistare@oracle.com> wrote: > >> Block translation of host virtual address while an iova range has an >> invalid vaddr. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++---- >> 1 file changed, 76 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c >> index 0167996..c97573a 100644 >> --- a/drivers/vfio/vfio_iommu_type1.c >> +++ b/drivers/vfio/vfio_iommu_type1.c >> @@ -31,6 +31,7 @@ >> #include <linux/rbtree.h> >> #include <linux/sched/signal.h> >> #include <linux/sched/mm.h> >> +#include <linux/kthread.h> >> #include <linux/slab.h> >> #include <linux/uaccess.h> >> #include <linux/vfio.h> >> @@ -75,6 +76,7 @@ struct vfio_iommu { >> bool dirty_page_tracking; >> bool pinned_page_dirty_scope; >> bool controlled; >> + wait_queue_head_t vaddr_wait; >> }; >> >> struct vfio_domain { >> @@ -145,6 +147,8 @@ struct vfio_regions { >> #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) >> #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) >> >> +#define WAITED 1 >> + >> static int put_pfn(unsigned long pfn, int prot); >> >> static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, >> } >> >> /* >> + * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task >> + * waits, but is re-locked on return. If the task waits, then return an updated >> + * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if > > s/1/WAITED/ OK, but the WAITED state will not need to be returned in the new scheme below. >> + * waited, and -errno on error. >> + */ >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p) >> +{ >> + struct vfio_dma *dma = *dma_p; >> + unsigned long iova = dma->iova; >> + size_t size = dma->size; >> + int ret = 0; >> + DEFINE_WAIT(wait); >> + >> + while (!dma->vaddr_valid) { >> + ret = WAITED; >> + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); >> + mutex_unlock(&iommu->lock); >> + schedule(); >> + mutex_lock(&iommu->lock); >> + finish_wait(&iommu->vaddr_wait, &wait); >> + if (kthread_should_stop() || !iommu->controlled || >> + fatal_signal_pending(current)) { >> + return -EFAULT; >> + } >> + *dma_p = dma = vfio_find_dma(iommu, iova, size); >> + if (!dma) >> + return -EINVAL; >> + } >> + return ret; >> +} >> + >> +/* >> + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped >> + * if the task waits, but is re-locked on return. Return result in *dma_p. >> + * Return 0 on success, 1 on success if waited, and -errno on error. >> + */ >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start, >> + size_t size, struct vfio_dma **dma_p) > > more of a vfio_find_dma_valid() I will slightly modify and rename this with the new scheme I describe below. >> +{ >> + *dma_p = vfio_find_dma(iommu, start, size); >> + if (!*dma_p) >> + return -EINVAL; >> + return vfio_vaddr_wait(iommu, dma_p); >> +} >> + >> +/* >> * Attempt to pin pages. We really don't want to track all the pfns and >> * the iommu can only map chunks of consecutive pfns anyway, so get the >> * first page and all consecutive pages with the same locking. 
>> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, >> struct vfio_pfn *vpfn; >> >> iova = user_pfn[i] << PAGE_SHIFT; >> - dma = vfio_find_dma(iommu, iova, PAGE_SIZE); >> - if (!dma) { >> - ret = -EINVAL; >> + ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma); >> + if (ret < 0) >> goto pin_unwind; >> - } >> + else if (ret == WAITED) >> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); > > We're iterating through an array of pfns to provide translations, once > we've released the lock it's not just the current one that could be > invalid. I'm afraid we need to unwind and try again, but I think it's > actually worse than that because if we've marked pages within a > vfio_dma's pfn_list as pinned, then during the locking gap it gets > unmapped, the unmap path will call the unmap notifier chain to release > the page that the vendor driver doesn't have yet. Yikes! Yikes indeed. The fix is easy, though. I will maintain a count in vfio_iommu of vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, provide a function to wait for the count to become 0, and call that function at the start of vfio_iommu_type1_pin_pages and vfio_iommu_replay. I will use iommu->vaddr_wait for wait and wake. >> if ((dma->prot & prot) != prot) { >> ret = -EPERM; >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, >> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; >> int ret; >> >> + /* >> + * Wait for all vaddr to be valid so they can be used in the main loop. >> + * If we do wait, the lock was dropped and re-taken, so start over to >> + * ensure the dma list is consistent. >> + */ >> +again: >> + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { >> + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); >> + >> + ret = vfio_vaddr_wait(iommu, &dma); >> + if (ret < 0) >> + return ret; >> + else if (ret == WAITED) >> + goto again; >> + } > > This "do nothing until all the vaddrs are valid" approach could work > above too, but gosh is that a lot of cache unfriendly work for a rare > invalidation. Thanks, The new wait function described above is fast in the common case, just a check that the invalid vaddr count is 0. - Steve >> + >> /* Arbitrarily pick the first domain in the list for lookups */ >> if (!list_empty(&iommu->domain_list)) >> d = list_first_entry(&iommu->domain_list, >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) >> iommu->controlled = true; >> mutex_init(&iommu->lock); >> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); >> + init_waitqueue_head(&iommu->vaddr_wait); >> >> return iommu; >> } >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, >> struct vfio_dma *dma; >> bool kthread = current->mm == NULL; >> size_t offset; >> + int ret; >> >> *copied = 0; >> >> - dma = vfio_find_dma(iommu, user_iova, 1); >> - if (!dma) >> - return -EINVAL; >> + ret = vfio_find_vaddr(iommu, user_iova, 1, &dma); >> + if (ret < 0) >> + return ret; >> >> if ((write && !(dma->prot & IOMMU_WRITE)) || >> !(dma->prot & IOMMU_READ)) >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event, >> mutex_lock(&iommu->lock); >> iommu->controlled = false; >> mutex_unlock(&iommu->lock); >> + wake_up_all(&iommu->vaddr_wait); >> } >> >> static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { >
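A minimal sketch of the counter-based wait Steve describes above, not code from the posted series: vfio_iommu would carry a count of vfio_dma objects whose vaddr is invalid, updated under iommu->lock, and callers such as vfio_iommu_type1_pin_pages() and vfio_iommu_replay() would wait for it to reach zero before doing any translation. The names vfio_wait_all_valid and vaddr_invalid_count are illustrative assumptions; the loop mirrors the wait logic already in vfio_vaddr_wait() in the patch.

/*
 * Wait until no vfio_dma has an invalid vaddr. Called with iommu->lock held;
 * the lock is dropped while sleeping and re-taken before returning.
 * (Sketch only: vaddr_invalid_count is an assumed new vfio_iommu field.)
 */
static int vfio_wait_all_valid(struct vfio_iommu *iommu)
{
	DEFINE_WAIT(wait);

	while (iommu->vaddr_invalid_count) {
		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
		mutex_unlock(&iommu->lock);
		schedule();
		mutex_lock(&iommu->lock);
		finish_wait(&iommu->vaddr_wait, &wait);
		if (kthread_should_stop() || !iommu->controlled ||
		    fatal_signal_pending(current))
			return -EFAULT;
	}
	return 0;	/* common case: a single counter test */
}

In this scheme the path that invalidates a vaddr increments the counter, and whatever later re-validates it decrements the counter and calls wake_up_all(&iommu->vaddr_wait), so the fast path stays a single compare.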
On Wed, 27 Jan 2021 18:25:13 -0500 Steven Sistare <steven.sistare@oracle.com> wrote: > On 1/22/2021 5:59 PM, Alex Williamson wrote: > > On Tue, 19 Jan 2021 09:48:29 -0800 > > Steve Sistare <steven.sistare@oracle.com> wrote: > > > >> Block translation of host virtual address while an iova range has an > >> invalid vaddr. > >> > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > >> --- > >> drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++---- > >> 1 file changed, 76 insertions(+), 7 deletions(-) > >> > >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > >> index 0167996..c97573a 100644 > >> --- a/drivers/vfio/vfio_iommu_type1.c > >> +++ b/drivers/vfio/vfio_iommu_type1.c > >> @@ -31,6 +31,7 @@ > >> #include <linux/rbtree.h> > >> #include <linux/sched/signal.h> > >> #include <linux/sched/mm.h> > >> +#include <linux/kthread.h> > >> #include <linux/slab.h> > >> #include <linux/uaccess.h> > >> #include <linux/vfio.h> > >> @@ -75,6 +76,7 @@ struct vfio_iommu { > >> bool dirty_page_tracking; > >> bool pinned_page_dirty_scope; > >> bool controlled; > >> + wait_queue_head_t vaddr_wait; > >> }; > >> > >> struct vfio_domain { > >> @@ -145,6 +147,8 @@ struct vfio_regions { > >> #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) > >> #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) > >> > >> +#define WAITED 1 > >> + > >> static int put_pfn(unsigned long pfn, int prot); > >> > >> static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, > >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > >> } > >> > >> /* > >> + * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task > >> + * waits, but is re-locked on return. If the task waits, then return an updated > >> + * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if > > > > s/1/WAITED/ > > OK, but the WAITED state will not need to be returned in the new scheme below. > > >> + * waited, and -errno on error. > >> + */ > >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p) > >> +{ > >> + struct vfio_dma *dma = *dma_p; > >> + unsigned long iova = dma->iova; > >> + size_t size = dma->size; > >> + int ret = 0; > >> + DEFINE_WAIT(wait); > >> + > >> + while (!dma->vaddr_valid) { > >> + ret = WAITED; > >> + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); > >> + mutex_unlock(&iommu->lock); > >> + schedule(); > >> + mutex_lock(&iommu->lock); > >> + finish_wait(&iommu->vaddr_wait, &wait); > >> + if (kthread_should_stop() || !iommu->controlled || > >> + fatal_signal_pending(current)) { > >> + return -EFAULT; > >> + } > >> + *dma_p = dma = vfio_find_dma(iommu, iova, size); > >> + if (!dma) > >> + return -EINVAL; > >> + } > >> + return ret; > >> +} > >> + > >> +/* > >> + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped > >> + * if the task waits, but is re-locked on return. Return result in *dma_p. > >> + * Return 0 on success, 1 on success if waited, and -errno on error. > >> + */ > >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start, > >> + size_t size, struct vfio_dma **dma_p) > > > > more of a vfio_find_dma_valid() > > I will slightly modify and rename this with the new scheme I describe below. 
> > >> +{ > >> + *dma_p = vfio_find_dma(iommu, start, size); > >> + if (!*dma_p) > >> + return -EINVAL; > >> + return vfio_vaddr_wait(iommu, dma_p); > >> +} > >> + > >> +/* > >> * Attempt to pin pages. We really don't want to track all the pfns and > >> * the iommu can only map chunks of consecutive pfns anyway, so get the > >> * first page and all consecutive pages with the same locking. > >> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, > >> struct vfio_pfn *vpfn; > >> > >> iova = user_pfn[i] << PAGE_SHIFT; > >> - dma = vfio_find_dma(iommu, iova, PAGE_SIZE); > >> - if (!dma) { > >> - ret = -EINVAL; > >> + ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma); > >> + if (ret < 0) > >> goto pin_unwind; > >> - } > >> + else if (ret == WAITED) > >> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); > > > > We're iterating through an array of pfns to provide translations, once > > we've released the lock it's not just the current one that could be > > invalid. I'm afraid we need to unwind and try again, but I think it's > > actually worse than that because if we've marked pages within a > > vfio_dma's pfn_list as pinned, then during the locking gap it gets > > unmapped, the unmap path will call the unmap notifier chain to release > > the page that the vendor driver doesn't have yet. Yikes! > > Yikes indeed. The fix is easy, though. I will maintain a count in vfio_iommu of > vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, > provide a function to wait for the count to become 0, and call that function at the > start of vfio_iommu_type1_pin_pages and vfio_iommu_replay. I will use iommu->vaddr_wait > for wait and wake. I prefer the overhead of this, but the resulting behavior seems pretty non-intuitive. Any invalidated vaddr blocks all vaddr use cases, which almost suggests the unmap _VADDR flag should only be allowed with the _ALL flag, but then the map _VADDR flag can only be per mapping, which would make accounting and recovering from _VADDR + _ALL pretty complicated. Thanks, Alex > >> if ((dma->prot & prot) != prot) { > >> ret = -EPERM; > >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, > >> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > >> int ret; > >> > >> + /* > >> + * Wait for all vaddr to be valid so they can be used in the main loop. > >> + * If we do wait, the lock was dropped and re-taken, so start over to > >> + * ensure the dma list is consistent. > >> + */ > >> +again: > >> + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { > >> + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); > >> + > >> + ret = vfio_vaddr_wait(iommu, &dma); > >> + if (ret < 0) > >> + return ret; > >> + else if (ret == WAITED) > >> + goto again; > >> + } > > > > This "do nothing until all the vaddrs are valid" approach could work > > above too, but gosh is that a lot of cache unfriendly work for a rare > > invalidation. Thanks, > > The new wait function described above is fast in the common case, just a check that > the invalid vaddr count is 0. 
> > - Steve > > >> + > >> /* Arbitrarily pick the first domain in the list for lookups */ > >> if (!list_empty(&iommu->domain_list)) > >> d = list_first_entry(&iommu->domain_list, > >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) > >> iommu->controlled = true; > >> mutex_init(&iommu->lock); > >> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); > >> + init_waitqueue_head(&iommu->vaddr_wait); > >> > >> return iommu; > >> } > >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, > >> struct vfio_dma *dma; > >> bool kthread = current->mm == NULL; > >> size_t offset; > >> + int ret; > >> > >> *copied = 0; > >> > >> - dma = vfio_find_dma(iommu, user_iova, 1); > >> - if (!dma) > >> - return -EINVAL; > >> + ret = vfio_find_vaddr(iommu, user_iova, 1, &dma); > >> + if (ret < 0) > >> + return ret; > >> > >> if ((write && !(dma->prot & IOMMU_WRITE)) || > >> !(dma->prot & IOMMU_READ)) > >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event, > >> mutex_lock(&iommu->lock); > >> iommu->controlled = false; > >> mutex_unlock(&iommu->lock); > >> + wake_up_all(&iommu->vaddr_wait); > >> } > >> > >> static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { > > >
On Wed, 27 Jan 2021 17:03:25 -0700 Alex Williamson <alex.williamson@redhat.com> wrote: > On Wed, 27 Jan 2021 18:25:13 -0500 > Steven Sistare <steven.sistare@oracle.com> wrote: > > > On 1/22/2021 5:59 PM, Alex Williamson wrote: > > > On Tue, 19 Jan 2021 09:48:29 -0800 > > > Steve Sistare <steven.sistare@oracle.com> wrote: > > > > > >> Block translation of host virtual address while an iova range has an > > >> invalid vaddr. > > >> > > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > > >> --- > > >> drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++---- > > >> 1 file changed, 76 insertions(+), 7 deletions(-) > > >> > > >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > >> index 0167996..c97573a 100644 > > >> --- a/drivers/vfio/vfio_iommu_type1.c > > >> +++ b/drivers/vfio/vfio_iommu_type1.c > > >> @@ -31,6 +31,7 @@ > > >> #include <linux/rbtree.h> > > >> #include <linux/sched/signal.h> > > >> #include <linux/sched/mm.h> > > >> +#include <linux/kthread.h> > > >> #include <linux/slab.h> > > >> #include <linux/uaccess.h> > > >> #include <linux/vfio.h> > > >> @@ -75,6 +76,7 @@ struct vfio_iommu { > > >> bool dirty_page_tracking; > > >> bool pinned_page_dirty_scope; > > >> bool controlled; > > >> + wait_queue_head_t vaddr_wait; > > >> }; > > >> > > >> struct vfio_domain { > > >> @@ -145,6 +147,8 @@ struct vfio_regions { > > >> #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) > > >> #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) > > >> > > >> +#define WAITED 1 > > >> + > > >> static int put_pfn(unsigned long pfn, int prot); > > >> > > >> static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, > > >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > > >> } > > >> > > >> /* > > >> + * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task > > >> + * waits, but is re-locked on return. If the task waits, then return an updated > > >> + * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if > > > > > > s/1/WAITED/ > > > > OK, but the WAITED state will not need to be returned in the new scheme below. > > > > >> + * waited, and -errno on error. > > >> + */ > > >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p) > > >> +{ > > >> + struct vfio_dma *dma = *dma_p; > > >> + unsigned long iova = dma->iova; > > >> + size_t size = dma->size; > > >> + int ret = 0; > > >> + DEFINE_WAIT(wait); > > >> + > > >> + while (!dma->vaddr_valid) { > > >> + ret = WAITED; > > >> + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); > > >> + mutex_unlock(&iommu->lock); > > >> + schedule(); > > >> + mutex_lock(&iommu->lock); > > >> + finish_wait(&iommu->vaddr_wait, &wait); > > >> + if (kthread_should_stop() || !iommu->controlled || > > >> + fatal_signal_pending(current)) { > > >> + return -EFAULT; > > >> + } > > >> + *dma_p = dma = vfio_find_dma(iommu, iova, size); > > >> + if (!dma) > > >> + return -EINVAL; > > >> + } > > >> + return ret; > > >> +} > > >> + > > >> +/* > > >> + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped > > >> + * if the task waits, but is re-locked on return. Return result in *dma_p. > > >> + * Return 0 on success, 1 on success if waited, and -errno on error. 
> > >> + */ > > >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start, > > >> + size_t size, struct vfio_dma **dma_p) > > > > > > more of a vfio_find_dma_valid() > > > > I will slightly modify and rename this with the new scheme I describe below. > > > > >> +{ > > >> + *dma_p = vfio_find_dma(iommu, start, size); > > >> + if (!*dma_p) > > >> + return -EINVAL; > > >> + return vfio_vaddr_wait(iommu, dma_p); > > >> +} > > >> + > > >> +/* > > >> * Attempt to pin pages. We really don't want to track all the pfns and > > >> * the iommu can only map chunks of consecutive pfns anyway, so get the > > >> * first page and all consecutive pages with the same locking. > > >> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, > > >> struct vfio_pfn *vpfn; > > >> > > >> iova = user_pfn[i] << PAGE_SHIFT; > > >> - dma = vfio_find_dma(iommu, iova, PAGE_SIZE); > > >> - if (!dma) { > > >> - ret = -EINVAL; > > >> + ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma); > > >> + if (ret < 0) > > >> goto pin_unwind; > > >> - } > > >> + else if (ret == WAITED) > > >> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); > > > > > > We're iterating through an array of pfns to provide translations, once > > > we've released the lock it's not just the current one that could be > > > invalid. I'm afraid we need to unwind and try again, but I think it's > > > actually worse than that because if we've marked pages within a > > > vfio_dma's pfn_list as pinned, then during the locking gap it gets > > > unmapped, the unmap path will call the unmap notifier chain to release > > > the page that the vendor driver doesn't have yet. Yikes! > > > > Yikes indeed. The fix is easy, though. I will maintain a count in vfio_iommu of > > vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, > > provide a function to wait for the count to become 0, and call that function at the > > start of vfio_iommu_type1_pin_pages and vfio_iommu_replay. I will use iommu->vaddr_wait > > for wait and wake. > > I prefer the overhead of this, but the resulting behavior seems pretty > non-intuitive. Any invalidated vaddr blocks all vaddr use cases, which > almost suggests the unmap _VADDR flag should only be allowed with the > _ALL flag, but then the map _VADDR flag can only be per mapping, which > would make accounting and recovering from _VADDR + _ALL pretty > complicated. Thanks, I wonder if there's a hybrid approach, a counter on the vfio_iommu which when non-zero causes pin pages to pre-test vaddr on all required vfio_dma objects, waiting and being woken on counter decrement to check again. Thanks, Alex > > >> if ((dma->prot & prot) != prot) { > > >> ret = -EPERM; > > >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, > > >> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > >> int ret; > > >> > > >> + /* > > >> + * Wait for all vaddr to be valid so they can be used in the main loop. > > >> + * If we do wait, the lock was dropped and re-taken, so start over to > > >> + * ensure the dma list is consistent. 
> > >> + */ > > >> +again: > > >> + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { > > >> + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); > > >> + > > >> + ret = vfio_vaddr_wait(iommu, &dma); > > >> + if (ret < 0) > > >> + return ret; > > >> + else if (ret == WAITED) > > >> + goto again; > > >> + } > > > > > > This "do nothing until all the vaddrs are valid" approach could work > > > above too, but gosh is that a lot of cache unfriendly work for a rare > > > invalidation. Thanks, > > > > The new wait function described above is fast in the common case, just a check that > > the invalid vaddr count is 0. > > > > - Steve > > > > >> + > > >> /* Arbitrarily pick the first domain in the list for lookups */ > > >> if (!list_empty(&iommu->domain_list)) > > >> d = list_first_entry(&iommu->domain_list, > > >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) > > >> iommu->controlled = true; > > >> mutex_init(&iommu->lock); > > >> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); > > >> + init_waitqueue_head(&iommu->vaddr_wait); > > >> > > >> return iommu; > > >> } > > >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, > > >> struct vfio_dma *dma; > > >> bool kthread = current->mm == NULL; > > >> size_t offset; > > >> + int ret; > > >> > > >> *copied = 0; > > >> > > >> - dma = vfio_find_dma(iommu, user_iova, 1); > > >> - if (!dma) > > >> - return -EINVAL; > > >> + ret = vfio_find_vaddr(iommu, user_iova, 1, &dma); > > >> + if (ret < 0) > > >> + return ret; > > >> > > >> if ((write && !(dma->prot & IOMMU_WRITE)) || > > >> !(dma->prot & IOMMU_READ)) > > >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event, > > >> mutex_lock(&iommu->lock); > > >> iommu->controlled = false; > > >> mutex_unlock(&iommu->lock); > > >> + wake_up_all(&iommu->vaddr_wait); > > >> } > > >> > > >> static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { > > > > > >
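A rough sketch of that hybrid, again with illustrative names rather than series code: the counter keeps the common case to one test, and only when some vaddr is invalid does pin_pages walk the request up front, waiting per vfio_dma via the vfio_find_vaddr()/WAITED helpers from the patch, before any page is pinned, so the lock is never dropped after pinning has started.

/*
 * Pre-check every vfio_dma covering the request while nothing is pinned yet.
 * Sketch only: vaddr_invalid_count is an assumed new vfio_iommu field.
 * Called with iommu->lock held; vfio_find_vaddr() may drop and re-take it.
 */
static int vfio_pin_precheck(struct vfio_iommu *iommu,
			     unsigned long *user_pfn, int npage)
{
	int i, ret;

	while (iommu->vaddr_invalid_count) {
		bool waited = false;

		for (i = 0; i < npage; i++) {
			dma_addr_t iova = user_pfn[i] << PAGE_SHIFT;
			struct vfio_dma *dma;

			ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
			if (ret < 0)
				return ret;
			if (ret == WAITED)
				waited = true;	/* lock was dropped */
		}
		if (!waited)
			break;	/* whole pass ran with the lock held */
	}
	return 0;
}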
On 1/28/2021 12:07 PM, Alex Williamson wrote: > On Wed, 27 Jan 2021 17:03:25 -0700 > Alex Williamson <alex.williamson@redhat.com> wrote: > >> On Wed, 27 Jan 2021 18:25:13 -0500 >> Steven Sistare <steven.sistare@oracle.com> wrote: >> >>> On 1/22/2021 5:59 PM, Alex Williamson wrote: >>>> On Tue, 19 Jan 2021 09:48:29 -0800 >>>> Steve Sistare <steven.sistare@oracle.com> wrote: >>>> >>>>> Block translation of host virtual address while an iova range has an >>>>> invalid vaddr. >>>>> >>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >>>>> --- >>>>> drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++---- >>>>> 1 file changed, 76 insertions(+), 7 deletions(-) >>>>> >>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c >>>>> index 0167996..c97573a 100644 >>>>> --- a/drivers/vfio/vfio_iommu_type1.c >>>>> +++ b/drivers/vfio/vfio_iommu_type1.c >>>>> @@ -31,6 +31,7 @@ >>>>> #include <linux/rbtree.h> >>>>> #include <linux/sched/signal.h> >>>>> #include <linux/sched/mm.h> >>>>> +#include <linux/kthread.h> >>>>> #include <linux/slab.h> >>>>> #include <linux/uaccess.h> >>>>> #include <linux/vfio.h> >>>>> @@ -75,6 +76,7 @@ struct vfio_iommu { >>>>> bool dirty_page_tracking; >>>>> bool pinned_page_dirty_scope; >>>>> bool controlled; >>>>> + wait_queue_head_t vaddr_wait; >>>>> }; >>>>> >>>>> struct vfio_domain { >>>>> @@ -145,6 +147,8 @@ struct vfio_regions { >>>>> #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) >>>>> #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) >>>>> >>>>> +#define WAITED 1 >>>>> + >>>>> static int put_pfn(unsigned long pfn, int prot); >>>>> >>>>> static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, >>>>> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, >>>>> } >>>>> >>>>> /* >>>>> + * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task >>>>> + * waits, but is re-locked on return. If the task waits, then return an updated >>>>> + * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if >>>> >>>> s/1/WAITED/ >>> >>> OK, but the WAITED state will not need to be returned in the new scheme below. >>> >>>>> + * waited, and -errno on error. >>>>> + */ >>>>> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p) >>>>> +{ >>>>> + struct vfio_dma *dma = *dma_p; >>>>> + unsigned long iova = dma->iova; >>>>> + size_t size = dma->size; >>>>> + int ret = 0; >>>>> + DEFINE_WAIT(wait); >>>>> + >>>>> + while (!dma->vaddr_valid) { >>>>> + ret = WAITED; >>>>> + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); >>>>> + mutex_unlock(&iommu->lock); >>>>> + schedule(); >>>>> + mutex_lock(&iommu->lock); >>>>> + finish_wait(&iommu->vaddr_wait, &wait); >>>>> + if (kthread_should_stop() || !iommu->controlled || >>>>> + fatal_signal_pending(current)) { >>>>> + return -EFAULT; >>>>> + } >>>>> + *dma_p = dma = vfio_find_dma(iommu, iova, size); >>>>> + if (!dma) >>>>> + return -EINVAL; >>>>> + } >>>>> + return ret; >>>>> +} >>>>> + >>>>> +/* >>>>> + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped >>>>> + * if the task waits, but is re-locked on return. Return result in *dma_p. >>>>> + * Return 0 on success, 1 on success if waited, and -errno on error. 
>>>>> + */ >>>>> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start, >>>>> + size_t size, struct vfio_dma **dma_p) >>>> >>>> more of a vfio_find_dma_valid() >>> >>> I will slightly modify and rename this with the new scheme I describe below. >>> >>>>> +{ >>>>> + *dma_p = vfio_find_dma(iommu, start, size); >>>>> + if (!*dma_p) >>>>> + return -EINVAL; >>>>> + return vfio_vaddr_wait(iommu, dma_p); >>>>> +} >>>>> + >>>>> +/* >>>>> * Attempt to pin pages. We really don't want to track all the pfns and >>>>> * the iommu can only map chunks of consecutive pfns anyway, so get the >>>>> * first page and all consecutive pages with the same locking. >>>>> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, >>>>> struct vfio_pfn *vpfn; >>>>> >>>>> iova = user_pfn[i] << PAGE_SHIFT; >>>>> - dma = vfio_find_dma(iommu, iova, PAGE_SIZE); >>>>> - if (!dma) { >>>>> - ret = -EINVAL; >>>>> + ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma); >>>>> + if (ret < 0) >>>>> goto pin_unwind; >>>>> - } >>>>> + else if (ret == WAITED) >>>>> + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); >>>> >>>> We're iterating through an array of pfns to provide translations, once >>>> we've released the lock it's not just the current one that could be >>>> invalid. I'm afraid we need to unwind and try again, but I think it's >>>> actually worse than that because if we've marked pages within a >>>> vfio_dma's pfn_list as pinned, then during the locking gap it gets >>>> unmapped, the unmap path will call the unmap notifier chain to release >>>> the page that the vendor driver doesn't have yet. Yikes! >>> >>> Yikes indeed. The fix is easy, though. I will maintain a count in vfio_iommu of >>> vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, >>> provide a function to wait for the count to become 0, and call that function at the >>> start of vfio_iommu_type1_pin_pages and vfio_iommu_replay. I will use iommu->vaddr_wait >>> for wait and wake. >> >> I prefer the overhead of this, but the resulting behavior seems pretty >> non-intuitive. Any invalidated vaddr blocks all vaddr use cases, which >> almost suggests the unmap _VADDR flag should only be allowed with the >> _ALL flag, but then the map _VADDR flag can only be per mapping, which >> would make accounting and recovering from _VADDR + _ALL pretty >> complicated. Thanks, > > I wonder if there's a hybrid approach, a counter on the vfio_iommu > which when non-zero causes pin pages to pre-test vaddr on all required > vfio_dma objects, waiting and being woken on counter decrement to check > again. Thanks, Sounds good, thanks. - Steve >>>>> if ((dma->prot & prot) != prot) { >>>>> ret = -EPERM; >>>>> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, >>>>> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; >>>>> int ret; >>>>> >>>>> + /* >>>>> + * Wait for all vaddr to be valid so they can be used in the main loop. >>>>> + * If we do wait, the lock was dropped and re-taken, so start over to >>>>> + * ensure the dma list is consistent. 
>>>>> + */ >>>>> +again: >>>>> + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { >>>>> + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); >>>>> + >>>>> + ret = vfio_vaddr_wait(iommu, &dma); >>>>> + if (ret < 0) >>>>> + return ret; >>>>> + else if (ret == WAITED) >>>>> + goto again; >>>>> + } >>>> >>>> This "do nothing until all the vaddrs are valid" approach could work >>>> above too, but gosh is that a lot of cache unfriendly work for a rare >>>> invalidation. Thanks, >>> >>> The new wait function described above is fast in the common case, just a check that >>> the invalid vaddr count is 0. >>> >>> - Steve >>> >>>>> + >>>>> /* Arbitrarily pick the first domain in the list for lookups */ >>>>> if (!list_empty(&iommu->domain_list)) >>>>> d = list_first_entry(&iommu->domain_list, >>>>> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) >>>>> iommu->controlled = true; >>>>> mutex_init(&iommu->lock); >>>>> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); >>>>> + init_waitqueue_head(&iommu->vaddr_wait); >>>>> >>>>> return iommu; >>>>> } >>>>> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, >>>>> struct vfio_dma *dma; >>>>> bool kthread = current->mm == NULL; >>>>> size_t offset; >>>>> + int ret; >>>>> >>>>> *copied = 0; >>>>> >>>>> - dma = vfio_find_dma(iommu, user_iova, 1); >>>>> - if (!dma) >>>>> - return -EINVAL; >>>>> + ret = vfio_find_vaddr(iommu, user_iova, 1, &dma); >>>>> + if (ret < 0) >>>>> + return ret; >>>>> >>>>> if ((write && !(dma->prot & IOMMU_WRITE)) || >>>>> !(dma->prot & IOMMU_READ)) >>>>> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event, >>>>> mutex_lock(&iommu->lock); >>>>> iommu->controlled = false; >>>>> mutex_unlock(&iommu->lock); >>>>> + wake_up_all(&iommu->vaddr_wait); >>>>> } >>>>> >>>>> static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { >>>> >>> >> >
Block translation of host virtual address while an iova range has an
invalid vaddr.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0167996..c97573a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
@@ -75,6 +76,7 @@ struct vfio_iommu {
 	bool	dirty_page_tracking;
 	bool	pinned_page_dirty_scope;
 	bool	controlled;
+	wait_queue_head_t	vaddr_wait;
 };
 
 struct vfio_domain {
@@ -145,6 +147,8 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define WAITED 1
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 }
 
 /*
+ * Wait for vaddr of *dma_p to become valid. iommu lock is dropped if the task
+ * waits, but is re-locked on return. If the task waits, then return an updated
+ * dma struct in *dma_p. Return 0 on success with no waiting, 1 on success if
+ * waited, and -errno on error.
+ */
+static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
+{
+	struct vfio_dma *dma = *dma_p;
+	unsigned long iova = dma->iova;
+	size_t size = dma->size;
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	while (!dma->vaddr_valid) {
+		ret = WAITED;
+		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
+		mutex_unlock(&iommu->lock);
+		schedule();
+		mutex_lock(&iommu->lock);
+		finish_wait(&iommu->vaddr_wait, &wait);
+		if (kthread_should_stop() || !iommu->controlled ||
+		    fatal_signal_pending(current)) {
+			return -EFAULT;
+		}
+		*dma_p = dma = vfio_find_dma(iommu, iova, size);
+		if (!dma)
+			return -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped
+ * if the task waits, but is re-locked on return. Return result in *dma_p.
+ * Return 0 on success, 1 on success if waited, and -errno on error.
+ */
+static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
+			   size_t size, struct vfio_dma **dma_p)
+{
+	*dma_p = vfio_find_dma(iommu, start, size);
+	if (!*dma_p)
+		return -EINVAL;
+	return vfio_vaddr_wait(iommu, dma_p);
+}
+
+/*
  * Attempt to pin pages. We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
@@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 		struct vfio_pfn *vpfn;
 
 		iova = user_pfn[i] << PAGE_SHIFT;
-		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
-		if (!dma) {
-			ret = -EINVAL;
+		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
+		if (ret < 0)
 			goto pin_unwind;
-		}
+		else if (ret == WAITED)
+			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 
 		if ((dma->prot & prot) != prot) {
 			ret = -EPERM;
@@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret;
 
+	/*
+	 * Wait for all vaddr to be valid so they can be used in the main loop.
+	 * If we do wait, the lock was dropped and re-taken, so start over to
+	 * ensure the dma list is consistent.
+	 */
+again:
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		ret = vfio_vaddr_wait(iommu, &dma);
+		if (ret < 0)
+			return ret;
+		else if (ret == WAITED)
+			goto again;
+	}
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	if (!list_empty(&iommu->domain_list))
 		d = list_first_entry(&iommu->domain_list,
@@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	iommu->controlled = true;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+	init_waitqueue_head(&iommu->vaddr_wait);
 
 	return iommu;
 }
@@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 	struct vfio_dma *dma;
 	bool kthread = current->mm == NULL;
 	size_t offset;
+	int ret;
 
 	*copied = 0;
 
-	dma = vfio_find_dma(iommu, user_iova, 1);
-	if (!dma)
-		return -EINVAL;
+	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
+	if (ret < 0)
+		return ret;
 
 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
 	    !(dma->prot & IOMMU_READ))
@@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
 	mutex_lock(&iommu->lock);
 	iommu->controlled = false;
 	mutex_unlock(&iommu->lock);
+	wake_up_all(&iommu->vaddr_wait);
 }
 
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
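For completeness, the wake side: this patch only wakes waiters when the container stops being vaddr-controlled (vfio_iommu_type1_notify()). The companion path in the series that accepts an updated vaddr is presumably expected to mark the mapping valid again and wake the same queue; a hypothetical sketch, not code from this patch:

/*
 * Illustration only: re-validate a mapping after userspace supplies a new
 * vaddr, waking anyone blocked in vfio_vaddr_wait(). Called under iommu->lock.
 */
static void vfio_dma_revalidate(struct vfio_iommu *iommu, struct vfio_dma *dma,
				unsigned long new_vaddr)
{
	dma->vaddr = new_vaddr;
	dma->vaddr_valid = true;
	wake_up_all(&iommu->vaddr_wait);
}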