[RFC,v2,3/4] vfio iommu: Add support for mediated devices

Message ID	1472804172-25542-4-git-send-email-jike.song@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> From: Jike Song <jike.song@intel.com> To: alex.williamson@redhat.com, kwankhede@nvidia.com, cjia@nvidia.com Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org, bjsdjshi@linux.vnet.ibm.com, kevin.tian@intel.com, guangrong.xiao@linux.intel.com, zhenyuw@linux.intel.com, zhiyuan.lv@intel.com, jike.song@intel.com, pbonzini@redhat.com, kraxel@redhat.com Subject: [RFC v2 3/4] vfio iommu: Add support for mediated devices Date: Fri, 2 Sep 2016 16:16:11 +0800 Message-Id: <1472804172-25542-4-git-send-email-jike.song@intel.com> In-Reply-To: <1472804172-25542-1-git-send-email-jike.song@intel.com> References: <1472804172-25542-1-git-send-email-jike.song@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index d1d70e0..397c4be 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1782,6 +1782,88 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) } EXPORT_SYMBOL_GPL(vfio_info_cap_shift); +/* + * Pin a set of guest PFNs and return their associated host PFNs for mediated + * domain only. + * @user_pfn [in]: array of user/guest PFNs + * @npage [in]: count of array elements + * @prot [in] : protection flags + * @phys_pfn[out] : array of host PFNs + */ +long vfio_pin_pages(struct mdev_device *mdev, unsigned long *user_pfn, + long npage, int prot, unsigned long *phys_pfn) +{ + struct vfio_device *device; + struct vfio_container *container; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; + + if (!mdev || !user_pfn || !phys_pfn) + return -EINVAL; + + device = dev_get_drvdata(&mdev->dev); + + if (!device || !device->group) + return -EINVAL; + + container = device->group->container; + + if (!container) + return -EINVAL; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + npage, prot, phys_pfn); + + up_read(&container->group_lock); + + return ret; + +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin set of host PFNs for mediated domain only. + * @pfn [in] : array of host PFNs to be unpinned. + * @npage [in] :count of elements in array, that is number of pages. 
+ */ +long vfio_unpin_pages(struct mdev_device *mdev, unsigned long *pfn, long npage) +{ + struct vfio_device *device; + struct vfio_container *container; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; + + if (!mdev || !pfn) + return -EINVAL; + + device = dev_get_drvdata(&mdev->dev); + + if (!device || !device->group) + return -EINVAL; + + container = device->group->container; + + if (!container) + return -EINVAL; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, pfn, + npage); + + up_read(&container->group_lock); + + return ret; + +} +EXPORT_SYMBOL(vfio_unpin_pages); + /** * Module/class support */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2ba1942..3cd8098 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages, struct vfio_iommu { struct list_head domain_list; + struct vfio_domain *mediated_domain; struct mutex lock; struct rb_root dma_list; bool v2; bool nesting; }; +struct mdev_addr_space { + struct task_struct *task; + struct rb_root pfn_list; /* pinned Host pfn list */ + struct mutex pfn_list_lock; /* mutex for pfn_list */ +}; + struct vfio_domain { struct iommu_domain *domain; struct list_head next; struct list_head group_list; int prot; /* IOMMU_CACHE */ bool fgsp; /* Fine-grained super pages */ + struct mdev_addr_space *mdev_addr_space; }; struct vfio_dma { @@ -83,6 +91,22 @@ struct vfio_group { }; /* + * Guest RAM pinning working set or DMA target + */ +struct vfio_pfn { + struct rb_node node; + unsigned long vaddr; /* virtual addr */ + dma_addr_t iova; /* IOVA */ + unsigned long pfn; /* Host pfn */ + size_t prot; + atomic_t ref_count; +}; + + +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu) \ + (list_empty(&iommu->domain_list) ? 
false : true) + +/* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU */ @@ -130,6 +154,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } +/* + * Helper Functions for host pfn list + */ + +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain, + unsigned long pfn) +{ + struct rb_node *node; + struct vfio_pfn *vpfn, *ret = NULL; + + node = domain->mdev_addr_space->pfn_list.rb_node; + + while (node) { + vpfn = rb_entry(node, struct vfio_pfn, node); + + if (pfn < vpfn->pfn) + node = node->rb_left; + else if (pfn > vpfn->pfn) + node = node->rb_right; + else { + ret = vpfn; + break; + } + } + + return ret; +} + +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new) +{ + struct rb_node **link, *parent = NULL; + struct vfio_pfn *vpfn; + + link = &domain->mdev_addr_space->pfn_list.rb_node; + while (*link) { + parent = *link; + vpfn = rb_entry(parent, struct vfio_pfn, node); + + if (new->pfn < vpfn->pfn) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &domain->mdev_addr_space->pfn_list); +} + +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old) +{ + rb_erase(&old->node, &domain->mdev_addr_space->pfn_list); +} + +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr, + dma_addr_t iova, unsigned long pfn, size_t prot) +{ + struct vfio_pfn *vpfn; + + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); + if (!vpfn) + return -ENOMEM; + + vpfn->vaddr = vaddr; + vpfn->iova = iova; + vpfn->pfn = pfn; + vpfn->prot = prot; + atomic_set(&vpfn->ref_count, 1); + vfio_link_pfn(domain, vpfn); + return 0; +} + +static void vfio_remove_from_pfn_list(struct vfio_domain *domain, + struct vfio_pfn *vpfn) +{ + vfio_unlink_pfn(domain, vpfn); + kfree(vpfn); +} + struct vwork { struct mm_struct *mm; long 
npage; @@ -150,17 +252,17 @@ static void vfio_lock_acct_bg(struct work_struct *work) kfree(vwork); } -static void vfio_lock_acct(long npage) +static void vfio_lock_acct(struct task_struct *task, long npage) { struct vwork *vwork; struct mm_struct *mm; - if (!current->mm || !npage) + if (!task->mm || !npage) return; /* process exited or nothing to do */ - if (down_write_trylock(&current->mm->mmap_sem)) { - current->mm->locked_vm += npage; - up_write(&current->mm->mmap_sem); + if (down_write_trylock(&task->mm->mmap_sem)) { + task->mm->locked_vm += npage; + up_write(&task->mm->mmap_sem); return; } @@ -172,7 +274,7 @@ static void vfio_lock_acct(long npage) vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); if (!vwork) return; - mm = get_task_mm(current); + mm = get_task_mm(task); if (!mm) { kfree(vwork); return; @@ -228,20 +330,31 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, + int prot, unsigned long *pfn) { struct page *page[1]; struct vm_area_struct *vma; + struct mm_struct *local_mm = mm ? 
mm : current->mm; int ret = -EFAULT; - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { + if (mm) { + down_read(&local_mm->mmap_sem); + ret = get_user_pages_remote(NULL, local_mm, vaddr, 1, + !!(prot & IOMMU_WRITE), 0, page, NULL); + up_read(&local_mm->mmap_sem); + } else + ret = get_user_pages_fast(vaddr, 1, + !!(prot & IOMMU_WRITE), page); + + if (ret == 1) { *pfn = page_to_pfn(page[0]); return 0; } - down_read(&current->mm->mmap_sem); + down_read(&local_mm->mmap_sem); - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + vma = find_vma_intersection(local_mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -249,7 +362,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) ret = 0; } - up_read(&current->mm->mmap_sem); + up_read(&local_mm->mmap_sem); return ret; } @@ -259,8 +372,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) * the iommu can only map chunks of consecutive pfns anyway, so get the * first page and all consecutive pages with the same locking. 
*/ -static long vfio_pin_pages(unsigned long vaddr, long npage, - int prot, unsigned long *pfn_base) +static long __vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; bool lock_cap = capable(CAP_IPC_LOCK); @@ -270,7 +383,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (!current->mm) return -ENODEV; - ret = vaddr_get_pfn(vaddr, prot, pfn_base); + ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base); if (ret) return ret; @@ -285,7 +398,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (unlikely(disable_hugepages)) { if (!rsvd) - vfio_lock_acct(1); + vfio_lock_acct(current, 1); return 1; } @@ -293,7 +406,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; - ret = vaddr_get_pfn(vaddr, prot, &pfn); + ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn); if (ret) break; @@ -313,13 +426,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, } if (!rsvd) - vfio_lock_acct(i); + vfio_lock_acct(current, i); return i; } -static long vfio_unpin_pages(unsigned long pfn, long npage, - int prot, bool do_accounting) +static long __vfio_unpin_pages(unsigned long pfn, long npage, int prot, + bool do_accounting) { unsigned long unlocked = 0; long i; @@ -328,7 +441,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage, unlocked += put_pfn(pfn++, prot); if (do_accounting) - vfio_lock_acct(-unlocked); + vfio_lock_acct(current, -unlocked); + return unlocked; +} + +static long __vfio_pin_pages_for_mdev(struct vfio_domain *domain, + unsigned long vaddr, int prot, + unsigned long *pfn_base, + bool do_accounting) +{ + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret; + bool rsvd; + struct task_struct *task = domain->mdev_addr_space->task; + + if (!task->mm) + return -ENODEV; + + ret = 
vaddr_get_pfn(task->mm, vaddr, prot, pfn_base); + if (ret) + return ret; + + rsvd = is_invalid_reserved_pfn(*pfn_base); + + if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + if (!rsvd && do_accounting) + vfio_lock_acct(task, 1); + + return 1; +} + +static void __vfio_unpin_pages_for_mdev(struct vfio_domain *domain, + unsigned long pfn, int prot, + bool do_accounting) +{ + put_pfn(pfn, prot); + + if (do_accounting) + vfio_lock_acct(domain->mdev_addr_space->task, -1); +} + +static int vfio_unpin_pfn(struct vfio_domain *domain, + struct vfio_pfn *vpfn, bool do_accounting) +{ + __vfio_unpin_pages_for_mdev(domain, vpfn->pfn, vpfn->prot, + do_accounting); + + if (atomic_dec_and_test(&vpfn->ref_count)) + vfio_remove_from_pfn_list(domain, vpfn); + + return 1; +} + +static long vfio_iommu_type1_pin_pages(void *iommu_data, + unsigned long *user_pfn, + long npage, int prot, + unsigned long *phys_pfn) +{ + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain; + int i, j, ret; + long retpage; + unsigned long remote_vaddr; + unsigned long *pfn = phys_pfn; + struct vfio_dma *dma; + bool do_accounting = false; + + if (!iommu || !user_pfn || !phys_pfn) + return -EINVAL; + + mutex_lock(&iommu->lock); + + if (!iommu->mediated_domain) { + ret = -EINVAL; + goto pin_done; + } + + domain = iommu->mediated_domain; + + /* + * If iommu capable domain exist in the container then all pages are + * already pinned and accounted. Accounting should be done if there is no + * iommu capable domain in the container.
+ */ + do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu); + + for (i = 0; i < npage; i++) { + struct vfio_pfn *p; + dma_addr_t iova; + + iova = user_pfn[i] << PAGE_SHIFT; + + dma = vfio_find_dma(iommu, iova, 0); + if (!dma) { + ret = -EINVAL; + goto pin_unwind; + } + + remote_vaddr = dma->vaddr + iova - dma->iova; + + retpage = __vfio_pin_pages_for_mdev(domain, remote_vaddr, prot, + &pfn[i], do_accounting); + if (retpage <= 0) { + WARN_ON(!retpage); + ret = (int)retpage; + goto pin_unwind; + } + + mutex_lock(&domain->mdev_addr_space->pfn_list_lock); + + /* search if pfn exist */ + p = vfio_find_pfn(domain, pfn[i]); + if (p) { + atomic_inc(&p->ref_count); + mutex_unlock(&domain->mdev_addr_space->pfn_list_lock); + continue; + } + + ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova, + pfn[i], prot); + mutex_unlock(&domain->mdev_addr_space->pfn_list_lock); + + if (ret) { + __vfio_unpin_pages_for_mdev(domain, pfn[i], prot, + do_accounting); + goto pin_unwind; + } + } + + ret = i; + goto pin_done; + +pin_unwind: + pfn[i] = 0; + mutex_lock(&domain->mdev_addr_space->pfn_list_lock); + for (j = 0; j < i; j++) { + struct vfio_pfn *p; + + p = vfio_find_pfn(domain, pfn[j]); + if (p) + vfio_unpin_pfn(domain, p, do_accounting); + + pfn[j] = 0; + } + mutex_unlock(&domain->mdev_addr_space->pfn_list_lock); + +pin_done: + mutex_unlock(&iommu->lock); + return ret; +} + +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn, + long npage) +{ + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain = NULL; + long unlocked = 0; + int i; + + if (!iommu || !pfn) + return -EINVAL; + + domain = iommu->mediated_domain; + + for (i = 0; i < npage; i++) { + struct vfio_pfn *p; + + mutex_lock(&domain->mdev_addr_space->pfn_list_lock); + + /* verify if pfn exist in pfn_list */ + p = vfio_find_pfn(domain, pfn[i]); + if (p) + unlocked += vfio_unpin_pfn(domain, p, true); + + mutex_unlock(&domain->mdev_addr_space->pfn_list_lock); + } return unlocked; 
} @@ -341,6 +635,9 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) if (!dma->size) return; + + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) + return; /* * We use the IOMMU to track the physical addresses, otherwise we'd * need a much more complicated tracking system. Unfortunately that @@ -382,15 +679,15 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) if (WARN_ON(!unmapped)) break; - unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, - unmapped >> PAGE_SHIFT, - dma->prot, false); + unlocked += __vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); iova += unmapped; cond_resched(); } - vfio_lock_acct(-unlocked); + vfio_lock_acct(current, -unlocked); } static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) @@ -611,10 +908,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, /* Insert zero-sized and grow as we map chunks of it */ vfio_link_dma(iommu, dma); + /* Don't pin and map if container doesn't contain IOMMU capable domain*/ + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) { + dma->size = size; + goto map_done; + } + while (size) { /* Pin a contiguous chunk of memory */ - npage = vfio_pin_pages(vaddr + dma->size, - size >> PAGE_SHIFT, prot, &pfn); + npage = __vfio_pin_pages(vaddr + dma->size, + size >> PAGE_SHIFT, prot, &pfn); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -624,7 +927,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, /* Map it! 
*/ ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot); if (ret) { - vfio_unpin_pages(pfn, npage, prot, true); + __vfio_unpin_pages(pfn, npage, prot, true); break; } @@ -635,6 +938,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (ret) vfio_remove_dma(iommu, dma); +map_done: mutex_unlock(&iommu->lock); return ret; } @@ -734,11 +1038,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) __free_pages(pages, order); } +static struct vfio_group *find_iommu_group(struct vfio_domain *domain, + struct iommu_group *iommu_group) +{ + struct vfio_group *g; + + list_for_each_entry(g, &domain->group_list, next) { + if (g->iommu_group == iommu_group) + return g; + } + + return NULL; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group, *g; + struct vfio_group *group; struct vfio_domain *domain, *d; struct bus_type *bus = NULL; int ret; @@ -746,10 +1063,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, mutex_lock(&iommu->lock); list_for_each_entry(d, &iommu->domain_list, next) { - list_for_each_entry(g, &d->group_list, next) { - if (g->iommu_group != iommu_group) - continue; + if (find_iommu_group(d, iommu_group)) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + } + if (iommu->mediated_domain) { + if (find_iommu_group(iommu->mediated_domain, iommu_group)) { mutex_unlock(&iommu->lock); return -EINVAL; } @@ -769,6 +1090,34 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_free; +#if defined(CONFIG_VFIO_MDEV) || defined(CONFIG_VFIO_MDEV_MODULE) + if (!iommu_present(bus) && (bus == &mdev_bus_type)) { + if (iommu->mediated_domain) { + list_add(&group->next, + &iommu->mediated_domain->group_list); + kfree(domain); + mutex_unlock(&iommu->lock); + return 0; + } + + domain->mdev_addr_space = kzalloc(sizeof(*domain->mdev_addr_space), + GFP_KERNEL); + if (!domain->mdev_addr_space) { + ret = 
-ENOMEM; + goto out_free; + } + + domain->mdev_addr_space->task = current; + INIT_LIST_HEAD(&domain->group_list); + list_add(&group->next, &domain->group_list); + domain->mdev_addr_space->pfn_list = RB_ROOT; + mutex_init(&domain->mdev_addr_space->pfn_list_lock); + iommu->mediated_domain = domain; + mutex_unlock(&iommu->lock); + return 0; + } +#endif + domain->domain = iommu_domain_alloc(bus); if (!domain->domain) { ret = -EIO; @@ -859,6 +1208,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); } +static void vfio_mdev_unpin_all(struct vfio_domain *domain) +{ + struct rb_node *node; + + mutex_lock(&domain->mdev_addr_space->pfn_list_lock); + while ((node = rb_first(&domain->mdev_addr_space->pfn_list))) { + vfio_unpin_pfn(domain, + rb_entry(node, struct vfio_pfn, node), false); + } + mutex_unlock(&domain->mdev_addr_space->pfn_list_lock); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -868,31 +1229,52 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, mutex_lock(&iommu->lock); - list_for_each_entry(domain, &iommu->domain_list, next) { - list_for_each_entry(group, &domain->group_list, next) { - if (group->iommu_group != iommu_group) - continue; + if (iommu->mediated_domain) { + domain = iommu->mediated_domain; + group = find_iommu_group(domain, iommu_group); + if (group) { + list_del(&group->next); + kfree(group); + if (list_empty(&domain->group_list)) { + vfio_mdev_unpin_all(domain); + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) + vfio_iommu_unmap_unpin_all(iommu); + kfree(domain); + iommu->mediated_domain = NULL; + } + goto detach_group_done; + } + } + + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) + goto detach_group_done; + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (group) { iommu_detach_group(domain->domain, iommu_group); 
list_del(&group->next); kfree(group); /* * Group ownership provides privilege, if the group * list is empty, the domain goes away. If it's the - * last domain, then all the mappings go away too. + * last domain with iommu and mediated domain doesn't + * exist, then all the mappings go away too. */ if (list_empty(&domain->group_list)) { - if (list_is_singular(&iommu->domain_list)) + if (list_is_singular(&iommu->domain_list) && + !iommu->mediated_domain) vfio_iommu_unmap_unpin_all(iommu); iommu_domain_free(domain->domain); list_del(&domain->next); kfree(domain); } - goto done; + break; } } -done: +detach_group_done: mutex_unlock(&iommu->lock); } @@ -924,27 +1306,48 @@ static void *vfio_iommu_type1_open(unsigned long arg) return iommu; } +static void vfio_release_domain(struct vfio_domain *domain) +{ + struct vfio_group *group, *group_tmp; + + list_for_each_entry_safe(group, group_tmp, + &domain->group_list, next) { + if (!domain->mdev_addr_space) + iommu_detach_group(domain->domain, group->iommu_group); + list_del(&group->next); + kfree(group); + } + + if (domain->mdev_addr_space) + vfio_mdev_unpin_all(domain); + else + iommu_domain_free(domain->domain); +} + static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain, *domain_tmp; - struct vfio_group *group, *group_tmp; + + if (iommu->mediated_domain) { + vfio_release_domain(iommu->mediated_domain); + kfree(iommu->mediated_domain); + iommu->mediated_domain = NULL; + } vfio_iommu_unmap_unpin_all(iommu); + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) + goto release_exit; + list_for_each_entry_safe(domain, domain_tmp, &iommu->domain_list, next) { - list_for_each_entry_safe(group, group_tmp, - &domain->group_list, next) { - iommu_detach_group(domain->domain, group->iommu_group); - list_del(&group->next); - kfree(group); - } - iommu_domain_free(domain->domain); + vfio_release_domain(domain); list_del(&domain->next); kfree(domain); } +release_exit:
kfree(iommu); } @@ -1048,6 +1451,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .ioctl = vfio_iommu_type1_ioctl, .attach_group = vfio_iommu_type1_attach_group, .detach_group = vfio_iommu_type1_detach_group, + .pin_pages = vfio_iommu_type1_pin_pages, + .unpin_pages = vfio_iommu_type1_unpin_pages, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0ecae0b..f2f0daf 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -17,6 +17,7 @@ #include <linux/workqueue.h> #include <linux/poll.h> #include <uapi/linux/vfio.h> +#include <linux/mdev.h> /** * struct vfio_device_ops - VFIO bus driver device callbacks @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - + long (*pin_pages)(void *iommu_data, unsigned long *user_pfn, + long npage, int prot, + unsigned long *phys_pfn); + long (*unpin_pages)(void *iommu_data, unsigned long *pfn, + long npage); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, } #endif /* CONFIG_EEH */ +extern long vfio_pin_pages(struct mdev_device *mdev, unsigned long *user_pfn, + long npage, int prot, unsigned long *phys_pfn); + +extern long vfio_unpin_pages(struct mdev_device *mdev, unsigned long *pfn, + long npage); + /* * IRQfd - generic */

[RFC,v2,3/4] vfio iommu: Add support for mediated devices

Commit Message

Patch