diff mbox series

[v5,08/11] vduse: Implement an MMU-based IOMMU driver

Message ID 20210315053721.189-9-xieyongji@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Introduce VDUSE - vDPA Device in Userspace | expand

Commit Message

Yongji Xie March 15, 2021, 5:37 a.m. UTC
This implements an MMU-based IOMMU driver to support mapping
kernel dma buffer into userspace. The basic idea behind it is
treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
up MMU mapping instead of IOMMU mapping for the DMA transfer so
that the userspace process is able to use its virtual address to
access the dma buffer in kernel.

And to avoid security issue, a bounce-buffering mechanism is
introduced to prevent userspace accessing the original buffer
directly.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
 drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
 2 files changed, 610 insertions(+)
 create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
 create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h

Comments

Jason Wang March 24, 2021, 3:54 a.m. UTC | #1
在 2021/3/15 下午1:37, Xie Yongji 写道:
> This implements an MMU-based IOMMU driver to support mapping
> kernel dma buffer into userspace. The basic idea behind it is
> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
> up MMU mapping instead of IOMMU mapping for the DMA transfer so
> that the userspace process is able to use its virtual address to
> access the dma buffer in kernel.
>
> And to avoid security issue, a bounce-buffering mechanism is
> introduced to prevent userspace accessing the original buffer
> directly.
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>   drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
>   drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
>   2 files changed, 610 insertions(+)
>   create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
>   create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
>
> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> new file mode 100644
> index 000000000000..83de216b0e51
> --- /dev/null
> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> @@ -0,0 +1,535 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * MMU-based IOMMU implementation
> + *
> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.


2021 as well.


> + *
> + * Author: Xie Yongji <xieyongji@bytedance.com>
> + *
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/highmem.h>
> +#include <linux/vmalloc.h>
> +#include <linux/vdpa.h>
> +
> +#include "iova_domain.h"
> +
> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
> +				 u64 start, u64 last,
> +				 u64 addr, unsigned int perm,
> +				 struct file *file, u64 offset)
> +{
> +	struct vdpa_map_file *map_file;
> +	int ret;
> +
> +	map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
> +	if (!map_file)
> +		return -ENOMEM;
> +
> +	map_file->file = get_file(file);
> +	map_file->offset = offset;
> +
> +	ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
> +					addr, perm, map_file);
> +	if (ret) {
> +		fput(map_file->file);
> +		kfree(map_file);
> +		return ret;
> +	}
> +	return 0;
> +}
> +
> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
> +				  u64 start, u64 last)
> +{
> +	struct vdpa_map_file *map_file;
> +	struct vhost_iotlb_map *map;
> +
> +	while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
> +		map_file = (struct vdpa_map_file *)map->opaque;
> +		fput(map_file->file);
> +		kfree(map_file);
> +		vhost_iotlb_map_free(domain->iotlb, map);
> +	}
> +}
> +
> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> +			 struct vhost_iotlb *iotlb)
> +{
> +	struct vdpa_map_file *map_file;
> +	struct vhost_iotlb_map *map;
> +	u64 start = 0ULL, last = ULLONG_MAX;
> +	int ret;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	vduse_iotlb_del_range(domain, start, last);
> +
> +	for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
> +	     map = vhost_iotlb_itree_next(map, start, last)) {
> +		map_file = (struct vdpa_map_file *)map->opaque;
> +		ret = vduse_iotlb_add_range(domain, map->start, map->last,
> +					    map->addr, map->perm,
> +					    map_file->file,
> +					    map_file->offset);
> +		if (ret)
> +			goto err;
> +	}
> +	spin_unlock(&domain->iotlb_lock);
> +
> +	return 0;
> +err:
> +	vduse_iotlb_del_range(domain, start, last);
> +	spin_unlock(&domain->iotlb_lock);
> +	return ret;
> +}
> +
> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> +					 u64 iova, u64 size, u64 paddr)
> +{
> +	struct vduse_bounce_map *map;
> +	unsigned int index;
> +	u64 last = iova + size - 1;
> +
> +	while (iova < last) {
> +		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> +		index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> +		map->orig_phys[index] = paddr;
> +		paddr += IOVA_ALLOC_SIZE;
> +		iova += IOVA_ALLOC_SIZE;
> +	}
> +}
> +
> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
> +					   u64 iova, u64 size)
> +{
> +	struct vduse_bounce_map *map;
> +	unsigned int index;
> +	u64 last = iova + size - 1;
> +
> +	while (iova < last) {
> +		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> +		index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> +		map->orig_phys[index] = INVALID_PHYS_ADDR;
> +		iova += IOVA_ALLOC_SIZE;
> +	}
> +}
> +
> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
> +		      enum dma_data_direction dir)
> +{
> +	unsigned long pfn = PFN_DOWN(orig);
> +
> +	if (PageHighMem(pfn_to_page(pfn))) {
> +		unsigned int offset = offset_in_page(orig);
> +		char *buffer;
> +		unsigned int sz = 0;
> +
> +		while (size) {
> +			sz = min_t(size_t, PAGE_SIZE - offset, size);
> +
> +			buffer = kmap_atomic(pfn_to_page(pfn));


So kmap_atomic() can automatically take the fast path if the page does
not belong to highmem.

I think we can remove the condition and just use kmap_atomic() for all
the cases here.


> +			if (dir == DMA_TO_DEVICE)
> +				memcpy(addr, buffer + offset, sz);
> +			else
> +				memcpy(buffer + offset, addr, sz);
> +			kunmap_atomic(buffer);
> +
> +			size -= sz;
> +			pfn++;
> +			addr += sz;
> +			offset = 0;
> +		}
> +	} else if (dir == DMA_TO_DEVICE) {
> +		memcpy(addr, phys_to_virt(orig), size);
> +	} else {
> +		memcpy(phys_to_virt(orig), addr, size);
> +	}
> +}
> +
> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> +				dma_addr_t iova, size_t size,
> +				enum dma_data_direction dir)
> +{
> +	struct vduse_bounce_map *map;
> +	unsigned int index, offset;
> +	void *addr;
> +	size_t sz;
> +
> +	while (size) {
> +		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> +		offset = offset_in_page(iova);
> +		sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
> +
> +		if (map->bounce_page &&
> +		    map->orig_phys[index] != INVALID_PHYS_ADDR) {
> +			addr = page_address(map->bounce_page) + offset;
> +			index = offset >> IOVA_ALLOC_ORDER;
> +			do_bounce(map->orig_phys[index], addr, sz, dir);
> +		}
> +		size -= sz;
> +		iova += sz;
> +	}
> +}
> +
> +static struct page *
> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
> +{
> +	u64 start = iova & PAGE_MASK;
> +	u64 last = start + PAGE_SIZE - 1;
> +	struct vhost_iotlb_map *map;
> +	struct page *page = NULL;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	map = vhost_iotlb_itree_first(domain->iotlb, start, last);
> +	if (!map)
> +		goto out;
> +
> +	page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
> +	get_page(page);
> +out:
> +	spin_unlock(&domain->iotlb_lock);
> +
> +	return page;
> +}
> +
> +static struct page *
> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
> +{
> +	u64 start = iova & PAGE_MASK;
> +	struct page *page = alloc_page(GFP_KERNEL);
> +	struct vduse_bounce_map *map;
> +
> +	if (!page)
> +		return NULL;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> +	if (map->bounce_page) {
> +		__free_page(page);
> +		goto out;
> +	}
> +	map->bounce_page = page;
> +
> +	/* paired with vduse_domain_map_page() */
> +	smp_mb();


So this is suspicious. It's better to explain it like: we need to make
sure A must be done after B.

And it looks to me like the iotlb_lock is sufficient to do the
synchronization here. E.g., is there any reason that you don't take it in
vduse_domain_map_bounce_page()?

And what's more, is there any way to avoid holding the spinlock during
bouncing?


> +
> +	vduse_domain_bounce(domain, start, PAGE_SIZE, DMA_TO_DEVICE);
> +out:
> +	get_page(map->bounce_page);
> +	spin_unlock(&domain->iotlb_lock);
> +
> +	return map->bounce_page;
> +}
> +
> +static void
> +vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
> +{
> +	struct vduse_bounce_map *map;
> +	unsigned long i, pfn, bounce_pfns;
> +
> +	bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
> +
> +	for (pfn = 0; pfn < bounce_pfns; pfn++) {
> +		map = &domain->bounce_maps[pfn];
> +		for (i = 0; i < IOVA_MAPS_PER_PAGE; i++) {
> +			if (WARN_ON(map->orig_phys[i] != INVALID_PHYS_ADDR))
> +				continue;
> +		}
> +		if (!map->bounce_page)
> +			continue;
> +
> +		__free_page(map->bounce_page);
> +		map->bounce_page = NULL;
> +	}
> +}
> +
> +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
> +{
> +	if (!domain->bounce_map)
> +		return;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	if (!domain->bounce_map)
> +		goto unlock;
> +
> +	vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
> +	domain->bounce_map = 0;
> +	vduse_domain_free_bounce_pages(domain);
> +unlock:
> +	spin_unlock(&domain->iotlb_lock);
> +}
> +
> +static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
> +{
> +	int ret;
> +
> +	if (domain->bounce_map)
> +		return 0;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	if (domain->bounce_map)
> +		goto unlock;
> +
> +	ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
> +				    0, VHOST_MAP_RW, domain->file, 0);
> +	if (!ret)
> +		domain->bounce_map = 1;
> +unlock:
> +	spin_unlock(&domain->iotlb_lock);
> +	return ret;
> +}
> +
> +static dma_addr_t
> +vduse_domain_alloc_iova(struct iova_domain *iovad,
> +			unsigned long size, unsigned long limit)
> +{
> +	unsigned long shift = iova_shift(iovad);
> +	unsigned long iova_len = iova_align(iovad, size) >> shift;
> +	unsigned long iova_pfn;
> +
> +	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
> +		iova_len = roundup_pow_of_two(iova_len);
> +	iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
> +
> +	return iova_pfn << shift;
> +}
> +
> +static void vduse_domain_free_iova(struct iova_domain *iovad,
> +				   dma_addr_t iova, size_t size)
> +{
> +	unsigned long shift = iova_shift(iovad);
> +	unsigned long iova_len = iova_align(iovad, size) >> shift;
> +
> +	free_iova_fast(iovad, iova >> shift, iova_len);
> +}
> +
> +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
> +				 struct page *page, unsigned long offset,
> +				 size_t size, enum dma_data_direction dir,
> +				 unsigned long attrs)
> +{
> +	struct iova_domain *iovad = &domain->stream_iovad;
> +	unsigned long limit = domain->bounce_size - 1;
> +	phys_addr_t pa = page_to_phys(page) + offset;
> +	dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
> +
> +	if (!iova)
> +		return DMA_MAPPING_ERROR;
> +
> +	if (vduse_domain_init_bounce_map(domain)) {
> +		vduse_domain_free_iova(iovad, iova, size);
> +		return DMA_MAPPING_ERROR;
> +	}
> +
> +	vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa);
> +
> +	/* paired with vduse_domain_alloc_bounce_page() */
> +	smp_mb();
> +
> +	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
> +		vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
> +
> +	return iova;
> +}
> +
> +void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
> +			     dma_addr_t dma_addr, size_t size,
> +			     enum dma_data_direction dir, unsigned long attrs)
> +{
> +	struct iova_domain *iovad = &domain->stream_iovad;
> +
> +	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
> +		vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
> +
> +	vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
> +	vduse_domain_free_iova(iovad, dma_addr, size);
> +}
> +
> +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
> +				  size_t size, dma_addr_t *dma_addr,
> +				  gfp_t flag, unsigned long attrs)
> +{
> +	struct iova_domain *iovad = &domain->consistent_iovad;
> +	unsigned long limit = domain->iova_limit;
> +	dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
> +	void *orig = alloc_pages_exact(size, flag);
> +
> +	if (!iova || !orig)
> +		goto err;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
> +				  virt_to_phys(orig), VHOST_MAP_RW,
> +				  domain->file, (u64)iova)) {
> +		spin_unlock(&domain->iotlb_lock);
> +		goto err;
> +	}
> +	spin_unlock(&domain->iotlb_lock);
> +
> +	*dma_addr = iova;
> +
> +	return orig;
> +err:
> +	*dma_addr = DMA_MAPPING_ERROR;
> +	if (orig)
> +		free_pages_exact(orig, size);
> +	if (iova)
> +		vduse_domain_free_iova(iovad, iova, size);
> +
> +	return NULL;
> +}
> +
> +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
> +				void *vaddr, dma_addr_t dma_addr,
> +				unsigned long attrs)
> +{
> +	struct iova_domain *iovad = &domain->consistent_iovad;
> +	struct vhost_iotlb_map *map;
> +	struct vdpa_map_file *map_file;
> +	phys_addr_t pa;
> +
> +	spin_lock(&domain->iotlb_lock);
> +	map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
> +				      (u64)dma_addr + size - 1);
> +	if (WARN_ON(!map)) {
> +		spin_unlock(&domain->iotlb_lock);
> +		return;
> +	}
> +	map_file = (struct vdpa_map_file *)map->opaque;
> +	fput(map_file->file);
> +	kfree(map_file);
> +	pa = map->addr;
> +	vhost_iotlb_map_free(domain->iotlb, map);
> +	spin_unlock(&domain->iotlb_lock);
> +
> +	vduse_domain_free_iova(iovad, dma_addr, size);
> +	free_pages_exact(phys_to_virt(pa), size);


I wonder whether we should free the coherent page after munmap().
Otherwise userspace can poke kernel pages in this way, e.g. the page
could be allocated and used by other subsystems?


> +}
> +
> +static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
> +{
> +	struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
> +	unsigned long iova = vmf->pgoff << PAGE_SHIFT;
> +	struct page *page;
> +
> +	if (!domain)
> +		return VM_FAULT_SIGBUS;
> +
> +	if (iova < domain->bounce_size)
> +		page = vduse_domain_alloc_bounce_page(domain, iova);
> +	else
> +		page = vduse_domain_get_mapping_page(domain, iova);
> +
> +	if (!page)
> +		return VM_FAULT_SIGBUS;
> +
> +	vmf->page = page;
> +
> +	return 0;
> +}
> +
> +static const struct vm_operations_struct vduse_domain_mmap_ops = {
> +	.fault = vduse_domain_mmap_fault,
> +};
> +
> +static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct vduse_iova_domain *domain = file->private_data;
> +
> +	vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
> +	vma->vm_private_data = domain;
> +	vma->vm_ops = &vduse_domain_mmap_ops;
> +
> +	return 0;
> +}
> +
> +static int vduse_domain_release(struct inode *inode, struct file *file)
> +{
> +	struct vduse_iova_domain *domain = file->private_data;
> +
> +	vduse_domain_reset_bounce_map(domain);
> +	put_iova_domain(&domain->stream_iovad);
> +	put_iova_domain(&domain->consistent_iovad);
> +	vhost_iotlb_free(domain->iotlb);
> +	vfree(domain->bounce_maps);
> +	kfree(domain);
> +
> +	return 0;
> +}
> +
> +static const struct file_operations vduse_domain_fops = {
> +	.mmap = vduse_domain_mmap,
> +	.release = vduse_domain_release,
> +};
> +
> +void vduse_domain_destroy(struct vduse_iova_domain *domain)
> +{
> +	fput(domain->file);
> +}
> +
> +struct vduse_iova_domain *
> +vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
> +{
> +	struct vduse_iova_domain *domain;
> +	struct file *file;
> +	struct vduse_bounce_map *map;
> +	unsigned long i, pfn, bounce_pfns;
> +
> +	bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
> +	if (iova_limit <= bounce_size)
> +		return NULL;
> +
> +	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> +	if (!domain)
> +		return NULL;
> +
> +	domain->iotlb = vhost_iotlb_alloc(0, 0);
> +	if (!domain->iotlb)
> +		goto err_iotlb;
> +
> +	domain->iova_limit = iova_limit;
> +	domain->bounce_size = PAGE_ALIGN(bounce_size);
> +	domain->bounce_maps = vzalloc(bounce_pfns *
> +				sizeof(struct vduse_bounce_map));
> +	if (!domain->bounce_maps)
> +		goto err_map;
> +
> +	for (pfn = 0; pfn < bounce_pfns; pfn++) {
> +		map = &domain->bounce_maps[pfn];
> +		for (i = 0; i < IOVA_MAPS_PER_PAGE; i++)
> +			map->orig_phys[i] = INVALID_PHYS_ADDR;
> +	}
> +	file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
> +				domain, O_RDWR);
> +	if (IS_ERR(file))
> +		goto err_file;
> +
> +	domain->file = file;
> +	spin_lock_init(&domain->iotlb_lock);
> +	init_iova_domain(&domain->stream_iovad,
> +			IOVA_ALLOC_SIZE, IOVA_START_PFN);
> +	init_iova_domain(&domain->consistent_iovad,
> +			PAGE_SIZE, bounce_pfns);


Any reason for treating coherent and stream DMA differently (the 
different granule)?


> +
> +	return domain;
> +err_file:
> +	vfree(domain->bounce_maps);
> +err_map:
> +	vhost_iotlb_free(domain->iotlb);
> +err_iotlb:
> +	kfree(domain);
> +	return NULL;
> +}
> +
> +int vduse_domain_init(void)
> +{
> +	return iova_cache_get();
> +}
> +
> +void vduse_domain_exit(void)
> +{
> +	iova_cache_put();
> +}
> diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
> new file mode 100644
> index 000000000000..faeeedfaa786
> --- /dev/null
> +++ b/drivers/vdpa/vdpa_user/iova_domain.h
> @@ -0,0 +1,75 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * MMU-based IOMMU implementation
> + *
> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
> + *
> + * Author: Xie Yongji <xieyongji@bytedance.com>
> + *
> + */
> +
> +#ifndef _VDUSE_IOVA_DOMAIN_H
> +#define _VDUSE_IOVA_DOMAIN_H
> +
> +#include <linux/iova.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/vhost_iotlb.h>
> +
> +#define IOVA_START_PFN 1
> +
> +#define IOVA_ALLOC_ORDER 12
> +#define IOVA_ALLOC_SIZE (1 << IOVA_ALLOC_ORDER)
> +
> +#define IOVA_MAPS_PER_PAGE (1 << (PAGE_SHIFT - IOVA_ALLOC_ORDER))
> +
> +#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
> +
> +struct vduse_bounce_map {
> +	struct page *bounce_page;
> +	u64 orig_phys[IOVA_MAPS_PER_PAGE];


Sorry if I had asked this before. But I'm not sure it's worth having
this extra complexity. If I read the code correctly,
IOVA_MAPS_PER_PAGE is 1 for the archs that have 4K pages. Have you tested
the code on the archs that have pages larger than 4K?

Thanks


> +};
> +
> +struct vduse_iova_domain {
> +	struct iova_domain stream_iovad;
> +	struct iova_domain consistent_iovad;
> +	struct vduse_bounce_map *bounce_maps;
> +	size_t bounce_size;
> +	unsigned long iova_limit;
> +	int bounce_map;
> +	struct vhost_iotlb *iotlb;
> +	spinlock_t iotlb_lock;
> +	struct file *file;
> +};
> +
> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> +			struct vhost_iotlb *iotlb);
> +
> +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
> +				struct page *page, unsigned long offset,
> +				size_t size, enum dma_data_direction dir,
> +				unsigned long attrs);
> +
> +void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
> +			dma_addr_t dma_addr, size_t size,
> +			enum dma_data_direction dir, unsigned long attrs);
> +
> +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
> +				size_t size, dma_addr_t *dma_addr,
> +				gfp_t flag, unsigned long attrs);
> +
> +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
> +				void *vaddr, dma_addr_t dma_addr,
> +				unsigned long attrs);
> +
> +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);
> +
> +void vduse_domain_destroy(struct vduse_iova_domain *domain);
> +
> +struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit,
> +						size_t bounce_size);
> +
> +int vduse_domain_init(void);
> +
> +void vduse_domain_exit(void);
> +
> +#endif /* _VDUSE_IOVA_DOMAIN_H */
Yongji Xie March 24, 2021, 7:39 a.m. UTC | #2
On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/15 下午1:37, Xie Yongji 写道:
> > This implements an MMU-based IOMMU driver to support mapping
> > kernel dma buffer into userspace. The basic idea behind it is
> > treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
> > up MMU mapping instead of IOMMU mapping for the DMA transfer so
> > that the userspace process is able to use its virtual address to
> > access the dma buffer in kernel.
> >
> > And to avoid security issue, a bounce-buffering mechanism is
> > introduced to prevent userspace accessing the original buffer
> > directly.
> >
> > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > ---
> >   drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
> >   drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
> >   2 files changed, 610 insertions(+)
> >   create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
> >   create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
> >
> > diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> > new file mode 100644
> > index 000000000000..83de216b0e51
> > --- /dev/null
> > +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> > @@ -0,0 +1,535 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * MMU-based IOMMU implementation
> > + *
> > + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
>
>
> 2021 as well.
>

Sure.

>
> > + *
> > + * Author: Xie Yongji <xieyongji@bytedance.com>
> > + *
> > + */
> > +
> > +#include <linux/slab.h>
> > +#include <linux/file.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/highmem.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/vdpa.h>
> > +
> > +#include "iova_domain.h"
> > +
> > +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
> > +                              u64 start, u64 last,
> > +                              u64 addr, unsigned int perm,
> > +                              struct file *file, u64 offset)
> > +{
> > +     struct vdpa_map_file *map_file;
> > +     int ret;
> > +
> > +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
> > +     if (!map_file)
> > +             return -ENOMEM;
> > +
> > +     map_file->file = get_file(file);
> > +     map_file->offset = offset;
> > +
> > +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
> > +                                     addr, perm, map_file);
> > +     if (ret) {
> > +             fput(map_file->file);
> > +             kfree(map_file);
> > +             return ret;
> > +     }
> > +     return 0;
> > +}
> > +
> > +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
> > +                               u64 start, u64 last)
> > +{
> > +     struct vdpa_map_file *map_file;
> > +     struct vhost_iotlb_map *map;
> > +
> > +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
> > +             map_file = (struct vdpa_map_file *)map->opaque;
> > +             fput(map_file->file);
> > +             kfree(map_file);
> > +             vhost_iotlb_map_free(domain->iotlb, map);
> > +     }
> > +}
> > +
> > +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> > +                      struct vhost_iotlb *iotlb)
> > +{
> > +     struct vdpa_map_file *map_file;
> > +     struct vhost_iotlb_map *map;
> > +     u64 start = 0ULL, last = ULLONG_MAX;
> > +     int ret;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     vduse_iotlb_del_range(domain, start, last);
> > +
> > +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
> > +          map = vhost_iotlb_itree_next(map, start, last)) {
> > +             map_file = (struct vdpa_map_file *)map->opaque;
> > +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
> > +                                         map->addr, map->perm,
> > +                                         map_file->file,
> > +                                         map_file->offset);
> > +             if (ret)
> > +                     goto err;
> > +     }
> > +     spin_unlock(&domain->iotlb_lock);
> > +
> > +     return 0;
> > +err:
> > +     vduse_iotlb_del_range(domain, start, last);
> > +     spin_unlock(&domain->iotlb_lock);
> > +     return ret;
> > +}
> > +
> > +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> > +                                      u64 iova, u64 size, u64 paddr)
> > +{
> > +     struct vduse_bounce_map *map;
> > +     unsigned int index;
> > +     u64 last = iova + size - 1;
> > +
> > +     while (iova < last) {
> > +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> > +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> > +             map->orig_phys[index] = paddr;
> > +             paddr += IOVA_ALLOC_SIZE;
> > +             iova += IOVA_ALLOC_SIZE;
> > +     }
> > +}
> > +
> > +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
> > +                                        u64 iova, u64 size)
> > +{
> > +     struct vduse_bounce_map *map;
> > +     unsigned int index;
> > +     u64 last = iova + size - 1;
> > +
> > +     while (iova < last) {
> > +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> > +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> > +             map->orig_phys[index] = INVALID_PHYS_ADDR;
> > +             iova += IOVA_ALLOC_SIZE;
> > +     }
> > +}
> > +
> > +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
> > +                   enum dma_data_direction dir)
> > +{
> > +     unsigned long pfn = PFN_DOWN(orig);
> > +
> > +     if (PageHighMem(pfn_to_page(pfn))) {
> > +             unsigned int offset = offset_in_page(orig);
> > +             char *buffer;
> > +             unsigned int sz = 0;
> > +
> > +             while (size) {
> > +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
> > +
> > +                     buffer = kmap_atomic(pfn_to_page(pfn));
>
>
> So kmap_atomic() can automatically take the fast path if the page does
> not belong to highmem.
>
> I think we can remove the condition and just use kmap_atomic() for all
> the cases here.
>

Looks good to me.

>
> > +                     if (dir == DMA_TO_DEVICE)
> > +                             memcpy(addr, buffer + offset, sz);
> > +                     else
> > +                             memcpy(buffer + offset, addr, sz);
> > +                     kunmap_atomic(buffer);
> > +
> > +                     size -= sz;
> > +                     pfn++;
> > +                     addr += sz;
> > +                     offset = 0;
> > +             }
> > +     } else if (dir == DMA_TO_DEVICE) {
> > +             memcpy(addr, phys_to_virt(orig), size);
> > +     } else {
> > +             memcpy(phys_to_virt(orig), addr, size);
> > +     }
> > +}
> > +
> > +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> > +                             dma_addr_t iova, size_t size,
> > +                             enum dma_data_direction dir)
> > +{
> > +     struct vduse_bounce_map *map;
> > +     unsigned int index, offset;
> > +     void *addr;
> > +     size_t sz;
> > +
> > +     while (size) {
> > +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> > +             offset = offset_in_page(iova);
> > +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
> > +
> > +             if (map->bounce_page &&
> > +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
> > +                     addr = page_address(map->bounce_page) + offset;
> > +                     index = offset >> IOVA_ALLOC_ORDER;
> > +                     do_bounce(map->orig_phys[index], addr, sz, dir);
> > +             }
> > +             size -= sz;
> > +             iova += sz;
> > +     }
> > +}
> > +
> > +static struct page *
> > +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
> > +{
> > +     u64 start = iova & PAGE_MASK;
> > +     u64 last = start + PAGE_SIZE - 1;
> > +     struct vhost_iotlb_map *map;
> > +     struct page *page = NULL;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
> > +     if (!map)
> > +             goto out;
> > +
> > +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
> > +     get_page(page);
> > +out:
> > +     spin_unlock(&domain->iotlb_lock);
> > +
> > +     return page;
> > +}
> > +
> > +static struct page *
> > +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
> > +{
> > +     u64 start = iova & PAGE_MASK;
> > +     struct page *page = alloc_page(GFP_KERNEL);
> > +     struct vduse_bounce_map *map;
> > +
> > +     if (!page)
> > +             return NULL;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> > +     if (map->bounce_page) {
> > +             __free_page(page);
> > +             goto out;
> > +     }
> > +     map->bounce_page = page;
> > +
> > +     /* paired with vduse_domain_map_page() */
> > +     smp_mb();
>
>
> So this is suspicious. It's better to explain it like: we need to make
> sure A must be done after B.

OK. I see. It's used to protect this pattern:

   vduse_domain_alloc_bounce_page:      vduse_domain_map_page:
   write map->bounce_page               write map->orig_phys
   mb()                                 mb()
   read map->orig_phys                  read map->bounce_page

Make sure there will always be a path to do bouncing.

>
> And it looks to me like the iotlb_lock is sufficient to do the
> synchronization here. E.g., is there any reason that you don't take it in
> vduse_domain_map_bounce_page()?
>

Yes, we can. But the performance in multi-queue cases will go down if
we use iotlb_lock on this critical path.

> And what's more, is there any way to avoid holding the spinlock during
> bouncing?
>

Looks like we can't. In the case that multiple page faults happen on
the same page, we should make sure the bouncing is done before any
page fault handler returns.

>
> > +
> > +     vduse_domain_bounce(domain, start, PAGE_SIZE, DMA_TO_DEVICE);
> > +out:
> > +     get_page(map->bounce_page);
> > +     spin_unlock(&domain->iotlb_lock);
> > +
> > +     return map->bounce_page;
> > +}
> > +
> > +static void
> > +vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
> > +{
> > +     struct vduse_bounce_map *map;
> > +     unsigned long i, pfn, bounce_pfns;
> > +
> > +     bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
> > +
> > +     for (pfn = 0; pfn < bounce_pfns; pfn++) {
> > +             map = &domain->bounce_maps[pfn];
> > +             for (i = 0; i < IOVA_MAPS_PER_PAGE; i++) {
> > +                     if (WARN_ON(map->orig_phys[i] != INVALID_PHYS_ADDR))
> > +                             continue;
> > +             }
> > +             if (!map->bounce_page)
> > +                     continue;
> > +
> > +             __free_page(map->bounce_page);
> > +             map->bounce_page = NULL;
> > +     }
> > +}
> > +
> > +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
> > +{
> > +     if (!domain->bounce_map)
> > +             return;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     if (!domain->bounce_map)
> > +             goto unlock;
> > +
> > +     vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
> > +     domain->bounce_map = 0;
> > +     vduse_domain_free_bounce_pages(domain);
> > +unlock:
> > +     spin_unlock(&domain->iotlb_lock);
> > +}
> > +
> > +static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
> > +{
> > +     int ret;
> > +
> > +     if (domain->bounce_map)
> > +             return 0;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     if (domain->bounce_map)
> > +             goto unlock;
> > +
> > +     ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
> > +                                 0, VHOST_MAP_RW, domain->file, 0);
> > +     if (!ret)
> > +             domain->bounce_map = 1;
> > +unlock:
> > +     spin_unlock(&domain->iotlb_lock);
> > +     return ret;
> > +}
> > +
> > +static dma_addr_t
> > +vduse_domain_alloc_iova(struct iova_domain *iovad,
> > +                     unsigned long size, unsigned long limit)
> > +{
> > +     unsigned long shift = iova_shift(iovad);
> > +     unsigned long iova_len = iova_align(iovad, size) >> shift;
> > +     unsigned long iova_pfn;
> > +
> > +     if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
> > +             iova_len = roundup_pow_of_two(iova_len);
> > +     iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
> > +
> > +     return iova_pfn << shift;
> > +}
> > +
> > +static void vduse_domain_free_iova(struct iova_domain *iovad,
> > +                                dma_addr_t iova, size_t size)
> > +{
> > +     unsigned long shift = iova_shift(iovad);
> > +     unsigned long iova_len = iova_align(iovad, size) >> shift;
> > +
> > +     free_iova_fast(iovad, iova >> shift, iova_len);
> > +}
> > +
> > +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
> > +                              struct page *page, unsigned long offset,
> > +                              size_t size, enum dma_data_direction dir,
> > +                              unsigned long attrs)
> > +{
> > +     struct iova_domain *iovad = &domain->stream_iovad;
> > +     unsigned long limit = domain->bounce_size - 1;
> > +     phys_addr_t pa = page_to_phys(page) + offset;
> > +     dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
> > +
> > +     if (!iova)
> > +             return DMA_MAPPING_ERROR;
> > +
> > +     if (vduse_domain_init_bounce_map(domain)) {
> > +             vduse_domain_free_iova(iovad, iova, size);
> > +             return DMA_MAPPING_ERROR;
> > +     }
> > +
> > +     vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa);
> > +
> > +     /* paired with vduse_domain_alloc_bounce_page() */
> > +     smp_mb();
> > +
> > +     if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
> > +             vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
> > +
> > +     return iova;
> > +}
> > +
> > +void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
> > +                          dma_addr_t dma_addr, size_t size,
> > +                          enum dma_data_direction dir, unsigned long attrs)
> > +{
> > +     struct iova_domain *iovad = &domain->stream_iovad;
> > +
> > +     if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
> > +             vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
> > +
> > +     vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
> > +     vduse_domain_free_iova(iovad, dma_addr, size);
> > +}
> > +
> > +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
> > +                               size_t size, dma_addr_t *dma_addr,
> > +                               gfp_t flag, unsigned long attrs)
> > +{
> > +     struct iova_domain *iovad = &domain->consistent_iovad;
> > +     unsigned long limit = domain->iova_limit;
> > +     dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
> > +     void *orig = alloc_pages_exact(size, flag);
> > +
> > +     if (!iova || !orig)
> > +             goto err;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
> > +                               virt_to_phys(orig), VHOST_MAP_RW,
> > +                               domain->file, (u64)iova)) {
> > +             spin_unlock(&domain->iotlb_lock);
> > +             goto err;
> > +     }
> > +     spin_unlock(&domain->iotlb_lock);
> > +
> > +     *dma_addr = iova;
> > +
> > +     return orig;
> > +err:
> > +     *dma_addr = DMA_MAPPING_ERROR;
> > +     if (orig)
> > +             free_pages_exact(orig, size);
> > +     if (iova)
> > +             vduse_domain_free_iova(iovad, iova, size);
> > +
> > +     return NULL;
> > +}
> > +
> > +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
> > +                             void *vaddr, dma_addr_t dma_addr,
> > +                             unsigned long attrs)
> > +{
> > +     struct iova_domain *iovad = &domain->consistent_iovad;
> > +     struct vhost_iotlb_map *map;
> > +     struct vdpa_map_file *map_file;
> > +     phys_addr_t pa;
> > +
> > +     spin_lock(&domain->iotlb_lock);
> > +     map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
> > +                                   (u64)dma_addr + size - 1);
> > +     if (WARN_ON(!map)) {
> > +             spin_unlock(&domain->iotlb_lock);
> > +             return;
> > +     }
> > +     map_file = (struct vdpa_map_file *)map->opaque;
> > +     fput(map_file->file);
> > +     kfree(map_file);
> > +     pa = map->addr;
> > +     vhost_iotlb_map_free(domain->iotlb, map);
> > +     spin_unlock(&domain->iotlb_lock);
> > +
> > +     vduse_domain_free_iova(iovad, dma_addr, size);
> > +     free_pages_exact(phys_to_virt(pa), size);
>
>
> I wonder whether we should free the coherent page after munmap().

But we don't know whether this coherent page is still needed by
userspace. Userspace can call munmap() at any time.

> Otherwise usersapce can poke kernel pages in this way, e.g the page
> could be allocated and used by other subsystems?
>

Sorry, I didn't get your point here. What's the relationship between
this problem and munmap()?

>
> > +}
> > +
> > +static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
> > +{
> > +     struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
> > +     unsigned long iova = vmf->pgoff << PAGE_SHIFT;
> > +     struct page *page;
> > +
> > +     if (!domain)
> > +             return VM_FAULT_SIGBUS;
> > +
> > +     if (iova < domain->bounce_size)
> > +             page = vduse_domain_alloc_bounce_page(domain, iova);
> > +     else
> > +             page = vduse_domain_get_mapping_page(domain, iova);
> > +
> > +     if (!page)
> > +             return VM_FAULT_SIGBUS;
> > +
> > +     vmf->page = page;
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct vm_operations_struct vduse_domain_mmap_ops = {
> > +     .fault = vduse_domain_mmap_fault,
> > +};
> > +
> > +static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +     struct vduse_iova_domain *domain = file->private_data;
> > +
> > +     vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
> > +     vma->vm_private_data = domain;
> > +     vma->vm_ops = &vduse_domain_mmap_ops;
> > +
> > +     return 0;
> > +}
> > +
> > +static int vduse_domain_release(struct inode *inode, struct file *file)
> > +{
> > +     struct vduse_iova_domain *domain = file->private_data;
> > +
> > +     vduse_domain_reset_bounce_map(domain);
> > +     put_iova_domain(&domain->stream_iovad);
> > +     put_iova_domain(&domain->consistent_iovad);
> > +     vhost_iotlb_free(domain->iotlb);
> > +     vfree(domain->bounce_maps);
> > +     kfree(domain);
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct file_operations vduse_domain_fops = {
> > +     .mmap = vduse_domain_mmap,
> > +     .release = vduse_domain_release,
> > +};
> > +
> > +void vduse_domain_destroy(struct vduse_iova_domain *domain)
> > +{
> > +     fput(domain->file);
> > +}
> > +
> > +struct vduse_iova_domain *
> > +vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
> > +{
> > +     struct vduse_iova_domain *domain;
> > +     struct file *file;
> > +     struct vduse_bounce_map *map;
> > +     unsigned long i, pfn, bounce_pfns;
> > +
> > +     bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
> > +     if (iova_limit <= bounce_size)
> > +             return NULL;
> > +
> > +     domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> > +     if (!domain)
> > +             return NULL;
> > +
> > +     domain->iotlb = vhost_iotlb_alloc(0, 0);
> > +     if (!domain->iotlb)
> > +             goto err_iotlb;
> > +
> > +     domain->iova_limit = iova_limit;
> > +     domain->bounce_size = PAGE_ALIGN(bounce_size);
> > +     domain->bounce_maps = vzalloc(bounce_pfns *
> > +                             sizeof(struct vduse_bounce_map));
> > +     if (!domain->bounce_maps)
> > +             goto err_map;
> > +
> > +     for (pfn = 0; pfn < bounce_pfns; pfn++) {
> > +             map = &domain->bounce_maps[pfn];
> > +             for (i = 0; i < IOVA_MAPS_PER_PAGE; i++)
> > +                     map->orig_phys[i] = INVALID_PHYS_ADDR;
> > +     }
> > +     file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
> > +                             domain, O_RDWR);
> > +     if (IS_ERR(file))
> > +             goto err_file;
> > +
> > +     domain->file = file;
> > +     spin_lock_init(&domain->iotlb_lock);
> > +     init_iova_domain(&domain->stream_iovad,
> > +                     IOVA_ALLOC_SIZE, IOVA_START_PFN);
> > +     init_iova_domain(&domain->consistent_iovad,
> > +                     PAGE_SIZE, bounce_pfns);
>
>
> Any reason for treating coherent and stream DMA differently (the
> different granule)?
>

To save space for small I/Os (less than PAGE_SIZE). We can have one
bounce page for multiple small I/Os.

>
> > +
> > +     return domain;
> > +err_file:
> > +     vfree(domain->bounce_maps);
> > +err_map:
> > +     vhost_iotlb_free(domain->iotlb);
> > +err_iotlb:
> > +     kfree(domain);
> > +     return NULL;
> > +}
> > +
> > +int vduse_domain_init(void)
> > +{
> > +     return iova_cache_get();
> > +}
> > +
> > +void vduse_domain_exit(void)
> > +{
> > +     iova_cache_put();
> > +}
> > diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
> > new file mode 100644
> > index 000000000000..faeeedfaa786
> > --- /dev/null
> > +++ b/drivers/vdpa/vdpa_user/iova_domain.h
> > @@ -0,0 +1,75 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * MMU-based IOMMU implementation
> > + *
> > + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
> > + *
> > + * Author: Xie Yongji <xieyongji@bytedance.com>
> > + *
> > + */
> > +
> > +#ifndef _VDUSE_IOVA_DOMAIN_H
> > +#define _VDUSE_IOVA_DOMAIN_H
> > +
> > +#include <linux/iova.h>
> > +#include <linux/dma-mapping.h>
> > +#include <linux/vhost_iotlb.h>
> > +
> > +#define IOVA_START_PFN 1
> > +
> > +#define IOVA_ALLOC_ORDER 12
> > +#define IOVA_ALLOC_SIZE (1 << IOVA_ALLOC_ORDER)
> > +
> > +#define IOVA_MAPS_PER_PAGE (1 << (PAGE_SHIFT - IOVA_ALLOC_ORDER))
> > +
> > +#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
> > +
> > +struct vduse_bounce_map {
> > +     struct page *bounce_page;
> > +     u64 orig_phys[IOVA_MAPS_PER_PAGE];
>
>
> Sorry if I had asked this before. But I'm not sure it's worth to have
> this extra complexitiy. If I read the code correctly, the
> IOVA_MAPS_PER_PAGE is 1 for the archs that have 4K page. Have you tested
> the code on the archs that have more than 4K page?
>

No, I haven't tested it. Now I think it's OK to remove this optimization
in this patchset.

Thanks,
Yongji
Jason Wang March 25, 2021, 4:52 a.m. UTC | #3
在 2021/3/24 下午3:39, Yongji Xie 写道:
> On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/3/15 下午1:37, Xie Yongji 写道:
>>> This implements an MMU-based IOMMU driver to support mapping
>>> kernel dma buffer into userspace. The basic idea behind it is
>>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
>>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
>>> that the userspace process is able to use its virtual address to
>>> access the dma buffer in kernel.
>>>
>>> And to avoid security issue, a bounce-buffering mechanism is
>>> introduced to prevent userspace accessing the original buffer
>>> directly.
>>>
>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>> ---
>>>    drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
>>>    drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
>>>    2 files changed, 610 insertions(+)
>>>    create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
>>>    create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
>>>
>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>>> new file mode 100644
>>> index 000000000000..83de216b0e51
>>> --- /dev/null
>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>>> @@ -0,0 +1,535 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * MMU-based IOMMU implementation
>>> + *
>>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
>>
>> 2021 as well.
>>
> Sure.
>
>>> + *
>>> + * Author: Xie Yongji <xieyongji@bytedance.com>
>>> + *
>>> + */
>>> +
>>> +#include <linux/slab.h>
>>> +#include <linux/file.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <linux/highmem.h>
>>> +#include <linux/vmalloc.h>
>>> +#include <linux/vdpa.h>
>>> +
>>> +#include "iova_domain.h"
>>> +
>>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
>>> +                              u64 start, u64 last,
>>> +                              u64 addr, unsigned int perm,
>>> +                              struct file *file, u64 offset)
>>> +{
>>> +     struct vdpa_map_file *map_file;
>>> +     int ret;
>>> +
>>> +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
>>> +     if (!map_file)
>>> +             return -ENOMEM;
>>> +
>>> +     map_file->file = get_file(file);
>>> +     map_file->offset = offset;
>>> +
>>> +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
>>> +                                     addr, perm, map_file);
>>> +     if (ret) {
>>> +             fput(map_file->file);
>>> +             kfree(map_file);
>>> +             return ret;
>>> +     }
>>> +     return 0;
>>> +}
>>> +
>>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
>>> +                               u64 start, u64 last)
>>> +{
>>> +     struct vdpa_map_file *map_file;
>>> +     struct vhost_iotlb_map *map;
>>> +
>>> +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>> +             fput(map_file->file);
>>> +             kfree(map_file);
>>> +             vhost_iotlb_map_free(domain->iotlb, map);
>>> +     }
>>> +}
>>> +
>>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
>>> +                      struct vhost_iotlb *iotlb)
>>> +{
>>> +     struct vdpa_map_file *map_file;
>>> +     struct vhost_iotlb_map *map;
>>> +     u64 start = 0ULL, last = ULLONG_MAX;
>>> +     int ret;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     vduse_iotlb_del_range(domain, start, last);
>>> +
>>> +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
>>> +          map = vhost_iotlb_itree_next(map, start, last)) {
>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>> +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
>>> +                                         map->addr, map->perm,
>>> +                                         map_file->file,
>>> +                                         map_file->offset);
>>> +             if (ret)
>>> +                     goto err;
>>> +     }
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +
>>> +     return 0;
>>> +err:
>>> +     vduse_iotlb_del_range(domain, start, last);
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +     return ret;
>>> +}
>>> +
>>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>> +                                      u64 iova, u64 size, u64 paddr)
>>> +{
>>> +     struct vduse_bounce_map *map;
>>> +     unsigned int index;
>>> +     u64 last = iova + size - 1;
>>> +
>>> +     while (iova < last) {
>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>> +             map->orig_phys[index] = paddr;
>>> +             paddr += IOVA_ALLOC_SIZE;
>>> +             iova += IOVA_ALLOC_SIZE;
>>> +     }
>>> +}
>>> +
>>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
>>> +                                        u64 iova, u64 size)
>>> +{
>>> +     struct vduse_bounce_map *map;
>>> +     unsigned int index;
>>> +     u64 last = iova + size - 1;
>>> +
>>> +     while (iova < last) {
>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>> +             map->orig_phys[index] = INVALID_PHYS_ADDR;
>>> +             iova += IOVA_ALLOC_SIZE;
>>> +     }
>>> +}
>>> +
>>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
>>> +                   enum dma_data_direction dir)
>>> +{
>>> +     unsigned long pfn = PFN_DOWN(orig);
>>> +
>>> +     if (PageHighMem(pfn_to_page(pfn))) {
>>> +             unsigned int offset = offset_in_page(orig);
>>> +             char *buffer;
>>> +             unsigned int sz = 0;
>>> +
>>> +             while (size) {
>>> +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
>>> +
>>> +                     buffer = kmap_atomic(pfn_to_page(pfn));
>>
>> So kmap_atomic() can autoamtically go with fast path if the page does
>> not belong to highmem.
>>
>> I think we can removce the condition and just use kmap_atomic() for all
>> the cases here.
>>
> Looks good to me.
>
>>> +                     if (dir == DMA_TO_DEVICE)
>>> +                             memcpy(addr, buffer + offset, sz);
>>> +                     else
>>> +                             memcpy(buffer + offset, addr, sz);
>>> +                     kunmap_atomic(buffer);
>>> +
>>> +                     size -= sz;
>>> +                     pfn++;
>>> +                     addr += sz;
>>> +                     offset = 0;
>>> +             }
>>> +     } else if (dir == DMA_TO_DEVICE) {
>>> +             memcpy(addr, phys_to_virt(orig), size);
>>> +     } else {
>>> +             memcpy(phys_to_virt(orig), addr, size);
>>> +     }
>>> +}
>>> +
>>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
>>> +                             dma_addr_t iova, size_t size,
>>> +                             enum dma_data_direction dir)
>>> +{
>>> +     struct vduse_bounce_map *map;
>>> +     unsigned int index, offset;
>>> +     void *addr;
>>> +     size_t sz;
>>> +
>>> +     while (size) {
>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>> +             offset = offset_in_page(iova);
>>> +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
>>> +
>>> +             if (map->bounce_page &&
>>> +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
>>> +                     addr = page_address(map->bounce_page) + offset;
>>> +                     index = offset >> IOVA_ALLOC_ORDER;
>>> +                     do_bounce(map->orig_phys[index], addr, sz, dir);
>>> +             }
>>> +             size -= sz;
>>> +             iova += sz;
>>> +     }
>>> +}
>>> +
>>> +static struct page *
>>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
>>> +{
>>> +     u64 start = iova & PAGE_MASK;
>>> +     u64 last = start + PAGE_SIZE - 1;
>>> +     struct vhost_iotlb_map *map;
>>> +     struct page *page = NULL;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
>>> +     if (!map)
>>> +             goto out;
>>> +
>>> +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
>>> +     get_page(page);
>>> +out:
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +
>>> +     return page;
>>> +}
>>> +
>>> +static struct page *
>>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
>>> +{
>>> +     u64 start = iova & PAGE_MASK;
>>> +     struct page *page = alloc_page(GFP_KERNEL);
>>> +     struct vduse_bounce_map *map;
>>> +
>>> +     if (!page)
>>> +             return NULL;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>> +     if (map->bounce_page) {
>>> +             __free_page(page);
>>> +             goto out;
>>> +     }
>>> +     map->bounce_page = page;
>>> +
>>> +     /* paired with vduse_domain_map_page() */
>>> +     smp_mb();
>>
>> So this is suspicious. It's better to explain like, we need make sure A
>> must be done after B.
> OK. I see. It's used to protect this pattern:
>
>     vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
>     write map->bounce_page                           write map->orig_phys
>     mb()                                                            mb()
>     read map->orig_phys                                 read map->bounce_page
>
> Make sure there will always be a path to do bouncing.


Ok.


>
>> And it looks to me the iotlb_lock is sufficnet to do the synchronization
>> here. E.g any reason that you don't take it in
>> vduse_domain_map_bounce_page().
>>
> Yes, we can. But the performance in multi-queue cases will go down if
> we use iotlb_lock on this critical path.
>
>> And what's more, is there anyway to aovid holding the spinlock during
>> bouncing?
>>
> Looks like we can't. In the case that multiple page faults happen on
> the same page, we should make sure the bouncing is done before any
> page fault handler returns.


So it looks to me that all of this extra complexity comes from the fact
that bounce_page and orig_phys are set in different places, so we need
to do the bouncing in two places.

I wonder how much we can gain from the "lazy" bouncing in the page fault
handler. The buffer mapped via dma_ops from the virtio driver is expected
to be accessed by userspace soon. It looks to me that we could do all of
this during dma_map(), and then things would be greatly simplified.


>
>>> +
>>> +     vduse_domain_bounce(domain, start, PAGE_SIZE, DMA_TO_DEVICE);
>>> +out:
>>> +     get_page(map->bounce_page);
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +
>>> +     return map->bounce_page;
>>> +}
>>> +
>>> +static void
>>> +vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
>>> +{
>>> +     struct vduse_bounce_map *map;
>>> +     unsigned long i, pfn, bounce_pfns;
>>> +
>>> +     bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
>>> +
>>> +     for (pfn = 0; pfn < bounce_pfns; pfn++) {
>>> +             map = &domain->bounce_maps[pfn];
>>> +             for (i = 0; i < IOVA_MAPS_PER_PAGE; i++) {
>>> +                     if (WARN_ON(map->orig_phys[i] != INVALID_PHYS_ADDR))
>>> +                             continue;
>>> +             }
>>> +             if (!map->bounce_page)
>>> +                     continue;
>>> +
>>> +             __free_page(map->bounce_page);
>>> +             map->bounce_page = NULL;
>>> +     }
>>> +}
>>> +
>>> +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
>>> +{
>>> +     if (!domain->bounce_map)
>>> +             return;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     if (!domain->bounce_map)
>>> +             goto unlock;
>>> +
>>> +     vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
>>> +     domain->bounce_map = 0;
>>> +     vduse_domain_free_bounce_pages(domain);
>>> +unlock:
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +}
>>> +
>>> +static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
>>> +{
>>> +     int ret;
>>> +
>>> +     if (domain->bounce_map)
>>> +             return 0;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     if (domain->bounce_map)
>>> +             goto unlock;
>>> +
>>> +     ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
>>> +                                 0, VHOST_MAP_RW, domain->file, 0);
>>> +     if (!ret)
>>> +             domain->bounce_map = 1;
>>> +unlock:
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +     return ret;
>>> +}
>>> +
>>> +static dma_addr_t
>>> +vduse_domain_alloc_iova(struct iova_domain *iovad,
>>> +                     unsigned long size, unsigned long limit)
>>> +{
>>> +     unsigned long shift = iova_shift(iovad);
>>> +     unsigned long iova_len = iova_align(iovad, size) >> shift;
>>> +     unsigned long iova_pfn;
>>> +
>>> +     if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
>>> +             iova_len = roundup_pow_of_two(iova_len);
>>> +     iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
>>> +
>>> +     return iova_pfn << shift;
>>> +}
>>> +
>>> +static void vduse_domain_free_iova(struct iova_domain *iovad,
>>> +                                dma_addr_t iova, size_t size)
>>> +{
>>> +     unsigned long shift = iova_shift(iovad);
>>> +     unsigned long iova_len = iova_align(iovad, size) >> shift;
>>> +
>>> +     free_iova_fast(iovad, iova >> shift, iova_len);
>>> +}
>>> +
>>> +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
>>> +                              struct page *page, unsigned long offset,
>>> +                              size_t size, enum dma_data_direction dir,
>>> +                              unsigned long attrs)
>>> +{
>>> +     struct iova_domain *iovad = &domain->stream_iovad;
>>> +     unsigned long limit = domain->bounce_size - 1;
>>> +     phys_addr_t pa = page_to_phys(page) + offset;
>>> +     dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
>>> +
>>> +     if (!iova)
>>> +             return DMA_MAPPING_ERROR;
>>> +
>>> +     if (vduse_domain_init_bounce_map(domain)) {
>>> +             vduse_domain_free_iova(iovad, iova, size);
>>> +             return DMA_MAPPING_ERROR;
>>> +     }
>>> +
>>> +     vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa);
>>> +
>>> +     /* paired with vduse_domain_alloc_bounce_page() */
>>> +     smp_mb();
>>> +
>>> +     if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
>>> +             vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
>>> +
>>> +     return iova;
>>> +}
>>> +
>>> +void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
>>> +                          dma_addr_t dma_addr, size_t size,
>>> +                          enum dma_data_direction dir, unsigned long attrs)
>>> +{
>>> +     struct iova_domain *iovad = &domain->stream_iovad;
>>> +
>>> +     if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
>>> +             vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
>>> +
>>> +     vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
>>> +     vduse_domain_free_iova(iovad, dma_addr, size);
>>> +}
>>> +
>>> +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
>>> +                               size_t size, dma_addr_t *dma_addr,
>>> +                               gfp_t flag, unsigned long attrs)
>>> +{
>>> +     struct iova_domain *iovad = &domain->consistent_iovad;
>>> +     unsigned long limit = domain->iova_limit;
>>> +     dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
>>> +     void *orig = alloc_pages_exact(size, flag);
>>> +
>>> +     if (!iova || !orig)
>>> +             goto err;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
>>> +                               virt_to_phys(orig), VHOST_MAP_RW,
>>> +                               domain->file, (u64)iova)) {
>>> +             spin_unlock(&domain->iotlb_lock);
>>> +             goto err;
>>> +     }
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +
>>> +     *dma_addr = iova;
>>> +
>>> +     return orig;
>>> +err:
>>> +     *dma_addr = DMA_MAPPING_ERROR;
>>> +     if (orig)
>>> +             free_pages_exact(orig, size);
>>> +     if (iova)
>>> +             vduse_domain_free_iova(iovad, iova, size);
>>> +
>>> +     return NULL;
>>> +}
>>> +
>>> +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
>>> +                             void *vaddr, dma_addr_t dma_addr,
>>> +                             unsigned long attrs)
>>> +{
>>> +     struct iova_domain *iovad = &domain->consistent_iovad;
>>> +     struct vhost_iotlb_map *map;
>>> +     struct vdpa_map_file *map_file;
>>> +     phys_addr_t pa;
>>> +
>>> +     spin_lock(&domain->iotlb_lock);
>>> +     map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
>>> +                                   (u64)dma_addr + size - 1);
>>> +     if (WARN_ON(!map)) {
>>> +             spin_unlock(&domain->iotlb_lock);
>>> +             return;
>>> +     }
>>> +     map_file = (struct vdpa_map_file *)map->opaque;
>>> +     fput(map_file->file);
>>> +     kfree(map_file);
>>> +     pa = map->addr;
>>> +     vhost_iotlb_map_free(domain->iotlb, map);
>>> +     spin_unlock(&domain->iotlb_lock);
>>> +
>>> +     vduse_domain_free_iova(iovad, dma_addr, size);
>>> +     free_pages_exact(phys_to_virt(pa), size);
>>
>> I wonder whether we should free the coherent page after munmap().
> But we don't know whether this coherent page is still needed by
> userspace. The userspace can call munmap() in any cases.
>
>> Otherwise usersapce can poke kernel pages in this way, e.g the page
>> could be allocated and used by other subsystems?
>>
> Sorry, I didn't get your point here. What's the relationship between
> this problem and munmap()?


Ok, so it should be fine. I missed the code that takes an extra refcnt
when mapping the coherent page.

Thanks


>
>>> +}
>>> +
>>> +static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
>>> +{
>>> +     struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
>>> +     unsigned long iova = vmf->pgoff << PAGE_SHIFT;
>>> +     struct page *page;
>>> +
>>> +     if (!domain)
>>> +             return VM_FAULT_SIGBUS;
>>> +
>>> +     if (iova < domain->bounce_size)
>>> +             page = vduse_domain_alloc_bounce_page(domain, iova);
>>> +     else
>>> +             page = vduse_domain_get_mapping_page(domain, iova);
>>> +
>>> +     if (!page)
>>> +             return VM_FAULT_SIGBUS;
>>> +
>>> +     vmf->page = page;
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static const struct vm_operations_struct vduse_domain_mmap_ops = {
>>> +     .fault = vduse_domain_mmap_fault,
>>> +};
>>> +
>>> +static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
>>> +{
>>> +     struct vduse_iova_domain *domain = file->private_data;
>>> +
>>> +     vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
>>> +     vma->vm_private_data = domain;
>>> +     vma->vm_ops = &vduse_domain_mmap_ops;
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static int vduse_domain_release(struct inode *inode, struct file *file)
>>> +{
>>> +     struct vduse_iova_domain *domain = file->private_data;
>>> +
>>> +     vduse_domain_reset_bounce_map(domain);
>>> +     put_iova_domain(&domain->stream_iovad);
>>> +     put_iova_domain(&domain->consistent_iovad);
>>> +     vhost_iotlb_free(domain->iotlb);
>>> +     vfree(domain->bounce_maps);
>>> +     kfree(domain);
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static const struct file_operations vduse_domain_fops = {
>>> +     .mmap = vduse_domain_mmap,
>>> +     .release = vduse_domain_release,
>>> +};
>>> +
>>> +void vduse_domain_destroy(struct vduse_iova_domain *domain)
>>> +{
>>> +     fput(domain->file);
>>> +}
>>> +
>>> +struct vduse_iova_domain *
>>> +vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
>>> +{
>>> +     struct vduse_iova_domain *domain;
>>> +     struct file *file;
>>> +     struct vduse_bounce_map *map;
>>> +     unsigned long i, pfn, bounce_pfns;
>>> +
>>> +     bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
>>> +     if (iova_limit <= bounce_size)
>>> +             return NULL;
>>> +
>>> +     domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>>> +     if (!domain)
>>> +             return NULL;
>>> +
>>> +     domain->iotlb = vhost_iotlb_alloc(0, 0);
>>> +     if (!domain->iotlb)
>>> +             goto err_iotlb;
>>> +
>>> +     domain->iova_limit = iova_limit;
>>> +     domain->bounce_size = PAGE_ALIGN(bounce_size);
>>> +     domain->bounce_maps = vzalloc(bounce_pfns *
>>> +                             sizeof(struct vduse_bounce_map));
>>> +     if (!domain->bounce_maps)
>>> +             goto err_map;
>>> +
>>> +     for (pfn = 0; pfn < bounce_pfns; pfn++) {
>>> +             map = &domain->bounce_maps[pfn];
>>> +             for (i = 0; i < IOVA_MAPS_PER_PAGE; i++)
>>> +                     map->orig_phys[i] = INVALID_PHYS_ADDR;
>>> +     }
>>> +     file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
>>> +                             domain, O_RDWR);
>>> +     if (IS_ERR(file))
>>> +             goto err_file;
>>> +
>>> +     domain->file = file;
>>> +     spin_lock_init(&domain->iotlb_lock);
>>> +     init_iova_domain(&domain->stream_iovad,
>>> +                     IOVA_ALLOC_SIZE, IOVA_START_PFN);
>>> +     init_iova_domain(&domain->consistent_iovad,
>>> +                     PAGE_SIZE, bounce_pfns);
>>
>> Any reason for treating coherent and stream DMA differently (the
>> different granule)?
>>
> To save space for small I/Os (less than PAGE_SIZE). We can have one
> bounce page for multiple small I/Os.
>
>>> +
>>> +     return domain;
>>> +err_file:
>>> +     vfree(domain->bounce_maps);
>>> +err_map:
>>> +     vhost_iotlb_free(domain->iotlb);
>>> +err_iotlb:
>>> +     kfree(domain);
>>> +     return NULL;
>>> +}
>>> +
>>> +int vduse_domain_init(void)
>>> +{
>>> +     return iova_cache_get();
>>> +}
>>> +
>>> +void vduse_domain_exit(void)
>>> +{
>>> +     iova_cache_put();
>>> +}
>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
>>> new file mode 100644
>>> index 000000000000..faeeedfaa786
>>> --- /dev/null
>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.h
>>> @@ -0,0 +1,75 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * MMU-based IOMMU implementation
>>> + *
>>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
>>> + *
>>> + * Author: Xie Yongji <xieyongji@bytedance.com>
>>> + *
>>> + */
>>> +
>>> +#ifndef _VDUSE_IOVA_DOMAIN_H
>>> +#define _VDUSE_IOVA_DOMAIN_H
>>> +
>>> +#include <linux/iova.h>
>>> +#include <linux/dma-mapping.h>
>>> +#include <linux/vhost_iotlb.h>
>>> +
>>> +#define IOVA_START_PFN 1
>>> +
>>> +#define IOVA_ALLOC_ORDER 12
>>> +#define IOVA_ALLOC_SIZE (1 << IOVA_ALLOC_ORDER)
>>> +
>>> +#define IOVA_MAPS_PER_PAGE (1 << (PAGE_SHIFT - IOVA_ALLOC_ORDER))
>>> +
>>> +#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
>>> +
>>> +struct vduse_bounce_map {
>>> +     struct page *bounce_page;
>>> +     u64 orig_phys[IOVA_MAPS_PER_PAGE];
>>
>> Sorry if I have asked this before, but I'm not sure it's worth having
>> this extra complexity. If I read the code correctly,
>> IOVA_MAPS_PER_PAGE is 1 for the archs that have 4K pages. Have you tested
>> the code on archs that have pages larger than 4K?
>>
> No, I haven't tested it. Now I think it's OK to remove this optimization
> from this patchset.
>
> Thanks,
> Yongji
>
Yongji Xie March 25, 2021, 7:38 a.m. UTC | #4
On Thu, Mar 25, 2021 at 12:53 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/24 下午3:39, Yongji Xie 写道:
> > On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/3/15 下午1:37, Xie Yongji 写道:
> >>> This implements an MMU-based IOMMU driver to support mapping
> >>> kernel dma buffer into userspace. The basic idea behind it is
> >>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
> >>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
> >>> that the userspace process is able to use its virtual address to
> >>> access the dma buffer in kernel.
> >>>
> >>> And to avoid security issue, a bounce-buffering mechanism is
> >>> introduced to prevent userspace accessing the original buffer
> >>> directly.
> >>>
> >>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>> ---
> >>>    drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
> >>>    drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
> >>>    2 files changed, 610 insertions(+)
> >>>    create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
> >>>    create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
> >>>
> >>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> >>> new file mode 100644
> >>> index 000000000000..83de216b0e51
> >>> --- /dev/null
> >>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> >>> @@ -0,0 +1,535 @@
> >>> +// SPDX-License-Identifier: GPL-2.0-only
> >>> +/*
> >>> + * MMU-based IOMMU implementation
> >>> + *
> >>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
> >>
> >> 2021 as well.
> >>
> > Sure.
> >
> >>> + *
> >>> + * Author: Xie Yongji <xieyongji@bytedance.com>
> >>> + *
> >>> + */
> >>> +
> >>> +#include <linux/slab.h>
> >>> +#include <linux/file.h>
> >>> +#include <linux/anon_inodes.h>
> >>> +#include <linux/highmem.h>
> >>> +#include <linux/vmalloc.h>
> >>> +#include <linux/vdpa.h>
> >>> +
> >>> +#include "iova_domain.h"
> >>> +
> >>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
> >>> +                              u64 start, u64 last,
> >>> +                              u64 addr, unsigned int perm,
> >>> +                              struct file *file, u64 offset)
> >>> +{
> >>> +     struct vdpa_map_file *map_file;
> >>> +     int ret;
> >>> +
> >>> +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
> >>> +     if (!map_file)
> >>> +             return -ENOMEM;
> >>> +
> >>> +     map_file->file = get_file(file);
> >>> +     map_file->offset = offset;
> >>> +
> >>> +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
> >>> +                                     addr, perm, map_file);
> >>> +     if (ret) {
> >>> +             fput(map_file->file);
> >>> +             kfree(map_file);
> >>> +             return ret;
> >>> +     }
> >>> +     return 0;
> >>> +}
> >>> +
> >>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
> >>> +                               u64 start, u64 last)
> >>> +{
> >>> +     struct vdpa_map_file *map_file;
> >>> +     struct vhost_iotlb_map *map;
> >>> +
> >>> +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
> >>> +             map_file = (struct vdpa_map_file *)map->opaque;
> >>> +             fput(map_file->file);
> >>> +             kfree(map_file);
> >>> +             vhost_iotlb_map_free(domain->iotlb, map);
> >>> +     }
> >>> +}
> >>> +
> >>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> >>> +                      struct vhost_iotlb *iotlb)
> >>> +{
> >>> +     struct vdpa_map_file *map_file;
> >>> +     struct vhost_iotlb_map *map;
> >>> +     u64 start = 0ULL, last = ULLONG_MAX;
> >>> +     int ret;
> >>> +
> >>> +     spin_lock(&domain->iotlb_lock);
> >>> +     vduse_iotlb_del_range(domain, start, last);
> >>> +
> >>> +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
> >>> +          map = vhost_iotlb_itree_next(map, start, last)) {
> >>> +             map_file = (struct vdpa_map_file *)map->opaque;
> >>> +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
> >>> +                                         map->addr, map->perm,
> >>> +                                         map_file->file,
> >>> +                                         map_file->offset);
> >>> +             if (ret)
> >>> +                     goto err;
> >>> +     }
> >>> +     spin_unlock(&domain->iotlb_lock);
> >>> +
> >>> +     return 0;
> >>> +err:
> >>> +     vduse_iotlb_del_range(domain, start, last);
> >>> +     spin_unlock(&domain->iotlb_lock);
> >>> +     return ret;
> >>> +}
> >>> +
> >>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> >>> +                                      u64 iova, u64 size, u64 paddr)
> >>> +{
> >>> +     struct vduse_bounce_map *map;
> >>> +     unsigned int index;
> >>> +     u64 last = iova + size - 1;
> >>> +
> >>> +     while (iova < last) {
> >>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>> +             map->orig_phys[index] = paddr;
> >>> +             paddr += IOVA_ALLOC_SIZE;
> >>> +             iova += IOVA_ALLOC_SIZE;
> >>> +     }
> >>> +}
> >>> +
> >>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
> >>> +                                        u64 iova, u64 size)
> >>> +{
> >>> +     struct vduse_bounce_map *map;
> >>> +     unsigned int index;
> >>> +     u64 last = iova + size - 1;
> >>> +
> >>> +     while (iova < last) {
> >>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>> +             map->orig_phys[index] = INVALID_PHYS_ADDR;
> >>> +             iova += IOVA_ALLOC_SIZE;
> >>> +     }
> >>> +}
> >>> +
> >>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
> >>> +                   enum dma_data_direction dir)
> >>> +{
> >>> +     unsigned long pfn = PFN_DOWN(orig);
> >>> +
> >>> +     if (PageHighMem(pfn_to_page(pfn))) {
> >>> +             unsigned int offset = offset_in_page(orig);
> >>> +             char *buffer;
> >>> +             unsigned int sz = 0;
> >>> +
> >>> +             while (size) {
> >>> +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
> >>> +
> >>> +                     buffer = kmap_atomic(pfn_to_page(pfn));
> >>
> >> So kmap_atomic() automatically takes the fast path if the page does
> >> not belong to highmem.
> >>
> >> I think we can remove the condition and just use kmap_atomic() for all
> >> the cases here.
> >>
> > Looks good to me.
> >
> >>> +                     if (dir == DMA_TO_DEVICE)
> >>> +                             memcpy(addr, buffer + offset, sz);
> >>> +                     else
> >>> +                             memcpy(buffer + offset, addr, sz);
> >>> +                     kunmap_atomic(buffer);
> >>> +
> >>> +                     size -= sz;
> >>> +                     pfn++;
> >>> +                     addr += sz;
> >>> +                     offset = 0;
> >>> +             }
> >>> +     } else if (dir == DMA_TO_DEVICE) {
> >>> +             memcpy(addr, phys_to_virt(orig), size);
> >>> +     } else {
> >>> +             memcpy(phys_to_virt(orig), addr, size);
> >>> +     }
> >>> +}
> >>> +
> >>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> >>> +                             dma_addr_t iova, size_t size,
> >>> +                             enum dma_data_direction dir)
> >>> +{
> >>> +     struct vduse_bounce_map *map;
> >>> +     unsigned int index, offset;
> >>> +     void *addr;
> >>> +     size_t sz;
> >>> +
> >>> +     while (size) {
> >>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +             offset = offset_in_page(iova);
> >>> +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
> >>> +
> >>> +             if (map->bounce_page &&
> >>> +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
> >>> +                     addr = page_address(map->bounce_page) + offset;
> >>> +                     index = offset >> IOVA_ALLOC_ORDER;
> >>> +                     do_bounce(map->orig_phys[index], addr, sz, dir);
> >>> +             }
> >>> +             size -= sz;
> >>> +             iova += sz;
> >>> +     }
> >>> +}
> >>> +
> >>> +static struct page *
> >>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
> >>> +{
> >>> +     u64 start = iova & PAGE_MASK;
> >>> +     u64 last = start + PAGE_SIZE - 1;
> >>> +     struct vhost_iotlb_map *map;
> >>> +     struct page *page = NULL;
> >>> +
> >>> +     spin_lock(&domain->iotlb_lock);
> >>> +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
> >>> +     if (!map)
> >>> +             goto out;
> >>> +
> >>> +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
> >>> +     get_page(page);
> >>> +out:
> >>> +     spin_unlock(&domain->iotlb_lock);
> >>> +
> >>> +     return page;
> >>> +}
> >>> +
> >>> +static struct page *
> >>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
> >>> +{
> >>> +     u64 start = iova & PAGE_MASK;
> >>> +     struct page *page = alloc_page(GFP_KERNEL);
> >>> +     struct vduse_bounce_map *map;
> >>> +
> >>> +     if (!page)
> >>> +             return NULL;
> >>> +
> >>> +     spin_lock(&domain->iotlb_lock);
> >>> +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +     if (map->bounce_page) {
> >>> +             __free_page(page);
> >>> +             goto out;
> >>> +     }
> >>> +     map->bounce_page = page;
> >>> +
> >>> +     /* paired with vduse_domain_map_page() */
> >>> +     smp_mb();
> >>
> >> So this is suspicious. It's better to explain it like: we need to make
> >> sure A is done after B.
> > OK. I see. It's used to protect this pattern:
> >
> >     vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
> >     write map->bounce_page                   write map->orig_phys
> >     mb()                                     mb()
> >     read map->orig_phys                      read map->bounce_page
> >
> > Make sure there will always be a path to do bouncing.
>
>
> Ok.
>
>
> >
> >> And it looks to me that the iotlb_lock is sufficient to do the
> >> synchronization here. E.g. is there any reason that you don't take it in
> >> vduse_domain_map_bounce_page()?
> >>
> > Yes, we can. But the performance in multi-queue cases will go down if
> > we use iotlb_lock on this critical path.
> >
> >> And what's more, is there any way to avoid holding the spinlock during
> >> bouncing?
> >>
> > Looks like we can't. In the case that multiple page faults happen on
> > the same page, we should make sure the bouncing is done before any
> > page fault handler returns.
>
>
> So it looks to me that all this extra complexity comes from the fact that
> bounce_page and orig_phys are set in different places, so we need to
> do the bouncing in two places.
>
> I wonder how much we can gain from the "lazy" bouncing in the page fault.
> The buffer mapped via dma_ops from the virtio driver is expected to be
> accessed by userspace soon.  It looks to me that we can do all of that
> during dma_map(); then things would be greatly simplified.
>

If so, we need to allocate lots of pages from the pool reserved for
atomic memory allocation requests.

Thanks,
Yongji
Jason Wang March 26, 2021, 4:26 a.m. UTC | #5
在 2021/3/25 下午3:38, Yongji Xie 写道:
> On Thu, Mar 25, 2021 at 12:53 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/3/24 下午3:39, Yongji Xie 写道:
>>> On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2021/3/15 下午1:37, Xie Yongji 写道:
>>>>> This implements an MMU-based IOMMU driver to support mapping
>>>>> kernel dma buffer into userspace. The basic idea behind it is
>>>>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
>>>>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
>>>>> that the userspace process is able to use its virtual address to
>>>>> access the dma buffer in kernel.
>>>>>
>>>>> And to avoid security issue, a bounce-buffering mechanism is
>>>>> introduced to prevent userspace accessing the original buffer
>>>>> directly.
>>>>>
>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>>>> ---
>>>>>     drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
>>>>>     drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
>>>>>     2 files changed, 610 insertions(+)
>>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
>>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
>>>>>
>>>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>> new file mode 100644
>>>>> index 000000000000..83de216b0e51
>>>>> --- /dev/null
>>>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>> @@ -0,0 +1,535 @@
>>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>>> +/*
>>>>> + * MMU-based IOMMU implementation
>>>>> + *
>>>>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
>>>> 2021 as well.
>>>>
>>> Sure.
>>>
>>>>> + *
>>>>> + * Author: Xie Yongji <xieyongji@bytedance.com>
>>>>> + *
>>>>> + */
>>>>> +
>>>>> +#include <linux/slab.h>
>>>>> +#include <linux/file.h>
>>>>> +#include <linux/anon_inodes.h>
>>>>> +#include <linux/highmem.h>
>>>>> +#include <linux/vmalloc.h>
>>>>> +#include <linux/vdpa.h>
>>>>> +
>>>>> +#include "iova_domain.h"
>>>>> +
>>>>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
>>>>> +                              u64 start, u64 last,
>>>>> +                              u64 addr, unsigned int perm,
>>>>> +                              struct file *file, u64 offset)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     int ret;
>>>>> +
>>>>> +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
>>>>> +     if (!map_file)
>>>>> +             return -ENOMEM;
>>>>> +
>>>>> +     map_file->file = get_file(file);
>>>>> +     map_file->offset = offset;
>>>>> +
>>>>> +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
>>>>> +                                     addr, perm, map_file);
>>>>> +     if (ret) {
>>>>> +             fput(map_file->file);
>>>>> +             kfree(map_file);
>>>>> +             return ret;
>>>>> +     }
>>>>> +     return 0;
>>>>> +}
>>>>> +
>>>>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
>>>>> +                               u64 start, u64 last)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +
>>>>> +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
>>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>>>> +             fput(map_file->file);
>>>>> +             kfree(map_file);
>>>>> +             vhost_iotlb_map_free(domain->iotlb, map);
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
>>>>> +                      struct vhost_iotlb *iotlb)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +     u64 start = 0ULL, last = ULLONG_MAX;
>>>>> +     int ret;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     vduse_iotlb_del_range(domain, start, last);
>>>>> +
>>>>> +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
>>>>> +          map = vhost_iotlb_itree_next(map, start, last)) {
>>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>>>> +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
>>>>> +                                         map->addr, map->perm,
>>>>> +                                         map_file->file,
>>>>> +                                         map_file->offset);
>>>>> +             if (ret)
>>>>> +                     goto err;
>>>>> +     }
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +
>>>>> +     return 0;
>>>>> +err:
>>>>> +     vduse_iotlb_del_range(domain, start, last);
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +     return ret;
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>>>> +                                      u64 iova, u64 size, u64 paddr)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index;
>>>>> +     u64 last = iova + size - 1;
>>>>> +
>>>>> +     while (iova < last) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>>>> +             map->orig_phys[index] = paddr;
>>>>> +             paddr += IOVA_ALLOC_SIZE;
>>>>> +             iova += IOVA_ALLOC_SIZE;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
>>>>> +                                        u64 iova, u64 size)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index;
>>>>> +     u64 last = iova + size - 1;
>>>>> +
>>>>> +     while (iova < last) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>>>> +             map->orig_phys[index] = INVALID_PHYS_ADDR;
>>>>> +             iova += IOVA_ALLOC_SIZE;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
>>>>> +                   enum dma_data_direction dir)
>>>>> +{
>>>>> +     unsigned long pfn = PFN_DOWN(orig);
>>>>> +
>>>>> +     if (PageHighMem(pfn_to_page(pfn))) {
>>>>> +             unsigned int offset = offset_in_page(orig);
>>>>> +             char *buffer;
>>>>> +             unsigned int sz = 0;
>>>>> +
>>>>> +             while (size) {
>>>>> +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
>>>>> +
>>>>> +                     buffer = kmap_atomic(pfn_to_page(pfn));
>>>> So kmap_atomic() automatically takes the fast path if the page does
>>>> not belong to highmem.
>>>>
>>>> I think we can remove the condition and just use kmap_atomic() for all
>>>> the cases here.
>>>>
>>> Looks good to me.
>>>
>>>>> +                     if (dir == DMA_TO_DEVICE)
>>>>> +                             memcpy(addr, buffer + offset, sz);
>>>>> +                     else
>>>>> +                             memcpy(buffer + offset, addr, sz);
>>>>> +                     kunmap_atomic(buffer);
>>>>> +
>>>>> +                     size -= sz;
>>>>> +                     pfn++;
>>>>> +                     addr += sz;
>>>>> +                     offset = 0;
>>>>> +             }
>>>>> +     } else if (dir == DMA_TO_DEVICE) {
>>>>> +             memcpy(addr, phys_to_virt(orig), size);
>>>>> +     } else {
>>>>> +             memcpy(phys_to_virt(orig), addr, size);
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
>>>>> +                             dma_addr_t iova, size_t size,
>>>>> +                             enum dma_data_direction dir)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index, offset;
>>>>> +     void *addr;
>>>>> +     size_t sz;
>>>>> +
>>>>> +     while (size) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             offset = offset_in_page(iova);
>>>>> +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
>>>>> +
>>>>> +             if (map->bounce_page &&
>>>>> +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
>>>>> +                     addr = page_address(map->bounce_page) + offset;
>>>>> +                     index = offset >> IOVA_ALLOC_ORDER;
>>>>> +                     do_bounce(map->orig_phys[index], addr, sz, dir);
>>>>> +             }
>>>>> +             size -= sz;
>>>>> +             iova += sz;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static struct page *
>>>>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
>>>>> +{
>>>>> +     u64 start = iova & PAGE_MASK;
>>>>> +     u64 last = start + PAGE_SIZE - 1;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +     struct page *page = NULL;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
>>>>> +     if (!map)
>>>>> +             goto out;
>>>>> +
>>>>> +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
>>>>> +     get_page(page);
>>>>> +out:
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +
>>>>> +     return page;
>>>>> +}
>>>>> +
>>>>> +static struct page *
>>>>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
>>>>> +{
>>>>> +     u64 start = iova & PAGE_MASK;
>>>>> +     struct page *page = alloc_page(GFP_KERNEL);
>>>>> +     struct vduse_bounce_map *map;
>>>>> +
>>>>> +     if (!page)
>>>>> +             return NULL;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +     if (map->bounce_page) {
>>>>> +             __free_page(page);
>>>>> +             goto out;
>>>>> +     }
>>>>> +     map->bounce_page = page;
>>>>> +
>>>>> +     /* paired with vduse_domain_map_page() */
>>>>> +     smp_mb();
>>>> So this is suspicious. It's better to explain it like: we need to make
>>>> sure A is done after B.
>>> OK. I see. It's used to protect this pattern:
>>>
>>>      vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
>>>      write map->bounce_page                           write map->orig_phys
>>>      mb()                                                            mb()
>>>      read map->orig_phys                                 read map->bounce_page
>>>
>>> Make sure there will always be a path to do bouncing.
>>
>> Ok.
>>
>>
>>>> And it looks to me that the iotlb_lock is sufficient to do the
>>>> synchronization here. E.g. is there any reason that you don't take it in
>>>> vduse_domain_map_bounce_page()?
>>>>
>>> Yes, we can. But the performance in multi-queue cases will go down if
>>> we use iotlb_lock on this critical path.
>>>
>>>> And what's more, is there any way to avoid holding the spinlock during
>>>> bouncing?
>>>>
>>> Looks like we can't. In the case that multiple page faults happen on
>>> the same page, we should make sure the bouncing is done before any
>>> page fault handler returns.
>>
>> So it looks to me that all this extra complexity comes from the fact that
>> bounce_page and orig_phys are set in different places, so we need to
>> do the bouncing in two places.
>>
>> I wonder how much we can gain from the "lazy" bouncing in the page fault.
>> The buffer mapped via dma_ops from the virtio driver is expected to be
>> accessed by userspace soon.  It looks to me that we can do all of that
>> during dma_map(); then things would be greatly simplified.
>>
> If so, we need to allocate lots of pages from the pool reserved for
> atomic memory allocation requests.


This should be fine; a lot of drivers try to allocate pages in atomic
context. The point is to simplify the code so that its correctness is
easy to determine; we can then add optimizations on top simply by
benchmarking the difference.

E.g. we have several places that access orig_phys:

1) map_page(), write
2) unmap_page(), write
3) page fault handler, read

It's not clear to me how they are synchronized. If they are
synchronized implicitly (via the iova allocator?), we'd better document it.
Or simply use a spinlock (which is the way I'd prefer to go). We
probably don't need to worry too much about the cost of the spinlock since
the iova allocator uses it heavily.

Thanks


>
> Thanks,
> Yongji
>
Yongji Xie March 26, 2021, 5:14 a.m. UTC | #6
On Fri, Mar 26, 2021 at 12:27 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/25 下午3:38, Yongji Xie 写道:
> > On Thu, Mar 25, 2021 at 12:53 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/3/24 下午3:39, Yongji Xie 写道:
> >>> On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2021/3/15 下午1:37, Xie Yongji 写道:
> >>>>> This implements an MMU-based IOMMU driver to support mapping
> >>>>> kernel dma buffer into userspace. The basic idea behind it is
> >>>>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
> >>>>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
> >>>>> that the userspace process is able to use its virtual address to
> >>>>> access the dma buffer in kernel.
> >>>>>
> >>>>> And to avoid security issue, a bounce-buffering mechanism is
> >>>>> introduced to prevent userspace accessing the original buffer
> >>>>> directly.
> >>>>>
> >>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>>>> ---
> >>>>>     drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
> >>>>>     drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
> >>>>>     2 files changed, 610 insertions(+)
> >>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
> >>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
> >>>>>
> >>>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> >>>>> new file mode 100644
> >>>>> index 000000000000..83de216b0e51
> >>>>> --- /dev/null
> >>>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> >>>>> @@ -0,0 +1,535 @@
> >>>>> +// SPDX-License-Identifier: GPL-2.0-only
> >>>>> +/*
> >>>>> + * MMU-based IOMMU implementation
> >>>>> + *
> >>>>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
> >>>> 2021 as well.
> >>>>
> >>> Sure.
> >>>
> >>>>> + *
> >>>>> + * Author: Xie Yongji <xieyongji@bytedance.com>
> >>>>> + *
> >>>>> + */
> >>>>> +
> >>>>> +#include <linux/slab.h>
> >>>>> +#include <linux/file.h>
> >>>>> +#include <linux/anon_inodes.h>
> >>>>> +#include <linux/highmem.h>
> >>>>> +#include <linux/vmalloc.h>
> >>>>> +#include <linux/vdpa.h>
> >>>>> +
> >>>>> +#include "iova_domain.h"
> >>>>> +
> >>>>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
> >>>>> +                              u64 start, u64 last,
> >>>>> +                              u64 addr, unsigned int perm,
> >>>>> +                              struct file *file, u64 offset)
> >>>>> +{
> >>>>> +     struct vdpa_map_file *map_file;
> >>>>> +     int ret;
> >>>>> +
> >>>>> +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
> >>>>> +     if (!map_file)
> >>>>> +             return -ENOMEM;
> >>>>> +
> >>>>> +     map_file->file = get_file(file);
> >>>>> +     map_file->offset = offset;
> >>>>> +
> >>>>> +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
> >>>>> +                                     addr, perm, map_file);
> >>>>> +     if (ret) {
> >>>>> +             fput(map_file->file);
> >>>>> +             kfree(map_file);
> >>>>> +             return ret;
> >>>>> +     }
> >>>>> +     return 0;
> >>>>> +}
> >>>>> +
> >>>>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
> >>>>> +                               u64 start, u64 last)
> >>>>> +{
> >>>>> +     struct vdpa_map_file *map_file;
> >>>>> +     struct vhost_iotlb_map *map;
> >>>>> +
> >>>>> +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
> >>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
> >>>>> +             fput(map_file->file);
> >>>>> +             kfree(map_file);
> >>>>> +             vhost_iotlb_map_free(domain->iotlb, map);
> >>>>> +     }
> >>>>> +}
> >>>>> +
> >>>>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> >>>>> +                      struct vhost_iotlb *iotlb)
> >>>>> +{
> >>>>> +     struct vdpa_map_file *map_file;
> >>>>> +     struct vhost_iotlb_map *map;
> >>>>> +     u64 start = 0ULL, last = ULLONG_MAX;
> >>>>> +     int ret;
> >>>>> +
> >>>>> +     spin_lock(&domain->iotlb_lock);
> >>>>> +     vduse_iotlb_del_range(domain, start, last);
> >>>>> +
> >>>>> +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
> >>>>> +          map = vhost_iotlb_itree_next(map, start, last)) {
> >>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
> >>>>> +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
> >>>>> +                                         map->addr, map->perm,
> >>>>> +                                         map_file->file,
> >>>>> +                                         map_file->offset);
> >>>>> +             if (ret)
> >>>>> +                     goto err;
> >>>>> +     }
> >>>>> +     spin_unlock(&domain->iotlb_lock);
> >>>>> +
> >>>>> +     return 0;
> >>>>> +err:
> >>>>> +     vduse_iotlb_del_range(domain, start, last);
> >>>>> +     spin_unlock(&domain->iotlb_lock);
> >>>>> +     return ret;
> >>>>> +}
> >>>>> +
> >>>>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> >>>>> +                                      u64 iova, u64 size, u64 paddr)
> >>>>> +{
> >>>>> +     struct vduse_bounce_map *map;
> >>>>> +     unsigned int index;
> >>>>> +     u64 last = iova + size - 1;
> >>>>> +
> >>>>> +     while (iova < last) {
> >>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>>>> +             map->orig_phys[index] = paddr;
> >>>>> +             paddr += IOVA_ALLOC_SIZE;
> >>>>> +             iova += IOVA_ALLOC_SIZE;
> >>>>> +     }
> >>>>> +}
> >>>>> +
> >>>>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
> >>>>> +                                        u64 iova, u64 size)
> >>>>> +{
> >>>>> +     struct vduse_bounce_map *map;
> >>>>> +     unsigned int index;
> >>>>> +     u64 last = iova + size - 1;
> >>>>> +
> >>>>> +     while (iova < last) {
> >>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>>>> +             map->orig_phys[index] = INVALID_PHYS_ADDR;
> >>>>> +             iova += IOVA_ALLOC_SIZE;
> >>>>> +     }
> >>>>> +}
> >>>>> +
> >>>>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
> >>>>> +                   enum dma_data_direction dir)
> >>>>> +{
> >>>>> +     unsigned long pfn = PFN_DOWN(orig);
> >>>>> +
> >>>>> +     if (PageHighMem(pfn_to_page(pfn))) {
> >>>>> +             unsigned int offset = offset_in_page(orig);
> >>>>> +             char *buffer;
> >>>>> +             unsigned int sz = 0;
> >>>>> +
> >>>>> +             while (size) {
> >>>>> +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
> >>>>> +
> >>>>> +                     buffer = kmap_atomic(pfn_to_page(pfn));
> >>>> So kmap_atomic() can automatically go with the fast path if the page
> >>>> does not belong to highmem.
> >>>>
> >>>> I think we can remove the condition and just use kmap_atomic() for all
> >>>> the cases here.
> >>>>
> >>> Looks good to me.
> >>>
> >>>>> +                     if (dir == DMA_TO_DEVICE)
> >>>>> +                             memcpy(addr, buffer + offset, sz);
> >>>>> +                     else
> >>>>> +                             memcpy(buffer + offset, addr, sz);
> >>>>> +                     kunmap_atomic(buffer);
> >>>>> +
> >>>>> +                     size -= sz;
> >>>>> +                     pfn++;
> >>>>> +                     addr += sz;
> >>>>> +                     offset = 0;
> >>>>> +             }
> >>>>> +     } else if (dir == DMA_TO_DEVICE) {
> >>>>> +             memcpy(addr, phys_to_virt(orig), size);
> >>>>> +     } else {
> >>>>> +             memcpy(phys_to_virt(orig), addr, size);
> >>>>> +     }
> >>>>> +}
> >>>>> +
> >>>>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> >>>>> +                             dma_addr_t iova, size_t size,
> >>>>> +                             enum dma_data_direction dir)
> >>>>> +{
> >>>>> +     struct vduse_bounce_map *map;
> >>>>> +     unsigned int index, offset;
> >>>>> +     void *addr;
> >>>>> +     size_t sz;
> >>>>> +
> >>>>> +     while (size) {
> >>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>>>> +             offset = offset_in_page(iova);
> >>>>> +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
> >>>>> +
> >>>>> +             if (map->bounce_page &&
> >>>>> +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
> >>>>> +                     addr = page_address(map->bounce_page) + offset;
> >>>>> +                     index = offset >> IOVA_ALLOC_ORDER;
> >>>>> +                     do_bounce(map->orig_phys[index], addr, sz, dir);
> >>>>> +             }
> >>>>> +             size -= sz;
> >>>>> +             iova += sz;
> >>>>> +     }
> >>>>> +}
> >>>>> +
> >>>>> +static struct page *
> >>>>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
> >>>>> +{
> >>>>> +     u64 start = iova & PAGE_MASK;
> >>>>> +     u64 last = start + PAGE_SIZE - 1;
> >>>>> +     struct vhost_iotlb_map *map;
> >>>>> +     struct page *page = NULL;
> >>>>> +
> >>>>> +     spin_lock(&domain->iotlb_lock);
> >>>>> +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
> >>>>> +     if (!map)
> >>>>> +             goto out;
> >>>>> +
> >>>>> +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
> >>>>> +     get_page(page);
> >>>>> +out:
> >>>>> +     spin_unlock(&domain->iotlb_lock);
> >>>>> +
> >>>>> +     return page;
> >>>>> +}
> >>>>> +
> >>>>> +static struct page *
> >>>>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
> >>>>> +{
> >>>>> +     u64 start = iova & PAGE_MASK;
> >>>>> +     struct page *page = alloc_page(GFP_KERNEL);
> >>>>> +     struct vduse_bounce_map *map;
> >>>>> +
> >>>>> +     if (!page)
> >>>>> +             return NULL;
> >>>>> +
> >>>>> +     spin_lock(&domain->iotlb_lock);
> >>>>> +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>>>> +     if (map->bounce_page) {
> >>>>> +             __free_page(page);
> >>>>> +             goto out;
> >>>>> +     }
> >>>>> +     map->bounce_page = page;
> >>>>> +
> >>>>> +     /* paired with vduse_domain_map_page() */
> >>>>> +     smp_mb();
> >>>> So this is suspicious. It's better to explain it like: we need to make
> >>>> sure A must be done after B.
> >>> OK. I see. It's used to protect this pattern:
> >>>
> >>>      vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
> >>>      write map->bounce_page                           write map->orig_phys
> >>>      mb()                                                            mb()
> >>>      read map->orig_phys                                 read map->bounce_page
> >>>
> >>> Make sure there will always be a path to do bouncing.
> >>
> >> Ok.
> >>
> >>
> >>>> And it looks to me like the iotlb_lock is sufficient to do the
> >>>> synchronization here. E.g. is there any reason that you don't take it in
> >>>> vduse_domain_map_bounce_page()?
> >>>>
> >>> Yes, we can. But the performance in multi-queue cases will go down if
> >>> we use iotlb_lock on this critical path.
> >>>
> >>>> And what's more, is there any way to avoid holding the spinlock during
> >>>> bouncing?
> >>>>
> >>> Looks like we can't. In the case that multiple page faults happen on
> >>> the same page, we should make sure the bouncing is done before any
> >>> page fault handler returns.
> >>
> >> So it looks to me like all that extra complexity comes from the fact that
> >> the bounce_page and orig_phys are set in different places, so we need to
> >> do the bouncing in two places.
> >>
> >> I wonder how much we can gain from the "lazy" bouncing in page fault.
> >> The buffer mapped via dma_ops from the virtio driver is expected to be
> >> accessed by userspace soon.  It looks to me like we can do all of that
> >> during dma_map(), and then things would be greatly simplified.
> >>
> > If so, we need to allocate lots of pages from the pool reserved for
> > atomic memory allocation requests.
>
>
> This should be fine, a lot of drivers try to allocate pages in atomic
> context. The point is to simplify the code to make it easy to
> determine the correctness, so we can add optimizations on top simply by
> benchmarking the difference.
>

OK. I will use this way in the next version.

> E.g. we have several places that access orig_phys:
>
> 1) map_page(), write
> 2) unmap_page(), write
> 3) page fault handler, read
>
> It's not clear to me how they are synchronized. Or if they are
> synchronized implicitly (via the iova allocator?), we'd better document it.

Yes.

> Or simply use a spinlock (which is the way I'd prefer to go). We
> probably don't need to worry too much about the cost of the spinlock since
> the iova allocator uses it heavily.
>

Actually iova allocator implements a per-CPU cache to optimize it.

Thanks,
Yongji
Yongji Xie March 26, 2021, 6:56 a.m. UTC | #7
On Fri, Mar 26, 2021 at 2:16 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/26 下午1:14, Yongji Xie 写道:
>
> +     }
> +     map->bounce_page = page;
> +
> +     /* paired with vduse_domain_map_page() */
> +     smp_mb();
>
> So this is suspicious. It's better to explain like, we need make sure A
> must be done after B.
>
> OK. I see. It's used to protect this pattern:
>
>      vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
>      write map->bounce_page                           write map->orig_phys
>      mb()                                                            mb()
>      read map->orig_phys                                 read map->bounce_page
>
> Make sure there will always be a path to do bouncing.
>
> Ok.
>
>
> And it looks to me the iotlb_lock is sufficnet to do the synchronization
> here. E.g any reason that you don't take it in
> vduse_domain_map_bounce_page().
>
> Yes, we can. But the performance in multi-queue cases will go down if
> we use iotlb_lock on this critical path.
>
> And what's more, is there anyway to aovid holding the spinlock during
> bouncing?
>
> Looks like we can't. In the case that multiple page faults happen on
> the same page, we should make sure the bouncing is done before any
> page fault handler returns.
>
> So it looks to me all those extra complexitiy comes from the fact that
> the bounce_page and orig_phys are set by different places so we need to
> do the bouncing in two places.
>
> I wonder how much we can gain from the "lazy" boucning in page fault.
> The buffer mapped via dma_ops from virtio driver is expected to be
> accessed by the userspace soon.  It looks to me we can do all those
> stuffs during dma_map() then things would be greatly simplified.
>
> If so, we need to allocate lots of pages from the pool reserved for
> atomic memory allocation requests.
>
> This should be fine, a lot of drivers tries to allocate pages in atomic
> context. The point is to simplify the codes to make it easy to
> determince the correctness so we can add optimization on top simply by
> benchmarking the difference.
>
> OK. I will use this way in the next version.
>
> E.g we have serveral places that accesses orig_phys:
>
> 1) map_page(), write
> 2) unmap_page(), write
> 3) page fault handler, read
>
> It's not clear to me how they were synchronized. Or if it was
> synchronzied implicitly (via iova allocator?), we'd better document it.
>
> Yes.
>
> Or simply use spinlock (which is the preferrable way I'd like to go). We
> probably don't need to worry too much about the cost of spinlock since
> iova allocater use it heavily.
>
> Actually iova allocator implements a per-CPU cache to optimize it.
>
> Thanks,
> Yongji
>
>
> Right, but having had a quick glance, I guess what you meant is that usually there's no lock contention unless there is CPU hot-plug. This can work, but the problem is that such synchronization depends on the internal implementation of the IOVA allocator, which is kind of fragile. I still think we should do that on our own.
>

I might miss something. Looks like we don't need any synchronization
if the page fault handler is removed as you suggested. We should not
access the same orig_phys concurrently (in map_page() and
unmap_page()) unless we free the iova before accessing.

Thanks,
Yongji
Jason Wang March 26, 2021, 7:36 a.m. UTC | #8
在 2021/3/26 下午2:56, Yongji Xie 写道:
> On Fri, Mar 26, 2021 at 2:16 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/3/26 下午1:14, Yongji Xie 写道:
>>
>> +     }
>> +     map->bounce_page = page;
>> +
>> +     /* paired with vduse_domain_map_page() */
>> +     smp_mb();
>>
>> So this is suspicious. It's better to explain like, we need make sure A
>> must be done after B.
>>
>> OK. I see. It's used to protect this pattern:
>>
>>       vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
>>       write map->bounce_page                           write map->orig_phys
>>       mb()                                                            mb()
>>       read map->orig_phys                                 read map->bounce_page
>>
>> Make sure there will always be a path to do bouncing.
>>
>> Ok.
>>
>>
>> And it looks to me the iotlb_lock is sufficnet to do the synchronization
>> here. E.g any reason that you don't take it in
>> vduse_domain_map_bounce_page().
>>
>> Yes, we can. But the performance in multi-queue cases will go down if
>> we use iotlb_lock on this critical path.
>>
>> And what's more, is there anyway to aovid holding the spinlock during
>> bouncing?
>>
>> Looks like we can't. In the case that multiple page faults happen on
>> the same page, we should make sure the bouncing is done before any
>> page fault handler returns.
>>
>> So it looks to me all those extra complexitiy comes from the fact that
>> the bounce_page and orig_phys are set by different places so we need to
>> do the bouncing in two places.
>>
>> I wonder how much we can gain from the "lazy" boucning in page fault.
>> The buffer mapped via dma_ops from virtio driver is expected to be
>> accessed by the userspace soon.  It looks to me we can do all those
>> stuffs during dma_map() then things would be greatly simplified.
>>
>> If so, we need to allocate lots of pages from the pool reserved for
>> atomic memory allocation requests.
>>
>> This should be fine, a lot of drivers tries to allocate pages in atomic
>> context. The point is to simplify the codes to make it easy to
>> determince the correctness so we can add optimization on top simply by
>> benchmarking the difference.
>>
>> OK. I will use this way in the next version.
>>
>> E.g we have serveral places that accesses orig_phys:
>>
>> 1) map_page(), write
>> 2) unmap_page(), write
>> 3) page fault handler, read
>>
>> It's not clear to me how they were synchronized. Or if it was
>> synchronzied implicitly (via iova allocator?), we'd better document it.
>>
>> Yes.
>>
>> Or simply use spinlock (which is the preferrable way I'd like to go). We
>> probably don't need to worry too much about the cost of spinlock since
>> iova allocater use it heavily.
>>
>> Actually iova allocator implements a per-CPU cache to optimize it.
>>
>> Thanks,
>> Yongji
>>
>>
>> Right, but have a quick glance, I guess what you meant is that usually there's no lock contention unless cpu hot-plug. This can work but the problem is that such synchornization depends on the internal implementation of IOVA allocator which is kind of fragile. I still think we should do that on our own.
>>
> I might miss something. Looks like we don't need any synchronization
> if the page fault handler is removed as you suggested. We should not
> access the same orig_phys concurrently (in map_page() and
> unmap_page()) unless we free the iova before accessing.
>
> Thanks,
> Yongji


You're right. I overestimated the complexity that is required by the
synchronization.

Thanks


>
diff mbox series

Patch

diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
new file mode 100644
index 000000000000..83de216b0e51
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -0,0 +1,535 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMU-based IOMMU implementation
+ *
+ * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/vdpa.h>
+
+#include "iova_domain.h"
+
+/*
+ * Insert the range [start, last] -> addr into the domain's IOTLB, attaching
+ * a newly allocated vdpa_map_file (file reference plus offset) as the
+ * entry's opaque context.
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be allocated, or the
+ * error reported by vhost_iotlb_add_range_ctx().
+ */
+static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
+				 u64 start, u64 last,
+				 u64 addr, unsigned int perm,
+				 struct file *file, u64 offset)
+{
+	struct vdpa_map_file *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+	int err;
+
+	if (!ctx)
+		return -ENOMEM;
+
+	/* The IOTLB entry holds its own reference on the backing file. */
+	ctx->file = get_file(file);
+	ctx->offset = offset;
+
+	err = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
+					addr, perm, ctx);
+	if (!err)
+		return 0;
+
+	/* Insertion failed: drop the reference and the context again. */
+	fput(ctx->file);
+	kfree(ctx);
+	return err;
+}
+
+/*
+ * Remove every IOTLB entry overlapping [start, last], dropping the file
+ * reference and freeing the vdpa_map_file attached to each entry.
+ */
+static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
+				  u64 start, u64 last)
+{
+	struct vhost_iotlb_map *entry;
+
+	for (;;) {
+		struct vdpa_map_file *ctx;
+
+		/* Restart the lookup each time: the previous hit was freed. */
+		entry = vhost_iotlb_itree_first(domain->iotlb, start, last);
+		if (!entry)
+			break;
+
+		ctx = (struct vdpa_map_file *)entry->opaque;
+		fput(ctx->file);
+		kfree(ctx);
+		vhost_iotlb_map_free(domain->iotlb, entry);
+	}
+}
+
+/*
+ * Replace the domain's entire IOTLB with a copy of @iotlb.
+ *
+ * All existing entries are dropped first. If copying any entry fails, the
+ * entries added so far are removed again (leaving the domain's IOTLB empty)
+ * and the error is returned. Returns 0 on success.
+ */
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+			 struct vhost_iotlb *iotlb)
+{
+	const u64 start = 0ULL, last = ULLONG_MAX;
+	struct vhost_iotlb_map *src;
+	int ret = 0;
+
+	spin_lock(&domain->iotlb_lock);
+	vduse_iotlb_del_range(domain, start, last);
+
+	src = vhost_iotlb_itree_first(iotlb, start, last);
+	while (src) {
+		struct vdpa_map_file *ctx = (struct vdpa_map_file *)src->opaque;
+
+		ret = vduse_iotlb_add_range(domain, src->start, src->last,
+					    src->addr, src->perm,
+					    ctx->file, ctx->offset);
+		if (ret) {
+			/* Do not leave a partially populated table behind. */
+			vduse_iotlb_del_range(domain, start, last);
+			break;
+		}
+		src = vhost_iotlb_itree_next(src, start, last);
+	}
+	spin_unlock(&domain->iotlb_lock);
+
+	return ret;
+}
+
+/*
+ * Record, for every IOVA_ALLOC_SIZE granule touched by [iova, iova + size),
+ * the physical address of the original buffer that the granule shadows.
+ * @paddr is the physical address corresponding to @iova and is advanced in
+ * step with it.
+ */
+static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
+					 u64 iova, u64 size, u64 paddr)
+{
+	struct vduse_bounce_map *map;
+	unsigned int index;
+	u64 last = iova + size - 1;
+
+	/*
+	 * 'last' is the inclusive end of the range, so the loop must run
+	 * while iova <= last. Using '<' would skip the granule holding the
+	 * final byte whenever the range ends on the first byte of a granule
+	 * (e.g. size == 1 or size == IOVA_ALLOC_SIZE + 1).
+	 */
+	while (iova <= last) {
+		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
+		map->orig_phys[index] = paddr;
+		paddr += IOVA_ALLOC_SIZE;
+		iova += IOVA_ALLOC_SIZE;
+	}
+}
+
+/*
+ * Mark every IOVA_ALLOC_SIZE granule touched by [iova, iova + size) as no
+ * longer shadowing an original buffer by resetting its orig_phys[] slot to
+ * INVALID_PHYS_ADDR.
+ */
+static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
+					   u64 iova, u64 size)
+{
+	struct vduse_bounce_map *map;
+	unsigned int index;
+	u64 last = iova + size - 1;
+
+	/*
+	 * 'last' is inclusive, hence '<='. With '<' the granule containing
+	 * the final byte would stay mapped whenever the range ends on the
+	 * first byte of a granule (e.g. size == 1).
+	 */
+	while (iova <= last) {
+		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
+		map->orig_phys[index] = INVALID_PHYS_ADDR;
+		iova += IOVA_ALLOC_SIZE;
+	}
+}
+
+/*
+ * Copy @size bytes between the original buffer at physical address @orig and
+ * the bounce buffer at kernel virtual address @addr.
+ *
+ * DMA_TO_DEVICE copies original -> bounce; every other direction copies
+ * bounce -> original.
+ *
+ * The original buffer is walked one page at a time through kmap_atomic().
+ * kmap_atomic() takes its fast path for pages that are not in highmem, so a
+ * separate PageHighMem() special case is not needed.
+ */
+static void do_bounce(phys_addr_t orig, void *addr, size_t size,
+		      enum dma_data_direction dir)
+{
+	unsigned long pfn = PFN_DOWN(orig);
+	unsigned int offset = offset_in_page(orig);
+	char *buffer;
+	unsigned int sz;
+
+	while (size) {
+		/* Never cross a page boundary of the original buffer. */
+		sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+		buffer = kmap_atomic(pfn_to_page(pfn));
+		if (dir == DMA_TO_DEVICE)
+			memcpy(addr, buffer + offset, sz);
+		else
+			memcpy(buffer + offset, addr, sz);
+		kunmap_atomic(buffer);
+
+		size -= sz;
+		pfn++;
+		addr += sz;
+		/* Only the first page can start in the middle of a page. */
+		offset = 0;
+	}
+}
+
+/*
+ * Bounce the IOVA range [iova, iova + size) in direction @dir, one
+ * IOVA_ALLOC_SIZE granule at a time. Granules that have no bounce page yet,
+ * or that do not currently shadow an original buffer, are skipped.
+ */
+static void vduse_domain_bounce(struct vduse_iova_domain *domain,
+				dma_addr_t iova, size_t size,
+				enum dma_data_direction dir)
+{
+	struct vduse_bounce_map *map;
+	unsigned int index, offset;
+	void *addr;
+	size_t sz;
+
+	while (size) {
+		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		offset = offset_in_page(iova);
+		/*
+		 * The slot index must be known before orig_phys[] is tested
+		 * below; computing it only inside the 'if' body would make
+		 * the condition read an uninitialized variable.
+		 */
+		index = offset >> IOVA_ALLOC_ORDER;
+		sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
+
+		if (map->bounce_page &&
+		    map->orig_phys[index] != INVALID_PHYS_ADDR) {
+			addr = page_address(map->bounce_page) + offset;
+			do_bounce(map->orig_phys[index], addr, sz, dir);
+		}
+		size -= sz;
+		iova += sz;
+	}
+}
+
+/*
+ * Look up the IOTLB entry overlapping the page that contains @iova and
+ * return the page backing @iova with an extra reference taken. Returns NULL
+ * when no entry overlaps that page.
+ */
+static struct page *
+vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
+{
+	u64 first = iova & PAGE_MASK;
+	struct vhost_iotlb_map *entry;
+	struct page *page = NULL;
+
+	spin_lock(&domain->iotlb_lock);
+	entry = vhost_iotlb_itree_first(domain->iotlb, first,
+					first + PAGE_SIZE - 1);
+	if (entry) {
+		/* Translate the IOVA to a physical address, then to a page. */
+		u64 pa = entry->addr + iova - entry->start;
+
+		page = pfn_to_page(pa >> PAGE_SHIFT);
+		get_page(page);
+	}
+	spin_unlock(&domain->iotlb_lock);
+
+	return page;
+}
+
+/*
+ * Make sure the page containing @iova has a bounce page and return it with
+ * an extra reference taken. Returns NULL if a new page cannot be allocated.
+ *
+ * When this call is the one that installs the page, the page is first filled
+ * from whatever original buffers are currently mapped onto it.
+ */
+static struct page *
+vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
+{
+	u64 start = iova & PAGE_MASK;
+	struct page *page = alloc_page(GFP_KERNEL);
+	struct vduse_bounce_map *map;
+
+	if (!page)
+		return NULL;
+
+	spin_lock(&domain->iotlb_lock);
+	map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+	if (map->bounce_page) {
+		/* A page was installed concurrently; use that one instead. */
+		__free_page(page);
+		page = map->bounce_page;
+		goto out;
+	}
+	map->bounce_page = page;
+
+	/*
+	 * Paired with the barrier in vduse_domain_map_page(): the store to
+	 * bounce_page above must be ordered before the orig_phys[] loads in
+	 * vduse_domain_bounce(), while the mapper orders its orig_phys[]
+	 * store before its bounce_page load. At least one side then sees
+	 * the other's store and performs the bounce.
+	 */
+	smp_mb();
+
+	vduse_domain_bounce(domain, start, PAGE_SIZE, DMA_TO_DEVICE);
+out:
+	/*
+	 * Take the reference on, and return, the pointer sampled under the
+	 * lock. Re-reading map->bounce_page after the unlock could observe a
+	 * value already cleared by vduse_domain_free_bounce_pages().
+	 */
+	get_page(page);
+	spin_unlock(&domain->iotlb_lock);
+
+	return page;
+}
+
+/*
+ * Free every allocated bounce page of the domain and clear its pointer. A
+ * warning is raised for each orig_phys[] slot that is still mapped, but the
+ * page is released regardless.
+ */
+static void
+vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
+{
+	unsigned long nr_pages = domain->bounce_size >> PAGE_SHIFT;
+	unsigned long pfn, slot;
+
+	for (pfn = 0; pfn < nr_pages; pfn++) {
+		struct vduse_bounce_map *map = &domain->bounce_maps[pfn];
+
+		/*
+		 * Warn about slots that are still in use.
+		 * NOTE(review): this only warns; it does not keep the page
+		 * from being freed below. Confirm whether a still-mapped
+		 * page was meant to be skipped instead.
+		 */
+		for (slot = 0; slot < IOVA_MAPS_PER_PAGE; slot++)
+			WARN_ON(map->orig_phys[slot] != INVALID_PHYS_ADDR);
+
+		if (map->bounce_page) {
+			__free_page(map->bounce_page);
+			map->bounce_page = NULL;
+		}
+	}
+}
+
+/*
+ * Tear down the bounce region: remove its IOTLB entry, clear the
+ * 'bounce_map' flag and free all bounce pages. Does nothing if the bounce
+ * region is not currently set up.
+ */
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
+{
+	/* Unlocked early exit; the flag is rechecked under the lock. */
+	if (!domain->bounce_map)
+		return;
+
+	spin_lock(&domain->iotlb_lock);
+	if (domain->bounce_map) {
+		vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
+		domain->bounce_map = 0;
+		vduse_domain_free_bounce_pages(domain);
+	}
+	spin_unlock(&domain->iotlb_lock);
+}
+
+/*
+ * Set up the bounce region on first use by adding an IOTLB entry covering
+ * [0, bounce_size - 1] that is backed by the domain's own file.
+ *
+ * Returns 0 if the region is (or already was) set up, or the error from
+ * vduse_iotlb_add_range().
+ */
+static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
+{
+	/*
+	 * Must start at 0: when the flag is found set on the locked
+	 * re-check below, the function jumps straight to 'unlock' and
+	 * returns 'ret' without ever assigning it.
+	 */
+	int ret = 0;
+
+	/* Unlocked early exit; the flag is rechecked under the lock. */
+	if (domain->bounce_map)
+		return 0;
+
+	spin_lock(&domain->iotlb_lock);
+	if (domain->bounce_map)
+		goto unlock;
+
+	ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
+				    0, VHOST_MAP_RW, domain->file, 0);
+	if (!ret)
+		domain->bounce_map = 1;
+unlock:
+	spin_unlock(&domain->iotlb_lock);
+	return ret;
+}
+
+/*
+ * Allocate an IOVA range of at least @size bytes below @limit from @iovad
+ * and return its start address. Callers in this file treat a return value
+ * of 0 as allocation failure.
+ */
+static dma_addr_t
+vduse_domain_alloc_iova(struct iova_domain *iovad,
+			unsigned long size, unsigned long limit)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long nr = iova_align(iovad, size) >> shift;
+	unsigned long limit_pfn = limit >> shift;
+	unsigned long pfn;
+
+	/* Small requests are rounded up to a power-of-two length. */
+	if (nr < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+		nr = roundup_pow_of_two(nr);
+
+	pfn = alloc_iova_fast(iovad, nr, limit_pfn, true);
+
+	return pfn << shift;
+}
+
+/*
+ * Return the IOVA range starting at @iova and covering @size bytes
+ * (aligned to the allocator granule) to @iovad.
+ */
+static void vduse_domain_free_iova(struct iova_domain *iovad,
+				   dma_addr_t iova, size_t size)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long nr = iova_align(iovad, size) >> shift;
+	unsigned long pfn = iova >> shift;
+
+	free_iova_fast(iovad, pfn, nr);
+}
+
+/*
+ * Map @size bytes at @offset within @page for streaming DMA through the
+ * bounce region.
+ *
+ * An IOVA is allocated from the streaming allocator, the bounce region is
+ * set up on first use, and the original physical address is recorded for
+ * the IOVA range. For DMA_TO_DEVICE and DMA_BIDIRECTIONAL the data is copied
+ * into any bounce pages that already exist.
+ *
+ * Returns the IOVA, or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+				 struct page *page, unsigned long offset,
+				 size_t size, enum dma_data_direction dir,
+				 unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->stream_iovad;
+	phys_addr_t pa = page_to_phys(page) + offset;
+	dma_addr_t iova;
+
+	iova = vduse_domain_alloc_iova(iovad, size, domain->bounce_size - 1);
+	if (!iova)
+		return DMA_MAPPING_ERROR;
+
+	if (vduse_domain_init_bounce_map(domain)) {
+		vduse_domain_free_iova(iovad, iova, size);
+		return DMA_MAPPING_ERROR;
+	}
+
+	vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa);
+
+	/* paired with vduse_domain_alloc_bounce_page() */
+	smp_mb();
+
+	switch (dir) {
+	case DMA_TO_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
+		break;
+	default:
+		break;
+	}
+
+	return iova;
+}
+
+/*
+ * Undo vduse_domain_map_page(): for DMA_FROM_DEVICE and DMA_BIDIRECTIONAL
+ * copy the data back from the bounce pages to the original buffer, then
+ * invalidate the recorded original addresses and release the IOVA range.
+ */
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+			     dma_addr_t dma_addr, size_t size,
+			     enum dma_data_direction dir, unsigned long attrs)
+{
+	switch (dir) {
+	case DMA_FROM_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
+		break;
+	default:
+		break;
+	}
+
+	vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
+	vduse_domain_free_iova(&domain->stream_iovad, dma_addr, size);
+}
+
+/*
+ * Allocate a coherent buffer of @size bytes: an IOVA range from the
+ * consistent allocator plus physically contiguous pages, tied together by an
+ * IOTLB entry backed by the domain's file.
+ *
+ * On success returns the kernel virtual address and stores the IOVA in
+ * @dma_addr. On failure returns NULL, stores DMA_MAPPING_ERROR in @dma_addr
+ * and releases whatever had been allocated.
+ */
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+				  size_t size, dma_addr_t *dma_addr,
+				  gfp_t flag, unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->consistent_iovad;
+	dma_addr_t iova;
+	void *buf;
+	int err;
+
+	iova = vduse_domain_alloc_iova(iovad, size, domain->iova_limit);
+	buf = alloc_pages_exact(size, flag);
+	if (!iova || !buf)
+		goto fail;
+
+	spin_lock(&domain->iotlb_lock);
+	err = vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
+				    virt_to_phys(buf), VHOST_MAP_RW,
+				    domain->file, (u64)iova);
+	spin_unlock(&domain->iotlb_lock);
+	if (err)
+		goto fail;
+
+	*dma_addr = iova;
+	return buf;
+
+fail:
+	*dma_addr = DMA_MAPPING_ERROR;
+	if (buf)
+		free_pages_exact(buf, size);
+	if (iova)
+		vduse_domain_free_iova(iovad, iova, size);
+
+	return NULL;
+}
+
+/*
+ * Release a buffer obtained from vduse_domain_alloc_coherent(): remove its
+ * IOTLB entry (dropping the attached file reference), free the IOVA range
+ * and free the pages. The pages are located through the physical address
+ * stored in the IOTLB entry. Warns and returns if no entry covers the range.
+ */
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+				void *vaddr, dma_addr_t dma_addr,
+				unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->consistent_iovad;
+	struct vhost_iotlb_map *entry;
+	struct vdpa_map_file *ctx;
+	phys_addr_t pa;
+
+	spin_lock(&domain->iotlb_lock);
+	entry = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
+					(u64)dma_addr + size - 1);
+	if (WARN_ON(!entry)) {
+		spin_unlock(&domain->iotlb_lock);
+		return;
+	}
+
+	/* Save the physical address before the entry is freed. */
+	pa = entry->addr;
+	ctx = (struct vdpa_map_file *)entry->opaque;
+	fput(ctx->file);
+	kfree(ctx);
+	vhost_iotlb_map_free(domain->iotlb, entry);
+	spin_unlock(&domain->iotlb_lock);
+
+	vduse_domain_free_iova(iovad, dma_addr, size);
+	free_pages_exact(phys_to_virt(pa), size);
+}
+
+/*
+ * Page-fault handler for mappings of the domain file. The faulting page
+ * offset is interpreted as an IOVA: addresses inside the bounce region are
+ * served by a bounce page, all others by the page the IOTLB maps them to.
+ * Returns VM_FAULT_SIGBUS if there is no domain or no page can be provided.
+ */
+static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
+{
+	struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
+	unsigned long iova = vmf->pgoff << PAGE_SHIFT;
+	struct page *page;
+
+	if (!domain)
+		return VM_FAULT_SIGBUS;
+
+	page = (iova < domain->bounce_size) ?
+		vduse_domain_alloc_bounce_page(domain, iova) :
+		vduse_domain_get_mapping_page(domain, iova);
+	if (!page)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct vduse_domain_mmap_ops = {	/* installed on each vma by vduse_domain_mmap() */
+	.fault = vduse_domain_mmap_fault,	/* pages are provided on fault */
+};
+
+/*
+ * mmap handler of the domain file: attach the domain to the vma and install
+ * the fault-driven vm operations. No pages are mapped here. Always returns 0.
+ */
+static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_private_data = file->private_data;
+	vma->vm_ops = &vduse_domain_mmap_ops;
+	vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
+
+	return 0;
+}
+
+static int vduse_domain_release(struct inode *inode, struct file *file)	/* .release of the domain file: frees the whole domain */
+{
+	struct vduse_iova_domain *domain = file->private_data;
+
+	vduse_domain_reset_bounce_map(domain);	/* removes the bounce IOTLB entry and frees bounce pages */
+	put_iova_domain(&domain->stream_iovad);
+	put_iova_domain(&domain->consistent_iovad);
+	vhost_iotlb_free(domain->iotlb);
+	vfree(domain->bounce_maps);
+	kfree(domain);	/* the domain must not be touched after this point */
+
+	return 0;
+}
+
+static const struct file_operations vduse_domain_fops = {	/* used for the anon inode file created in vduse_domain_create() */
+	.mmap = vduse_domain_mmap,
+	.release = vduse_domain_release,	/* performs the actual domain teardown */
+};
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain)
+{
+	fput(domain->file);	/* drops the creator's file reference; the domain itself is freed in vduse_domain_release() */
+}
+
+struct vduse_iova_domain *
+vduse_domain_create(unsigned long iova_limit, size_t bounce_size)	/* returns a new domain, or NULL on any failure */
+{
+	struct vduse_iova_domain *domain;
+	struct file *file;
+	struct vduse_bounce_map *map;
+	unsigned long i, pfn, bounce_pfns;
+
+	bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;	/* number of pages in the bounce region */
+	if (iova_limit <= bounce_size)	/* the IOVA limit must lie above the bounce region */
+		return NULL;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return NULL;
+
+	domain->iotlb = vhost_iotlb_alloc(0, 0);
+	if (!domain->iotlb)
+		goto err_iotlb;
+
+	domain->iova_limit = iova_limit;
+	domain->bounce_size = PAGE_ALIGN(bounce_size);
+	domain->bounce_maps = vzalloc(bounce_pfns *
+				sizeof(struct vduse_bounce_map));	/* one entry per bounce page; bounce_page pointers start out NULL */
+	if (!domain->bounce_maps)
+		goto err_map;
+
+	for (pfn = 0; pfn < bounce_pfns; pfn++) {
+		map = &domain->bounce_maps[pfn];
+		for (i = 0; i < IOVA_MAPS_PER_PAGE; i++)
+			map->orig_phys[i] = INVALID_PHYS_ADDR;	/* zero is not the "unmapped" marker, so set it explicitly */
+	}
+	file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
+				domain, O_RDWR);	/* the domain becomes file->private_data */
+	if (IS_ERR(file))
+		goto err_file;
+
+	domain->file = file;
+	spin_lock_init(&domain->iotlb_lock);
+	init_iova_domain(&domain->stream_iovad,
+			IOVA_ALLOC_SIZE, IOVA_START_PFN);	/* streaming mappings: IOVA_ALLOC_SIZE granule */
+	init_iova_domain(&domain->consistent_iovad,
+			PAGE_SIZE, bounce_pfns);	/* coherent mappings: page granule; bounce_pfns passed as start pfn */
+
+	return domain;
+err_file:	/* error labels unwind in reverse order of setup */
+	vfree(domain->bounce_maps);
+err_map:
+	vhost_iotlb_free(domain->iotlb);
+err_iotlb:
+	kfree(domain);
+	return NULL;
+}
+
+int vduse_domain_init(void)
+{
+	return iova_cache_get();	/* takes a reference on the IOVA cache; released by vduse_domain_exit() */
+}
+
+void vduse_domain_exit(void)
+{
+	iova_cache_put();	/* releases the reference taken in vduse_domain_init() */
+}
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
new file mode 100644
index 000000000000..faeeedfaa786
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -0,0 +1,75 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MMU-based IOMMU implementation
+ *
+ * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#ifndef _VDUSE_IOVA_DOMAIN_H
+#define _VDUSE_IOVA_DOMAIN_H
+
+#include <linux/iova.h>
+#include <linux/dma-mapping.h>
+#include <linux/vhost_iotlb.h>
+
+#define IOVA_START_PFN 1
+
+#define IOVA_ALLOC_ORDER 12
+#define IOVA_ALLOC_SIZE (1 << IOVA_ALLOC_ORDER)
+
+#define IOVA_MAPS_PER_PAGE (1 << (PAGE_SHIFT - IOVA_ALLOC_ORDER))
+
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
+struct vduse_bounce_map {	/* state of one page of the bounce region */
+	struct page *bounce_page;	/* bounce page, or NULL while none has been allocated */
+	u64 orig_phys[IOVA_MAPS_PER_PAGE];	/* original buffer address per IOVA_ALLOC_SIZE granule, or INVALID_PHYS_ADDR */
+};
+
+struct vduse_iova_domain {
+	struct iova_domain stream_iovad;	/* IOVA allocator used by map_page()/unmap_page() */
+	struct iova_domain consistent_iovad;	/* IOVA allocator used by alloc_coherent()/free_coherent() */
+	struct vduse_bounce_map *bounce_maps;	/* array with one entry per page of the bounce region */
+	size_t bounce_size;	/* page-aligned size of the bounce region in bytes */
+	unsigned long iova_limit;	/* limit passed to the coherent IOVA allocator */
+	int bounce_map;	/* nonzero while the bounce region has an IOTLB entry */
+	struct vhost_iotlb *iotlb;	/* IOVA -> physical address mappings with vdpa_map_file contexts */
+	spinlock_t iotlb_lock;	/* held around iotlb updates and bounce page installation */
+	struct file *file;	/* anon inode file exposing the domain via mmap */
+};
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+			struct vhost_iotlb *iotlb);
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+				struct page *page, unsigned long offset,
+				size_t size, enum dma_data_direction dir,
+				unsigned long attrs);
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+			dma_addr_t dma_addr, size_t size,
+			enum dma_data_direction dir, unsigned long attrs);
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+				size_t size, dma_addr_t *dma_addr,
+				gfp_t flag, unsigned long attrs);
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+				void *vaddr, dma_addr_t dma_addr,
+				unsigned long attrs);
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain);
+
+struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit,
+						size_t bounce_size);
+
+int vduse_domain_init(void);
+
+void vduse_domain_exit(void);
+
+#endif /* _VDUSE_IOVA_DOMAIN_H */