diff mbox series

[1/2] dma-buf: heaps: DMA_HEAP_IOCTL_ALLOC_READ_FILE framework

Message ID 20240711074221.459589-2-link@vivo.com (mailing list archive)
State New
Headers show
Series Introduce DMA_HEAP_IOCTL_ALLOC_AND_READ | expand

Commit Message

Huan Yang July 11, 2024, 7:42 a.m. UTC
Some users may need to load a file into a dma-buf. The current
way is:
  1. allocate a dma-buf, get dma-buf fd
  2. mmap dma-buf fd into vaddr
  3. read(file_fd, vaddr, fsz)
This is too heavy if fsz reaches the GB range.

This patch implements a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
The user needs to provide a file_fd for the file to be loaded into the dma-buf;
if a dma-buf fd is then returned, it is guaranteed to contain the file content.

Note that how the file is read depends on how the user opened file_fd, so both
buffered I/O and direct I/O are supported.

Signed-off-by: Huan Yang <link@vivo.com>
---
 drivers/dma-buf/dma-heap.c    | 525 +++++++++++++++++++++++++++++++++-
 include/linux/dma-heap.h      |  57 +++-
 include/uapi/linux/dma-heap.h |  32 +++
 3 files changed, 611 insertions(+), 3 deletions(-)

Comments

Christian König July 11, 2024, 9 a.m. UTC | #1
Am 11.07.24 um 09:42 schrieb Huan Yang:
> Some user may need load file into dma-buf, current
> way is:
>    1. allocate a dma-buf, get dma-buf fd
>    2. mmap dma-buf fd into vaddr
>    3. read(file_fd, vaddr, fsz)
> This is too heavy if fsz reached to GB.

You need to describe a bit more why that is too heavy. I can only assume 
you need to save memory bandwidth and avoid the extra copy with the CPU.

> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
> User need to offer a file_fd which you want to load into dma-buf, then,
> it promise if you got a dma-buf fd, it will contains the file content.

Interesting idea, that has at least more potential than trying to enable 
direct I/O on mmap()ed DMA-bufs.

The approach with the new IOCTL might not work because it is a very 
specialized use case.

But IIRC there was a copy_file_range callback in the file_operations 
structure you could use for that. I'm just not sure when and how that's 
used with the copy_file_range() system call.

Regards,
Christian.

>
> Notice, file_fd depends on user how to open this file. So, both buffer
> I/O and Direct I/O is supported.
>
> Signed-off-by: Huan Yang <link@vivo.com>
> ---
>   drivers/dma-buf/dma-heap.c    | 525 +++++++++++++++++++++++++++++++++-
>   include/linux/dma-heap.h      |  57 +++-
>   include/uapi/linux/dma-heap.h |  32 +++
>   3 files changed, 611 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
> index 2298ca5e112e..abe17281adb8 100644
> --- a/drivers/dma-buf/dma-heap.c
> +++ b/drivers/dma-buf/dma-heap.c
> @@ -15,9 +15,11 @@
>   #include <linux/list.h>
>   #include <linux/slab.h>
>   #include <linux/nospec.h>
> +#include <linux/highmem.h>
>   #include <linux/uaccess.h>
>   #include <linux/syscalls.h>
>   #include <linux/dma-heap.h>
> +#include <linux/vmalloc.h>
>   #include <uapi/linux/dma-heap.h>
>   
>   #define DEVNAME "dma_heap"
> @@ -43,12 +45,462 @@ struct dma_heap {
>   	struct cdev heap_cdev;
>   };
>   
> +/**
> + * struct dma_heap_file - wrap the file, read task for dma_heap allocate use.
> + * @file:		file to read from.
> + *
> + * @cred:		kthread use, user cred copy to use for the read.
> + *
> + * @max_batch:		maximum batch size to read, if collect match batch,
> + *			trigger read, default 128MB, must below file size.
> + *
> + * @fsz:		file size.
> + *
> + * @direct:		use direct IO?
> + */
> +struct dma_heap_file {
> +	struct file *file;
> +	struct cred *cred;
> +	size_t max_batch;
> +	size_t fsz;
> +	bool direct;
> +};
> +
> +/**
> + * struct dma_heap_file_work - represents a dma_heap file read real work.
> + * @vaddr:		contigous virtual address alloc by vmap, file read need.
> + *
> + * @start_size:		file read start offset, same to @dma_heap_file_task->roffset.
> + *
> + * @need_size:		file read need size, same to @dma_heap_file_task->rsize.
> + *
> + * @heap_file:		file wrapper.
> + *
> + * @list:		child node of @dma_heap_file_control->works.
> + *
> + * @refp:		same @dma_heap_file_task->ref, if end of read, put ref.
> + *
> + * @failp:		if any work io failed, set it true, pointp @dma_heap_file_task->fail.
> + */
> +struct dma_heap_file_work {
> +	void *vaddr;
> +	ssize_t start_size;
> +	ssize_t need_size;
> +	struct dma_heap_file *heap_file;
> +	struct list_head list;
> +	atomic_t *refp;
> +	bool *failp;
> +};
> +
> +/**
> + * struct dma_heap_file_task - represents a dma_heap file read process
> + * @ref:		current file work counter, if zero, allocate and read
> + *			done.
> + *
> + * @roffset:		last read offset, current prepared work' begin file
> + *			start offset.
> + *
> + * @rsize:		current allocated page size use to read, if reach rbatch,
> + *			trigger commit.
> + *
> + * @rbatch:		current prepared work's batch, below @dma_heap_file's
> + *			batch.
> + *
> + * @heap_file:		current dma_heap_file
> + *
> + * @parray:		used for vmap, size is @dma_heap_file's batch's number
> + *			pages.(this is maximum). Due to single thread file read,
> + *			one page array reuse each work prepare is OK.
> + *			Each index in parray is PAGE_SIZE.(vmap need)
> + *
> + * @pindex:		current allocated page filled in @parray's index.
> + *
> + * @fail:		any work failed when file read?
> + *
> + * dma_heap_file_task is the production of file read, will prepare each work
> + * during allocate dma_buf pages, if match current batch, then trigger commit
> + * and prepare next work. After all batch queued, user going on prepare dma_buf
> + * and so on, but before return dma_buf fd, need to wait file read end and
> + * check read result.
> + */
> +struct dma_heap_file_task {
> +	atomic_t ref;
> +	size_t roffset;
> +	size_t rsize;
> +	size_t rbatch;
> +	struct dma_heap_file *heap_file;
> +	struct page **parray;
> +	unsigned int pindex;
> +	bool fail;
> +};
> +
> +/**
> + * struct dma_heap_file_control - global control of dma_heap file read.
> + * @works:		@dma_heap_file_work's list head.
> + *
> + * @lock:		only lock for @works.
> + *
> + * @threadwq:		wait queue for @work_thread, if commit work, @work_thread
> + *			wakeup and read this work's file contains.
> + *
> + * @workwq:		used for main thread wait for file read end, if allocation
> + *			end before file read. @dma_heap_file_task ref effect this.
> + *
> + * @work_thread:	file read kthread. the dma_heap_file_task work's consumer.
> + *
> + * @heap_fwork_cachep:	@dma_heap_file_work's cachep, it's alloc/free frequently.
> + *
> + * @nr_work:		global number of how many work committed.
> + */
> +struct dma_heap_file_control {
> +	struct list_head works;
> +	spinlock_t lock;
> +	wait_queue_head_t threadwq;
> +	wait_queue_head_t workwq;
> +	struct task_struct *work_thread;
> +	struct kmem_cache *heap_fwork_cachep;
> +	atomic_t nr_work;
> +};
> +
> +static struct dma_heap_file_control *heap_fctl;
>   static LIST_HEAD(heap_list);
>   static DEFINE_MUTEX(heap_list_lock);
>   static dev_t dma_heap_devt;
>   static struct class *dma_heap_class;
>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>   
> +/**
> + * map_pages_to_vaddr - map each scatter page into contiguous virtual address.
> + * @heap_ftask:		prepared and need to commit's work.
> + *
> + * Cached pages need to trigger file read, this function map each scatter page
> + * into contiguous virtual address, so that file read can easy use.
> + * Now that we get vaddr page, cached pages can return to original user, so we
> + * will not effect dma-buf export even if file read not end.
> + */
> +static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
> +{
> +	return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
> +		    PAGE_KERNEL);
> +}
> +
> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
> +				struct page *page)
> +{
> +	struct page **array = heap_ftask->parray;
> +	int index = heap_ftask->pindex;
> +	int num = compound_nr(page), i;
> +	unsigned long sz = page_size(page);
> +
> +	heap_ftask->rsize += sz;
> +	for (i = 0; i < num; ++i)
> +		array[index++] = &page[i];
> +	heap_ftask->pindex = index;
> +
> +	return heap_ftask->rsize >= heap_ftask->rbatch;
> +}
> +
> +static struct dma_heap_file_work *
> +init_file_work(struct dma_heap_file_task *heap_ftask)
> +{
> +	struct dma_heap_file_work *heap_fwork;
> +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
> +
> +	if (READ_ONCE(heap_ftask->fail))
> +		return NULL;
> +
> +	heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
> +	if (unlikely(!heap_fwork))
> +		return NULL;
> +
> +	heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
> +	if (unlikely(!heap_fwork->vaddr)) {
> +		kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
> +		return NULL;
> +	}
> +
> +	heap_fwork->heap_file = heap_file;
> +	heap_fwork->start_size = heap_ftask->roffset;
> +	heap_fwork->need_size = heap_ftask->rsize;
> +	heap_fwork->refp = &heap_ftask->ref;
> +	heap_fwork->failp = &heap_ftask->fail;
> +	atomic_inc(&heap_ftask->ref);
> +	return heap_fwork;
> +}
> +
> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
> +{
> +	vunmap(heap_fwork->vaddr);
> +	atomic_dec(heap_fwork->refp);
> +	wake_up(&heap_fctl->workwq);
> +
> +	kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
> +}
> +
> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
> +{
> +	struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
> +	struct page *last = NULL;
> +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
> +	size_t start = heap_ftask->roffset;
> +	struct file *file = heap_file->file;
> +	size_t fsz = heap_file->fsz;
> +
> +	if (unlikely(!heap_fwork))
> +		return -ENOMEM;
> +
> +	/**
> +	 * If file size is not page aligned, direct io can't process the tail.
> +	 * So, if reach to tail, remain the last page use buffer read.
> +	 */
> +	if (heap_file->direct && start + heap_ftask->rsize > fsz) {
> +		heap_fwork->need_size -= PAGE_SIZE;
> +		last = heap_ftask->parray[heap_ftask->pindex - 1];
> +	}
> +
> +	spin_lock(&heap_fctl->lock);
> +	list_add_tail(&heap_fwork->list, &heap_fctl->works);
> +	spin_unlock(&heap_fctl->lock);
> +	atomic_inc(&heap_fctl->nr_work);
> +
> +	wake_up(&heap_fctl->threadwq);
> +
> +	if (last) {
> +		char *buf, *pathp;
> +		ssize_t err;
> +		void *buffer;
> +
> +		buf = kmalloc(PATH_MAX, GFP_KERNEL);
> +		if (unlikely(!buf))
> +			return -ENOMEM;
> +
> +		start = PAGE_ALIGN_DOWN(fsz);
> +
> +		pathp = file_path(file, buf, PATH_MAX);
> +		if (IS_ERR(pathp)) {
> +			kfree(buf);
> +			return PTR_ERR(pathp);
> +		}
> +
> +		buffer = kmap_local_page(last); // use page's kaddr.
> +		err = kernel_read_file_from_path(pathp, start, &buffer,
> +						 fsz - start, &fsz,
> +						 READING_POLICY);
> +		kunmap_local(buffer);
> +		kfree(buf);
> +		if (err < 0) {
> +			pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
> +			       pathp, err, start, fsz, fsz);
> +
> +			return err;
> +		}
> +	}
> +
> +	heap_ftask->roffset += heap_ftask->rsize;
> +	heap_ftask->rsize = 0;
> +	heap_ftask->pindex = 0;
> +	heap_ftask->rbatch = min_t(size_t,
> +				   PAGE_ALIGN(fsz) - heap_ftask->roffset,
> +				   heap_ftask->rbatch);
> +	return 0;
> +}
> +
> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
> +{
> +	wait_event_freezable(heap_fctl->workwq,
> +			     atomic_read(&heap_ftask->ref) == 0);
> +	return heap_ftask->fail;
> +}
> +
> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
> +{
> +	bool fail;
> +
> +	dma_heap_wait_for_file_read(heap_ftask);
> +	fail = heap_ftask->fail;
> +	kvfree(heap_ftask->parray);
> +	kfree(heap_ftask);
> +	return fail;
> +}
> +
> +struct dma_heap_file_task *
> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
> +{
> +	struct dma_heap_file_task *heap_ftask =
> +		kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
> +	if (unlikely(!heap_ftask))
> +		return NULL;
> +
> +	/**
> +	 * Batch is the maximum size which we prepare work will meet.
> +	 * So, direct alloc this number's page array is OK.
> +	 */
> +	heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> PAGE_SHIFT,
> +					    sizeof(struct page *), GFP_KERNEL);
> +	if (unlikely(!heap_ftask->parray))
> +		goto put;
> +
> +	heap_ftask->heap_file = heap_file;
> +	heap_ftask->rbatch = heap_file->max_batch;
> +	return heap_ftask;
> +put:
> +	kfree(heap_ftask);
> +	return NULL;
> +}
> +
> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
> +{
> +	struct dma_heap_file *heap_file = heap_fwork->heap_file;
> +	struct file *file = heap_file->file;
> +	ssize_t start = heap_fwork->start_size;
> +	ssize_t size = heap_fwork->need_size;
> +	void *buffer = heap_fwork->vaddr;
> +	const struct cred *old_cred;
> +	ssize_t err;
> +
> +	// use real task's cred to read this file.
> +	old_cred = override_creds(heap_file->cred);
> +	err = kernel_read_file(file, start, &buffer, size, &heap_file->fsz,
> +			       READING_POLICY);
> +	if (err < 0) {
> +		pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
> +		       err, start, (start + size), heap_file->fsz);
> +		WRITE_ONCE(*heap_fwork->failp, true);
> +	}
> +	// recovery to my cred.
> +	revert_creds(old_cred);
> +}
> +
> +static int dma_heap_file_control_thread(void *data)
> +{
> +	struct dma_heap_file_control *heap_fctl =
> +		(struct dma_heap_file_control *)data;
> +	struct dma_heap_file_work *worker, *tmp;
> +	int nr_work;
> +
> +	LIST_HEAD(pages);
> +	LIST_HEAD(workers);
> +
> +	while (true) {
> +		wait_event_freezable(heap_fctl->threadwq,
> +				     atomic_read(&heap_fctl->nr_work) > 0);
> +recheck:
> +		spin_lock(&heap_fctl->lock);
> +		list_splice_init(&heap_fctl->works, &workers);
> +		spin_unlock(&heap_fctl->lock);
> +
> +		if (unlikely(kthread_should_stop())) {
> +			list_for_each_entry_safe(worker, tmp, &workers, list) {
> +				list_del(&worker->list);
> +				destroy_file_work(worker);
> +			}
> +			break;
> +		}
> +
> +		nr_work = 0;
> +		list_for_each_entry_safe(worker, tmp, &workers, list) {
> +			++nr_work;
> +			list_del(&worker->list);
> +			__work_this_io(worker);
> +
> +			destroy_file_work(worker);
> +		}
> +		atomic_sub(nr_work, &heap_fctl->nr_work);
> +
> +		if (atomic_read(&heap_fctl->nr_work) > 0)
> +			goto recheck;
> +	}
> +	return 0;
> +}
> +
> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
> +{
> +	return heap_file->fsz;
> +}
> +
> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, int file_fd,
> +				 size_t batch)
> +{
> +	struct file *file;
> +	size_t fsz;
> +	int ret;
> +
> +	file = fget(file_fd);
> +	if (!file)
> +		return -EINVAL;
> +
> +	fsz = i_size_read(file_inode(file));
> +	if (fsz < batch) {
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	/**
> +	 * Selinux block our read, but actually we are reading the stand-in
> +	 * for this file.
> +	 * So save current's cred and when going to read, override mine, and
> +	 * end of read, revert.
> +	 */
> +	heap_file->cred = prepare_kernel_cred(current);
> +	if (unlikely(!heap_file->cred)) {
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +
> +	heap_file->file = file;
> +	heap_file->max_batch = batch;
> +	heap_file->fsz = fsz;
> +
> +	heap_file->direct = file->f_flags & O_DIRECT;
> +
> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
> +	if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
> +		pr_warn("alloc read file better to use O_DIRECT to read larget file\n");
> +
> +	return 0;
> +
> +err:
> +	fput(file);
> +	return ret;
> +}
> +
> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
> +{
> +	fput(heap_file->file);
> +	put_cred(heap_file->cred);
> +}
> +
> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, int file_fd,
> +					   size_t batch, unsigned int fd_flags,
> +					   unsigned int heap_flags)
> +{
> +	struct dma_buf *dmabuf;
> +	int fd;
> +	struct dma_heap_file heap_file;
> +
> +	fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
> +	if (fd)
> +		goto error_file;
> +
> +	dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
> +					       heap_flags);
> +	if (IS_ERR(dmabuf)) {
> +		fd = PTR_ERR(dmabuf);
> +		goto error;
> +	}
> +
> +	fd = dma_buf_fd(dmabuf, fd_flags);
> +	if (fd < 0) {
> +		dma_buf_put(dmabuf);
> +		/* just return, as put will call release and that will free */
> +	}
> +
> +error:
> +	destroy_dma_heap_file(&heap_file);
> +error_file:
> +	return fd;
> +}
> +
>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>   				 u32 fd_flags,
>   				 u64 heap_flags)
> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, struct file *file)
>   	return 0;
>   }
>   
> +static long dma_heap_ioctl_allocate_read_file(struct file *file, void *data)
> +{
> +	struct dma_heap_allocation_file_data *heap_allocation_file = data;
> +	struct dma_heap *heap = file->private_data;
> +	int fd;
> +
> +	if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
> +		return -EINVAL;
> +
> +	if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
> +		return -EINVAL;
> +
> +	if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
> +		return -EINVAL;
> +
> +	if (!heap->ops->allocate_read_file)
> +		return -EINVAL;
> +
> +	fd = dma_heap_buffer_alloc_read_file(
> +		heap, heap_allocation_file->file_fd,
> +		heap_allocation_file->batch ?
> +			PAGE_ALIGN(heap_allocation_file->batch) :
> +			DEFAULT_ADI_BATCH,
> +		heap_allocation_file->fd_flags,
> +		heap_allocation_file->heap_flags);
> +	if (fd < 0)
> +		return fd;
> +
> +	heap_allocation_file->fd = fd;
> +	return 0;
> +}
> +
>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>   {
>   	struct dma_heap_allocation_data *heap_allocation = data;
> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data)
>   
>   static unsigned int dma_heap_ioctl_cmds[] = {
>   	DMA_HEAP_IOCTL_ALLOC,
> +	DMA_HEAP_IOCTL_ALLOC_AND_READ,
>   };
>   
>   static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>   	case DMA_HEAP_IOCTL_ALLOC:
>   		ret = dma_heap_ioctl_allocate(file, kdata);
>   		break;
> +	case DMA_HEAP_IOCTL_ALLOC_AND_READ:
> +		ret = dma_heap_ioctl_allocate_read_file(file, kdata);
> +		break;
>   	default:
>   		ret = -ENOTTY;
>   		goto err;
> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>   
>   	dma_heap_class = class_create(DEVNAME);
>   	if (IS_ERR(dma_heap_class)) {
> -		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
> -		return PTR_ERR(dma_heap_class);
> +		ret = PTR_ERR(dma_heap_class);
> +		goto fail_class;
>   	}
>   	dma_heap_class->devnode = dma_heap_devnode;
>   
> +	heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
> +	if (unlikely(!heap_fctl)) {
> +		ret =  -ENOMEM;
> +		goto fail_alloc;
> +	}
> +
> +	INIT_LIST_HEAD(&heap_fctl->works);
> +	init_waitqueue_head(&heap_fctl->threadwq);
> +	init_waitqueue_head(&heap_fctl->workwq);
> +
> +	heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
> +					     heap_fctl, "heap_fwork_t");
> +	if (IS_ERR(heap_fctl->work_thread)) {
> +		ret = -ENOMEM;
> +		goto fail_thread;
> +	}
> +
> +	heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
> +	if (unlikely(!heap_fctl->heap_fwork_cachep)) {
> +		ret = -ENOMEM;
> +		goto fail_cache;
> +	}
> +
>   	return 0;
> +
> +fail_cache:
> +	kthread_stop(heap_fctl->work_thread);
> +fail_thread:
> +	kfree(heap_fctl);
> +fail_alloc:
> +	class_destroy(dma_heap_class);
> +fail_class:
> +	unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
> +	return ret;
>   }
>   subsys_initcall(dma_heap_init);
> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
> index 064bad725061..9c25383f816c 100644
> --- a/include/linux/dma-heap.h
> +++ b/include/linux/dma-heap.h
> @@ -12,12 +12,17 @@
>   #include <linux/cdev.h>
>   #include <linux/types.h>
>   
> +#define DEFAULT_ADI_BATCH (128 << 20)
> +
>   struct dma_heap;
> +struct dma_heap_file_task;
> +struct dma_heap_file;
>   
>   /**
>    * struct dma_heap_ops - ops to operate on a given heap
>    * @allocate:		allocate dmabuf and return struct dma_buf ptr
> - *
> + * @allocate_read_file: allocate dmabuf and read file, then return struct
> + * dma_buf ptr.
>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>    */
>   struct dma_heap_ops {
> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>   				    unsigned long len,
>   				    u32 fd_flags,
>   				    u64 heap_flags);
> +
> +	struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
> +					      struct dma_heap_file *heap_file,
> +					      u32 fd_flags,
> +					      u64 heap_flags);
>   };
>   
>   /**
> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap *heap);
>    */
>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
>   
> +/**
> + * dma_heap_destroy_file_read - waits for a file read to complete then destroy it
> + * Returns: true if the file read failed, false otherwise
> + */
> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask);
> +
> +/**
> + * dma_heap_wait_for_file_read - waits for a file read to complete
> + * Returns: true if the file read failed, false otherwise
> + */
> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask);
> +
> +/**
> + * dma_heap_alloc_file_read - Declare a task to read file when allocate pages.
> + * @heap_file:		target file to read
> + *
> + * Return NULL if failed, otherwise return a struct pointer.
> + */
> +struct dma_heap_file_task *
> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
> +
> +/**
> + * dma_heap_prepare_file_read - cache each allocated page until we meet this batch.
> + * @heap_ftask:		prepared and need to commit's work.
> + * @page:		current allocated page. don't care which order.
> + *
> + * Returns true if reach to batch, false so go on prepare.
> + */
> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
> +				struct page *page);
> +
> +/**
> + * dma_heap_commit_file_read -  prepare collect enough memory, going to trigger IO
> + * @heap_ftask:			info that current IO needs
> + *
> + * This commit will also check if reach to tail read.
> + * For direct I/O submissions, it is necessary to pay attention to file reads
> + * that are not page-aligned. For the unaligned portion of the read, buffer IO
> + * needs to be triggered.
> + * Returns:
> + *   0 if all right, -errno if something wrong
> + */
> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
> +
>   #endif /* _DMA_HEAPS_H */
> diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
> index a4cf716a49fa..8c20e8b74eed 100644
> --- a/include/uapi/linux/dma-heap.h
> +++ b/include/uapi/linux/dma-heap.h
> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>   	__u64 heap_flags;
>   };
>   
> +/**
> + * struct dma_heap_allocation_file_data - metadata passed from userspace for
> + *                                      allocations and read file
> + * @fd:			will be populated with a fd which provides the
> + *			handle to the allocated dma-buf
> + * @file_fd:		file descriptor to read from(suggested to use O_DIRECT open file)
> + * @batch:		how many memory alloced then file read(bytes), default 128MB
> + *			will auto aligned to PAGE_SIZE
> + * @fd_flags:		file descriptor flags used when allocating
> + * @heap_flags:		flags passed to heap
> + *
> + * Provided by userspace as an argument to the ioctl
> + */
> +struct dma_heap_allocation_file_data {
> +	__u32 fd;
> +	__u32 file_fd;
> +	__u32 batch;
> +	__u32 fd_flags;
> +	__u64 heap_flags;
> +};
> +
>   #define DMA_HEAP_IOC_MAGIC		'H'
>   
>   /**
> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>   #define DMA_HEAP_IOCTL_ALLOC	_IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>   				      struct dma_heap_allocation_data)
>   
> +/**
> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool and both
> + *					read file when allocate memory.
> + *
> + * Takes a dma_heap_allocation_file_data struct and returns it with the fd field
> + * populated with the dmabuf handle of the allocation. When return, the dma-buf
> + * content is read from file.
> + */
> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
> +	_IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct dma_heap_allocation_file_data)
> +
>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
Huan Yang July 11, 2024, 9:18 a.m. UTC | #2
Hi Christian,

Thanks for your reply.

在 2024/7/11 17:00, Christian König 写道:
> Am 11.07.24 um 09:42 schrieb Huan Yang:
>> Some user may need load file into dma-buf, current
>> way is:
>>    1. allocate a dma-buf, get dma-buf fd
>>    2. mmap dma-buf fd into vaddr
>>    3. read(file_fd, vaddr, fsz)
>> This is too heavy if fsz reached to GB.
>
> You need to describe a bit more why that is to heavy. I can only 
> assume you need to save memory bandwidth and avoid the extra copy with 
> the CPU.

Sorry for the oversimplified explanation. But, yes, you're right, we 
want to avoid this.

As we are dealing with embedded devices, the available memory and 
computing power for users are usually limited. (The maximum available 
memory is currently 24GB, typically ranging from 8-12GB.)

In addition, CPU computing power is usually in short supply, due to 
limited battery capacity and limited heat dissipation capabilities.

So, we hope to avoid ineffective paths as much as possible.

>
>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>> User need to offer a file_fd which you want to load into dma-buf, then,
>> it promise if you got a dma-buf fd, it will contains the file content.
>
> Interesting idea, that has at least more potential than trying to 
> enable direct I/O on mmap()ed DMA-bufs.
>
> The approach with the new IOCTL might not work because it is a very 
> specialized use case.

Thank you for your advice. Maybe the "read file" behavior could be 
attached to the existing allocation ioctl?

I am currently creating a new ioctl to make it explicit to the user that 
memory is being allocated and read, and I am also unsure whether it is 
appropriate to add additional parameters to the existing allocate behavior.

Please give me more suggestions. Thanks.

>
> But IIRC there was a copy_file_range callback in the file_operations 
> structure you could use for that. I'm just not sure when and how 
> that's used with the copy_file_range() system call.

Sorry, I'm not familiar with this, but I will look into it. However, 
this type of callback is not currently implemented for the exported 
dma_buf file. Does that mean I need to implement the callback for it?

>
> Regards,
> Christian.
>
>>
>> Notice, file_fd depends on user how to open this file. So, both buffer
>> I/O and Direct I/O is supported.
>>
>> Signed-off-by: Huan Yang <link@vivo.com>
>> ---
>>   drivers/dma-buf/dma-heap.c    | 525 +++++++++++++++++++++++++++++++++-
>>   include/linux/dma-heap.h      |  57 +++-
>>   include/uapi/linux/dma-heap.h |  32 +++
>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>> index 2298ca5e112e..abe17281adb8 100644
>> --- a/drivers/dma-buf/dma-heap.c
>> +++ b/drivers/dma-buf/dma-heap.c
>> @@ -15,9 +15,11 @@
>>   #include <linux/list.h>
>>   #include <linux/slab.h>
>>   #include <linux/nospec.h>
>> +#include <linux/highmem.h>
>>   #include <linux/uaccess.h>
>>   #include <linux/syscalls.h>
>>   #include <linux/dma-heap.h>
>> +#include <linux/vmalloc.h>
>>   #include <uapi/linux/dma-heap.h>
>>     #define DEVNAME "dma_heap"
>> @@ -43,12 +45,462 @@ struct dma_heap {
>>       struct cdev heap_cdev;
>>   };
>>   +/**
>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>> allocate use.
>> + * @file:        file to read from.
>> + *
>> + * @cred:        kthread use, user cred copy to use for the read.
>> + *
>> + * @max_batch:        maximum batch size to read, if collect match 
>> batch,
>> + *            trigger read, default 128MB, must below file size.
>> + *
>> + * @fsz:        file size.
>> + *
>> + * @direct:        use direct IO?
>> + */
>> +struct dma_heap_file {
>> +    struct file *file;
>> +    struct cred *cred;
>> +    size_t max_batch;
>> +    size_t fsz;
>> +    bool direct;
>> +};
>> +
>> +/**
>> + * struct dma_heap_file_work - represents a dma_heap file read real 
>> work.
>> + * @vaddr:        contigous virtual address alloc by vmap, file read 
>> need.
>> + *
>> + * @start_size:        file read start offset, same to 
>> @dma_heap_file_task->roffset.
>> + *
>> + * @need_size:        file read need size, same to 
>> @dma_heap_file_task->rsize.
>> + *
>> + * @heap_file:        file wrapper.
>> + *
>> + * @list:        child node of @dma_heap_file_control->works.
>> + *
>> + * @refp:        same @dma_heap_file_task->ref, if end of read, put 
>> ref.
>> + *
>> + * @failp:        if any work io failed, set it true, pointp 
>> @dma_heap_file_task->fail.
>> + */
>> +struct dma_heap_file_work {
>> +    void *vaddr;
>> +    ssize_t start_size;
>> +    ssize_t need_size;
>> +    struct dma_heap_file *heap_file;
>> +    struct list_head list;
>> +    atomic_t *refp;
>> +    bool *failp;
>> +};
>> +
>> +/**
>> + * struct dma_heap_file_task - represents a dma_heap file read process
>> + * @ref:        current file work counter, if zero, allocate and read
>> + *            done.
>> + *
>> + * @roffset:        last read offset, current prepared work' begin file
>> + *            start offset.
>> + *
>> + * @rsize:        current allocated page size use to read, if reach 
>> rbatch,
>> + *            trigger commit.
>> + *
>> + * @rbatch:        current prepared work's batch, below 
>> @dma_heap_file's
>> + *            batch.
>> + *
>> + * @heap_file:        current dma_heap_file
>> + *
>> + * @parray:        used for vmap, size is @dma_heap_file's batch's 
>> number
>> + *            pages.(this is maximum). Due to single thread file read,
>> + *            one page array reuse each work prepare is OK.
>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>> + *
>> + * @pindex:        current allocated page filled in @parray's index.
>> + *
>> + * @fail:        any work failed when file read?
>> + *
>> + * dma_heap_file_task is the production of file read, will prepare 
>> each work
>> + * during allocate dma_buf pages, if match current batch, then 
>> trigger commit
>> + * and prepare next work. After all batch queued, user going on 
>> prepare dma_buf
>> + * and so on, but before return dma_buf fd, need to wait file read 
>> end and
>> + * check read result.
>> + */
>> +struct dma_heap_file_task {
>> +    atomic_t ref;
>> +    size_t roffset;
>> +    size_t rsize;
>> +    size_t rbatch;
>> +    struct dma_heap_file *heap_file;
>> +    struct page **parray;
>> +    unsigned int pindex;
>> +    bool fail;
>> +};
>> +
>> +/**
>> + * struct dma_heap_file_control - global control of dma_heap file read.
>> + * @works:        @dma_heap_file_work's list head.
>> + *
>> + * @lock:        only lock for @works.
>> + *
>> + * @threadwq:        wait queue for @work_thread, if commit work, 
>> @work_thread
>> + *            wakeup and read this work's file contains.
>> + *
>> + * @workwq:        used for main thread wait for file read end, if 
>> allocation
>> + *            end before file read. @dma_heap_file_task ref effect 
>> this.
>> + *
>> + * @work_thread:    file read kthread. the dma_heap_file_task work's 
>> consumer.
>> + *
>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>> alloc/free frequently.
>> + *
>> + * @nr_work:        global number of how many work committed.
>> + */
>> +struct dma_heap_file_control {
>> +    struct list_head works;
>> +    spinlock_t lock;
>> +    wait_queue_head_t threadwq;
>> +    wait_queue_head_t workwq;
>> +    struct task_struct *work_thread;
>> +    struct kmem_cache *heap_fwork_cachep;
>> +    atomic_t nr_work;
>> +};
>> +
>> +static struct dma_heap_file_control *heap_fctl;
>>   static LIST_HEAD(heap_list);
>>   static DEFINE_MUTEX(heap_list_lock);
>>   static dev_t dma_heap_devt;
>>   static struct class *dma_heap_class;
>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>   +/**
>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>> virtual address.
>> + * @heap_ftask:        prepared and need to commit's work.
>> + *
>> + * Cached pages need to trigger file read, this function map each 
>> scatter page
>> + * into contiguous virtual address, so that file read can easy use.
>> + * Now that we get vaddr page, cached pages can return to original 
>> user, so we
>> + * will not effect dma-buf export even if file read not end.
>> + */
>> +static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
>> +{
>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>> +            PAGE_KERNEL);
>> +}
>> +
>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>> +                struct page *page)
>> +{
>> +    struct page **array = heap_ftask->parray;
>> +    int index = heap_ftask->pindex;
>> +    int num = compound_nr(page), i;
>> +    unsigned long sz = page_size(page);
>> +
>> +    heap_ftask->rsize += sz;
>> +    for (i = 0; i < num; ++i)
>> +        array[index++] = &page[i];
>> +    heap_ftask->pindex = index;
>> +
>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>> +}
>> +
>> +static struct dma_heap_file_work *
>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>> +{
>> +    struct dma_heap_file_work *heap_fwork;
>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>> +
>> +    if (READ_ONCE(heap_ftask->fail))
>> +        return NULL;
>> +
>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>> GFP_KERNEL);
>> +    if (unlikely(!heap_fwork))
>> +        return NULL;
>> +
>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>> +    if (unlikely(!heap_fwork->vaddr)) {
>> +        kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>> +        return NULL;
>> +    }
>> +
>> +    heap_fwork->heap_file = heap_file;
>> +    heap_fwork->start_size = heap_ftask->roffset;
>> +    heap_fwork->need_size = heap_ftask->rsize;
>> +    heap_fwork->refp = &heap_ftask->ref;
>> +    heap_fwork->failp = &heap_ftask->fail;
>> +    atomic_inc(&heap_ftask->ref);
>> +    return heap_fwork;
>> +}
>> +
>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>> +{
>> +    vunmap(heap_fwork->vaddr);
>> +    atomic_dec(heap_fwork->refp);
>> +    wake_up(&heap_fctl->workwq);
>> +
>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>> +}
>> +
>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>> +{
>> +    struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
>> +    struct page *last = NULL;
>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>> +    size_t start = heap_ftask->roffset;
>> +    struct file *file = heap_file->file;
>> +    size_t fsz = heap_file->fsz;
>> +
>> +    if (unlikely(!heap_fwork))
>> +        return -ENOMEM;
>> +
>> +    /**
>> +     * If file size is not page aligned, direct io can't process the 
>> tail.
>> +     * So, if reach to tail, remain the last page use buffer read.
>> +     */
>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>> +        heap_fwork->need_size -= PAGE_SIZE;
>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>> +    }
>> +
>> +    spin_lock(&heap_fctl->lock);
>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>> +    spin_unlock(&heap_fctl->lock);
>> +    atomic_inc(&heap_fctl->nr_work);
>> +
>> +    wake_up(&heap_fctl->threadwq);
>> +
>> +    if (last) {
>> +        char *buf, *pathp;
>> +        ssize_t err;
>> +        void *buffer;
>> +
>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>> +        if (unlikely(!buf))
>> +            return -ENOMEM;
>> +
>> +        start = PAGE_ALIGN_DOWN(fsz);
>> +
>> +        pathp = file_path(file, buf, PATH_MAX);
>> +        if (IS_ERR(pathp)) {
>> +            kfree(buf);
>> +            return PTR_ERR(pathp);
>> +        }
>> +
>> +        buffer = kmap_local_page(last); // use page's kaddr.
>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>> +                         fsz - start, &fsz,
>> +                         READING_POLICY);
>> +        kunmap_local(buffer);
>> +        kfree(buf);
>> +        if (err < 0) {
>> +            pr_err("failed to use buffer kernel_read_file %s, 
>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>> +                   pathp, err, start, fsz, fsz);
>> +
>> +            return err;
>> +        }
>> +    }
>> +
>> +    heap_ftask->roffset += heap_ftask->rsize;
>> +    heap_ftask->rsize = 0;
>> +    heap_ftask->pindex = 0;
>> +    heap_ftask->rbatch = min_t(size_t,
>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>> +                   heap_ftask->rbatch);
>> +    return 0;
>> +}
>> +
>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
>> +{
>> +    wait_event_freezable(heap_fctl->workwq,
>> +                 atomic_read(&heap_ftask->ref) == 0);
>> +    return heap_ftask->fail;
>> +}
>> +
>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
>> +{
>> +    bool fail;
>> +
>> +    dma_heap_wait_for_file_read(heap_ftask);
>> +    fail = heap_ftask->fail;
>> +    kvfree(heap_ftask->parray);
>> +    kfree(heap_ftask);
>> +    return fail;
>> +}
>> +
>> +struct dma_heap_file_task *
>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>> +{
>> +    struct dma_heap_file_task *heap_ftask =
>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>> +    if (unlikely(!heap_ftask))
>> +        return NULL;
>> +
>> +    /**
>> +     * Batch is the maximum size which we prepare work will meet.
>> +     * So, direct alloc this number's page array is OK.
>> +     */
>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>> PAGE_SHIFT,
>> +                        sizeof(struct page *), GFP_KERNEL);
>> +    if (unlikely(!heap_ftask->parray))
>> +        goto put;
>> +
>> +    heap_ftask->heap_file = heap_file;
>> +    heap_ftask->rbatch = heap_file->max_batch;
>> +    return heap_ftask;
>> +put:
>> +    kfree(heap_ftask);
>> +    return NULL;
>> +}
>> +
>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>> +{
>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>> +    struct file *file = heap_file->file;
>> +    ssize_t start = heap_fwork->start_size;
>> +    ssize_t size = heap_fwork->need_size;
>> +    void *buffer = heap_fwork->vaddr;
>> +    const struct cred *old_cred;
>> +    ssize_t err;
>> +
>> +    // use real task's cred to read this file.
>> +    old_cred = override_creds(heap_file->cred);
>> +    err = kernel_read_file(file, start, &buffer, size, &heap_file->fsz,
>> +                   READING_POLICY);
>> +    if (err < 0) {
>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
>> +               err, start, (start + size), heap_file->fsz);
>> +        WRITE_ONCE(*heap_fwork->failp, true);
>> +    }
>> +    // recovery to my cred.
>> +    revert_creds(old_cred);
>> +}
>> +
>> +static int dma_heap_file_control_thread(void *data)
>> +{
>> +    struct dma_heap_file_control *heap_fctl =
>> +        (struct dma_heap_file_control *)data;
>> +    struct dma_heap_file_work *worker, *tmp;
>> +    int nr_work;
>> +
>> +    LIST_HEAD(pages);
>> +    LIST_HEAD(workers);
>> +
>> +    while (true) {
>> +        wait_event_freezable(heap_fctl->threadwq,
>> +                     atomic_read(&heap_fctl->nr_work) > 0);
>> +recheck:
>> +        spin_lock(&heap_fctl->lock);
>> +        list_splice_init(&heap_fctl->works, &workers);
>> +        spin_unlock(&heap_fctl->lock);
>> +
>> +        if (unlikely(kthread_should_stop())) {
>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>> +                list_del(&worker->list);
>> +                destroy_file_work(worker);
>> +            }
>> +            break;
>> +        }
>> +
>> +        nr_work = 0;
>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>> +            ++nr_work;
>> +            list_del(&worker->list);
>> +            __work_this_io(worker);
>> +
>> +            destroy_file_work(worker);
>> +        }
>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>> +
>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>> +            goto recheck;
>> +    }
>> +    return 0;
>> +}
>> +
>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>> +{
>> +    return heap_file->fsz;
>> +}
>> +
>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, 
>> int file_fd,
>> +                 size_t batch)
>> +{
>> +    struct file *file;
>> +    size_t fsz;
>> +    int ret;
>> +
>> +    file = fget(file_fd);
>> +    if (!file)
>> +        return -EINVAL;
>> +
>> +    fsz = i_size_read(file_inode(file));
>> +    if (fsz < batch) {
>> +        ret = -EINVAL;
>> +        goto err;
>> +    }
>> +
>> +    /**
>> +     * Selinux block our read, but actually we are reading the stand-in
>> +     * for this file.
>> +     * So save current's cred and when going to read, override mine, 
>> and
>> +     * end of read, revert.
>> +     */
>> +    heap_file->cred = prepare_kernel_cred(current);
>> +    if (unlikely(!heap_file->cred)) {
>> +        ret = -ENOMEM;
>> +        goto err;
>> +    }
>> +
>> +    heap_file->file = file;
>> +    heap_file->max_batch = batch;
>> +    heap_file->fsz = fsz;
>> +
>> +    heap_file->direct = file->f_flags & O_DIRECT;
>> +
>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>> +    if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>> larget file\n");
>> +
>> +    return 0;
>> +
>> +err:
>> +    fput(file);
>> +    return ret;
>> +}
>> +
>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>> +{
>> +    fput(heap_file->file);
>> +    put_cred(heap_file->cred);
>> +}
>> +
>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, 
>> int file_fd,
>> +                       size_t batch, unsigned int fd_flags,
>> +                       unsigned int heap_flags)
>> +{
>> +    struct dma_buf *dmabuf;
>> +    int fd;
>> +    struct dma_heap_file heap_file;
>> +
>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>> +    if (fd)
>> +        goto error_file;
>> +
>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
>> +                           heap_flags);
>> +    if (IS_ERR(dmabuf)) {
>> +        fd = PTR_ERR(dmabuf);
>> +        goto error;
>> +    }
>> +
>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>> +    if (fd < 0) {
>> +        dma_buf_put(dmabuf);
>> +        /* just return, as put will call release and that will free */
>> +    }
>> +
>> +error:
>> +    destroy_dma_heap_file(&heap_file);
>> +error_file:
>> +    return fd;
>> +}
>> +
>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>                    u32 fd_flags,
>>                    u64 heap_flags)
>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>> struct file *file)
>>       return 0;
>>   }
>>   +static long dma_heap_ioctl_allocate_read_file(struct file *file, 
>> void *data)
>> +{
>> +    struct dma_heap_allocation_file_data *heap_allocation_file = data;
>> +    struct dma_heap *heap = file->private_data;
>> +    int fd;
>> +
>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>> +        return -EINVAL;
>> +
>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>> +        return -EINVAL;
>> +
>> +    if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
>> +        return -EINVAL;
>> +
>> +    if (!heap->ops->allocate_read_file)
>> +        return -EINVAL;
>> +
>> +    fd = dma_heap_buffer_alloc_read_file(
>> +        heap, heap_allocation_file->file_fd,
>> +        heap_allocation_file->batch ?
>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>> +            DEFAULT_ADI_BATCH,
>> +        heap_allocation_file->fd_flags,
>> +        heap_allocation_file->heap_flags);
>> +    if (fd < 0)
>> +        return fd;
>> +
>> +    heap_allocation_file->fd = fd;
>> +    return 0;
>> +}
>> +
>>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>   {
>>       struct dma_heap_allocation_data *heap_allocation = data;
>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file 
>> *file, void *data)
>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>       DMA_HEAP_IOCTL_ALLOC,
>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>   };
>>     static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>> unsigned int ucmd,
>>       case DMA_HEAP_IOCTL_ALLOC:
>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>           break;
>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>> +        break;
>>       default:
>>           ret = -ENOTTY;
>>           goto err;
>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>         dma_heap_class = class_create(DEVNAME);
>>       if (IS_ERR(dma_heap_class)) {
>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>> -        return PTR_ERR(dma_heap_class);
>> +        ret = PTR_ERR(dma_heap_class);
>> +        goto fail_class;
>>       }
>>       dma_heap_class->devnode = dma_heap_devnode;
>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>> +    if (unlikely(!heap_fctl)) {
>> +        ret =  -ENOMEM;
>> +        goto fail_alloc;
>> +    }
>> +
>> +    INIT_LIST_HEAD(&heap_fctl->works);
>> +    init_waitqueue_head(&heap_fctl->threadwq);
>> +    init_waitqueue_head(&heap_fctl->workwq);
>> +
>> +    heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
>> +                         heap_fctl, "heap_fwork_t");
>> +    if (IS_ERR(heap_fctl->work_thread)) {
>> +        ret = -ENOMEM;
>> +        goto fail_thread;
>> +    }
>> +
>> +    heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>> +        ret = -ENOMEM;
>> +        goto fail_cache;
>> +    }
>> +
>>       return 0;
>> +
>> +fail_cache:
>> +    kthread_stop(heap_fctl->work_thread);
>> +fail_thread:
>> +    kfree(heap_fctl);
>> +fail_alloc:
>> +    class_destroy(dma_heap_class);
>> +fail_class:
>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>> +    return ret;
>>   }
>>   subsys_initcall(dma_heap_init);
>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>> index 064bad725061..9c25383f816c 100644
>> --- a/include/linux/dma-heap.h
>> +++ b/include/linux/dma-heap.h
>> @@ -12,12 +12,17 @@
>>   #include <linux/cdev.h>
>>   #include <linux/types.h>
>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>> +
>>   struct dma_heap;
>> +struct dma_heap_file_task;
>> +struct dma_heap_file;
>>     /**
>>    * struct dma_heap_ops - ops to operate on a given heap
>>    * @allocate:        allocate dmabuf and return struct dma_buf ptr
>> - *
>> + * @allocate_read_file: allocate dmabuf and read file, then return 
>> struct
>> + * dma_buf ptr.
>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>    */
>>   struct dma_heap_ops {
>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>                       unsigned long len,
>>                       u32 fd_flags,
>>                       u64 heap_flags);
>> +
>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>> +                          struct dma_heap_file *heap_file,
>> +                          u32 fd_flags,
>> +                          u64 heap_flags);
>>   };
>>     /**
>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap *heap);
>>    */
>>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>> *exp_info);
>>   +/**
>> + * dma_heap_destroy_file_read - waits for a file read to complete 
>> then destroy it
>> + * Returns: true if the file read failed, false otherwise
>> + */
>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask);
>> +
>> +/**
>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>> + * Returns: true if the file read failed, false otherwise
>> + */
>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>> *heap_ftask);
>> +
>> +/**
>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>> allocate pages.
>> + * @heap_file:        target file to read
>> + *
>> + * Return NULL if failed, otherwise return a struct pointer.
>> + */
>> +struct dma_heap_file_task *
>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>> +
>> +/**
>> + * dma_heap_prepare_file_read - cache each allocated page until we 
>> meet this batch.
>> + * @heap_ftask:        prepared and need to commit's work.
>> + * @page:        current allocated page. don't care which order.
>> + *
>> + * Returns true if reach to batch, false so go on prepare.
>> + */
>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>> +                struct page *page);
>> +
>> +/**
>> + * dma_heap_commit_file_read -  prepare collect enough memory, going 
>> to trigger IO
>> + * @heap_ftask:            info that current IO needs
>> + *
>> + * This commit will also check if reach to tail read.
>> + * For direct I/O submissions, it is necessary to pay attention to 
>> file reads
>> + * that are not page-aligned. For the unaligned portion of the read, 
>> buffer IO
>> + * needs to be triggered.
>> + * Returns:
>> + *   0 if all right, -errno if something wrong
>> + */
>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>> +
>>   #endif /* _DMA_HEAPS_H */
>> diff --git a/include/uapi/linux/dma-heap.h 
>> b/include/uapi/linux/dma-heap.h
>> index a4cf716a49fa..8c20e8b74eed 100644
>> --- a/include/uapi/linux/dma-heap.h
>> +++ b/include/uapi/linux/dma-heap.h
>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>       __u64 heap_flags;
>>   };
>>   +/**
>> + * struct dma_heap_allocation_file_data - metadata passed from 
>> userspace for
>> + *                                      allocations and read file
>> + * @fd:            will be populated with a fd which provides the
>> + *            handle to the allocated dma-buf
>> + * @file_fd:        file descriptor to read from(suggested to use 
>> O_DIRECT open file)
>> + * @batch:        how many memory alloced then file read(bytes), 
>> default 128MB
>> + *            will auto aligned to PAGE_SIZE
>> + * @fd_flags:        file descriptor flags used when allocating
>> + * @heap_flags:        flags passed to heap
>> + *
>> + * Provided by userspace as an argument to the ioctl
>> + */
>> +struct dma_heap_allocation_file_data {
>> +    __u32 fd;
>> +    __u32 file_fd;
>> +    __u32 batch;
>> +    __u32 fd_flags;
>> +    __u64 heap_flags;
>> +};
>> +
>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>     /**
>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>   #define DMA_HEAP_IOCTL_ALLOC    _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>                         struct dma_heap_allocation_data)
>>   +/**
>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool 
>> and both
>> + *                    read file when allocate memory.
>> + *
>> + * Takes a dma_heap_allocation_file_data struct and returns it with 
>> the fd field
>> + * populated with the dmabuf handle of the allocation. When return, 
>> the dma-buf
>> + * content is read from file.
>> + */
>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>> dma_heap_allocation_file_data)
>> +
>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>
Christian König July 11, 2024, 11:39 a.m. UTC | #3
Am 11.07.24 um 11:18 schrieb Huan Yang:
> Hi Christian,
>
> Thanks for your reply.
>
> 在 2024/7/11 17:00, Christian König 写道:
>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>> Some user may need load file into dma-buf, current
>>> way is:
>>>    1. allocate a dma-buf, get dma-buf fd
>>>    2. mmap dma-buf fd into vaddr
>>>    3. read(file_fd, vaddr, fsz)
>>> This is too heavy if fsz reached to GB.
>>
>> You need to describe a bit more why that is to heavy. I can only 
>> assume you need to save memory bandwidth and avoid the extra copy 
>> with the CPU.
>
> Sorry for the oversimplified explanation. But, yes, you're right, we 
> want to avoid this.
>
> As we are dealing with embedded devices, the available memory and 
> computing power for users are usually limited.(The maximum available 
> memory is currently
>
> 24GB, typically ranging from 8-12GB. )
>
> Also, the CPU computing power is also usually in short supply, due to 
> limited battery capacity and limited heat dissipation capabilities.
>
> So, we hope to avoid ineffective paths as much as possible.
>
>>
>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>> User need to offer a file_fd which you want to load into dma-buf, then,
>>> it promise if you got a dma-buf fd, it will contains the file content.
>>
>> Interesting idea, that has at least more potential than trying to 
>> enable direct I/O on mmap()ed DMA-bufs.
>>
>> The approach with the new IOCTL might not work because it is a very 
>> specialized use case.
>
> Thank you for your advice. maybe the "read file" behavior can be 
> attached to an existing allocation?

The point is there are already system calls to do something like that.

See copy_file_range() 
(https://man7.org/linux/man-pages/man2/copy_file_range.2.html) and 
sendfile() (https://man7.org/linux/man-pages/man2/sendfile.2.html).

What we probably could do is to internally optimize those.

> I am currently creating a new ioctl to remind the user that memory is 
> being allocated and read, and I am also unsure
>
> whether it is appropriate to add additional parameters to the existing 
> allocate behavior.
>
> Please, give me more suggestion. Thanks.
>
>>
>> But IIRC there was a copy_file_range callback in the file_operations 
>> structure you could use for that. I'm just not sure when and how 
>> that's used with the copy_file_range() system call.
>
> Sorry, I'm not familiar with this, but I will look into it. However, 
> this type of callback function is not currently implemented when 
> exporting
>
> the dma_buf file, which means that I need to implement the callback 
> for it?

If I'm not completely mistaken, the copy_file_range, splice_read and 
splice_write callbacks on the struct file_operations 
(https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999) 
can be used to implement what you want to do.

Regards,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Notice, file_fd depends on user how to open this file. So, both buffer
>>> I/O and Direct I/O is supported.
>>>
>>> Signed-off-by: Huan Yang <link@vivo.com>
>>> ---
>>>   drivers/dma-buf/dma-heap.c    | 525 
>>> +++++++++++++++++++++++++++++++++-
>>>   include/linux/dma-heap.h      |  57 +++-
>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>> index 2298ca5e112e..abe17281adb8 100644
>>> --- a/drivers/dma-buf/dma-heap.c
>>> +++ b/drivers/dma-buf/dma-heap.c
>>> @@ -15,9 +15,11 @@
>>>   #include <linux/list.h>
>>>   #include <linux/slab.h>
>>>   #include <linux/nospec.h>
>>> +#include <linux/highmem.h>
>>>   #include <linux/uaccess.h>
>>>   #include <linux/syscalls.h>
>>>   #include <linux/dma-heap.h>
>>> +#include <linux/vmalloc.h>
>>>   #include <uapi/linux/dma-heap.h>
>>>     #define DEVNAME "dma_heap"
>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>       struct cdev heap_cdev;
>>>   };
>>>   +/**
>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>> allocate use.
>>> + * @file:        file to read from.
>>> + *
>>> + * @cred:        kthread use, user cred copy to use for the read.
>>> + *
>>> + * @max_batch:        maximum batch size to read, if collect match 
>>> batch,
>>> + *            trigger read, default 128MB, must below file size.
>>> + *
>>> + * @fsz:        file size.
>>> + *
>>> + * @direct:        use direct IO?
>>> + */
>>> +struct dma_heap_file {
>>> +    struct file *file;
>>> +    struct cred *cred;
>>> +    size_t max_batch;
>>> +    size_t fsz;
>>> +    bool direct;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_work - represents a dma_heap file read real 
>>> work.
>>> + * @vaddr:        contigous virtual address alloc by vmap, file 
>>> read need.
>>> + *
>>> + * @start_size:        file read start offset, same to 
>>> @dma_heap_file_task->roffset.
>>> + *
>>> + * @need_size:        file read need size, same to 
>>> @dma_heap_file_task->rsize.
>>> + *
>>> + * @heap_file:        file wrapper.
>>> + *
>>> + * @list:        child node of @dma_heap_file_control->works.
>>> + *
>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, put 
>>> ref.
>>> + *
>>> + * @failp:        if any work io failed, set it true, pointp 
>>> @dma_heap_file_task->fail.
>>> + */
>>> +struct dma_heap_file_work {
>>> +    void *vaddr;
>>> +    ssize_t start_size;
>>> +    ssize_t need_size;
>>> +    struct dma_heap_file *heap_file;
>>> +    struct list_head list;
>>> +    atomic_t *refp;
>>> +    bool *failp;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_task - represents a dma_heap file read process
>>> + * @ref:        current file work counter, if zero, allocate and read
>>> + *            done.
>>> + *
>>> + * @roffset:        last read offset, current prepared work' begin 
>>> file
>>> + *            start offset.
>>> + *
>>> + * @rsize:        current allocated page size use to read, if reach 
>>> rbatch,
>>> + *            trigger commit.
>>> + *
>>> + * @rbatch:        current prepared work's batch, below 
>>> @dma_heap_file's
>>> + *            batch.
>>> + *
>>> + * @heap_file:        current dma_heap_file
>>> + *
>>> + * @parray:        used for vmap, size is @dma_heap_file's batch's 
>>> number
>>> + *            pages.(this is maximum). Due to single thread file read,
>>> + *            one page array reuse each work prepare is OK.
>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>> + *
>>> + * @pindex:        current allocated page filled in @parray's index.
>>> + *
>>> + * @fail:        any work failed when file read?
>>> + *
>>> + * dma_heap_file_task is the production of file read, will prepare 
>>> each work
>>> + * during allocate dma_buf pages, if match current batch, then 
>>> trigger commit
>>> + * and prepare next work. After all batch queued, user going on 
>>> prepare dma_buf
>>> + * and so on, but before return dma_buf fd, need to wait file read 
>>> end and
>>> + * check read result.
>>> + */
>>> +struct dma_heap_file_task {
>>> +    atomic_t ref;
>>> +    size_t roffset;
>>> +    size_t rsize;
>>> +    size_t rbatch;
>>> +    struct dma_heap_file *heap_file;
>>> +    struct page **parray;
>>> +    unsigned int pindex;
>>> +    bool fail;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_control - global control of dma_heap file 
>>> read.
>>> + * @works:        @dma_heap_file_work's list head.
>>> + *
>>> + * @lock:        only lock for @works.
>>> + *
>>> + * @threadwq:        wait queue for @work_thread, if commit work, 
>>> @work_thread
>>> + *            wakeup and read this work's file contains.
>>> + *
>>> + * @workwq:        used for main thread wait for file read end, if 
>>> allocation
>>> + *            end before file read. @dma_heap_file_task ref effect 
>>> this.
>>> + *
>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>> work's consumer.
>>> + *
>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>> alloc/free frequently.
>>> + *
>>> + * @nr_work:        global number of how many work committed.
>>> + */
>>> +struct dma_heap_file_control {
>>> +    struct list_head works;
>>> +    spinlock_t lock;
>>> +    wait_queue_head_t threadwq;
>>> +    wait_queue_head_t workwq;
>>> +    struct task_struct *work_thread;
>>> +    struct kmem_cache *heap_fwork_cachep;
>>> +    atomic_t nr_work;
>>> +};
>>> +
>>> +static struct dma_heap_file_control *heap_fctl;
>>>   static LIST_HEAD(heap_list);
>>>   static DEFINE_MUTEX(heap_list_lock);
>>>   static dev_t dma_heap_devt;
>>>   static struct class *dma_heap_class;
>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>   +/**
>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>> virtual address.
>>> + * @heap_ftask:        prepared and need to commit's work.
>>> + *
>>> + * Cached pages need to trigger file read, this function map each 
>>> scatter page
>>> + * into contiguous virtual address, so that file read can easy use.
>>> + * Now that we get vaddr page, cached pages can return to original 
>>> user, so we
>>> + * will not effect dma-buf export even if file read not end.
>>> + */
>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>> +            PAGE_KERNEL);
>>> +}
>>> +
>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>>> +                struct page *page)
>>> +{
>>> +    struct page **array = heap_ftask->parray;
>>> +    int index = heap_ftask->pindex;
>>> +    int num = compound_nr(page), i;
>>> +    unsigned long sz = page_size(page);
>>> +
>>> +    heap_ftask->rsize += sz;
>>> +    for (i = 0; i < num; ++i)
>>> +        array[index++] = &page[i];
>>> +    heap_ftask->pindex = index;
>>> +
>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>> +}
>>> +
>>> +static struct dma_heap_file_work *
>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +    struct dma_heap_file_work *heap_fwork;
>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>> +
>>> +    if (READ_ONCE(heap_ftask->fail))
>>> +        return NULL;
>>> +
>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>> GFP_KERNEL);
>>> +    if (unlikely(!heap_fwork))
>>> +        return NULL;
>>> +
>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>> +        kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    heap_fwork->heap_file = heap_file;
>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>> +    heap_fwork->refp = &heap_ftask->ref;
>>> +    heap_fwork->failp = &heap_ftask->fail;
>>> +    atomic_inc(&heap_ftask->ref);
>>> +    return heap_fwork;
>>> +}
>>> +
>>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>>> +{
>>> +    vunmap(heap_fwork->vaddr);
>>> +    atomic_dec(heap_fwork->refp);
>>> +    wake_up(&heap_fctl->workwq);
>>> +
>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>> +}
>>> +
>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +    struct dma_heap_file_work *heap_fwork = 
>>> init_file_work(heap_ftask);
>>> +    struct page *last = NULL;
>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>> +    size_t start = heap_ftask->roffset;
>>> +    struct file *file = heap_file->file;
>>> +    size_t fsz = heap_file->fsz;
>>> +
>>> +    if (unlikely(!heap_fwork))
>>> +        return -ENOMEM;
>>> +
>>> +    /**
>>> +     * If file size is not page aligned, direct io can't process 
>>> the tail.
>>> +     * So, if reach to tail, remain the last page use buffer read.
>>> +     */
>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>> +    }
>>> +
>>> +    spin_lock(&heap_fctl->lock);
>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>> +    spin_unlock(&heap_fctl->lock);
>>> +    atomic_inc(&heap_fctl->nr_work);
>>> +
>>> +    wake_up(&heap_fctl->threadwq);
>>> +
>>> +    if (last) {
>>> +        char *buf, *pathp;
>>> +        ssize_t err;
>>> +        void *buffer;
>>> +
>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>> +        if (unlikely(!buf))
>>> +            return -ENOMEM;
>>> +
>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>> +
>>> +        pathp = file_path(file, buf, PATH_MAX);
>>> +        if (IS_ERR(pathp)) {
>>> +            kfree(buf);
>>> +            return PTR_ERR(pathp);
>>> +        }
>>> +
>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>> +                         fsz - start, &fsz,
>>> +                         READING_POLICY);
>>> +        kunmap_local(buffer);
>>> +        kfree(buf);
>>> +        if (err < 0) {
>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>> +                   pathp, err, start, fsz, fsz);
>>> +
>>> +            return err;
>>> +        }
>>> +    }
>>> +
>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>> +    heap_ftask->rsize = 0;
>>> +    heap_ftask->pindex = 0;
>>> +    heap_ftask->rbatch = min_t(size_t,
>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>> +                   heap_ftask->rbatch);
>>> +    return 0;
>>> +}
>>> +
>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>> *heap_ftask)
>>> +{
>>> +    wait_event_freezable(heap_fctl->workwq,
>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>> +    return heap_ftask->fail;
>>> +}
>>> +
>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +    bool fail;
>>> +
>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>> +    fail = heap_ftask->fail;
>>> +    kvfree(heap_ftask->parray);
>>> +    kfree(heap_ftask);
>>> +    return fail;
>>> +}
>>> +
>>> +struct dma_heap_file_task *
>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>> +{
>>> +    struct dma_heap_file_task *heap_ftask =
>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>> +    if (unlikely(!heap_ftask))
>>> +        return NULL;
>>> +
>>> +    /**
>>> +     * Batch is the maximum size which we prepare work will meet.
>>> +     * So, direct alloc this number's page array is OK.
>>> +     */
>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>> PAGE_SHIFT,
>>> +                        sizeof(struct page *), GFP_KERNEL);
>>> +    if (unlikely(!heap_ftask->parray))
>>> +        goto put;
>>> +
>>> +    heap_ftask->heap_file = heap_file;
>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>> +    return heap_ftask;
>>> +put:
>>> +    kfree(heap_ftask);
>>> +    return NULL;
>>> +}
>>> +
>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>> +{
>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>> +    struct file *file = heap_file->file;
>>> +    ssize_t start = heap_fwork->start_size;
>>> +    ssize_t size = heap_fwork->need_size;
>>> +    void *buffer = heap_fwork->vaddr;
>>> +    const struct cred *old_cred;
>>> +    ssize_t err;
>>> +
>>> +    // use real task's cred to read this file.
>>> +    old_cred = override_creds(heap_file->cred);
>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>> &heap_file->fsz,
>>> +                   READING_POLICY);
>>> +    if (err < 0) {
>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>> f_sz=%ld\n",
>>> +               err, start, (start + size), heap_file->fsz);
>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>> +    }
>>> +    // recovery to my cred.
>>> +    revert_creds(old_cred);
>>> +}
>>> +
>>> +static int dma_heap_file_control_thread(void *data)
>>> +{
>>> +    struct dma_heap_file_control *heap_fctl =
>>> +        (struct dma_heap_file_control *)data;
>>> +    struct dma_heap_file_work *worker, *tmp;
>>> +    int nr_work;
>>> +
>>> +    LIST_HEAD(pages);
>>> +    LIST_HEAD(workers);
>>> +
>>> +    while (true) {
>>> +        wait_event_freezable(heap_fctl->threadwq,
>>> +                     atomic_read(&heap_fctl->nr_work) > 0);
>>> +recheck:
>>> +        spin_lock(&heap_fctl->lock);
>>> +        list_splice_init(&heap_fctl->works, &workers);
>>> +        spin_unlock(&heap_fctl->lock);
>>> +
>>> +        if (unlikely(kthread_should_stop())) {
>>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>>> +                list_del(&worker->list);
>>> +                destroy_file_work(worker);
>>> +            }
>>> +            break;
>>> +        }
>>> +
>>> +        nr_work = 0;
>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>> +            ++nr_work;
>>> +            list_del(&worker->list);
>>> +            __work_this_io(worker);
>>> +
>>> +            destroy_file_work(worker);
>>> +        }
>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>> +
>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>> +            goto recheck;
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>> +{
>>> +    return heap_file->fsz;
>>> +}
>>> +
>>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, 
>>> int file_fd,
>>> +                 size_t batch)
>>> +{
>>> +    struct file *file;
>>> +    size_t fsz;
>>> +    int ret;
>>> +
>>> +    file = fget(file_fd);
>>> +    if (!file)
>>> +        return -EINVAL;
>>> +
>>> +    fsz = i_size_read(file_inode(file));
>>> +    if (fsz < batch) {
>>> +        ret = -EINVAL;
>>> +        goto err;
>>> +    }
>>> +
>>> +    /**
>>> +     * Selinux block our read, but actually we are reading the 
>>> stand-in
>>> +     * for this file.
>>> +     * So save current's cred and when going to read, override 
>>> mine, and
>>> +     * end of read, revert.
>>> +     */
>>> +    heap_file->cred = prepare_kernel_cred(current);
>>> +    if (unlikely(!heap_file->cred)) {
>>> +        ret = -ENOMEM;
>>> +        goto err;
>>> +    }
>>> +
>>> +    heap_file->file = file;
>>> +    heap_file->max_batch = batch;
>>> +    heap_file->fsz = fsz;
>>> +
>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>> +
>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>> +    if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>> larget file\n");
>>> +
>>> +    return 0;
>>> +
>>> +err:
>>> +    fput(file);
>>> +    return ret;
>>> +}
>>> +
>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>> +{
>>> +    fput(heap_file->file);
>>> +    put_cred(heap_file->cred);
>>> +}
>>> +
>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, 
>>> int file_fd,
>>> +                       size_t batch, unsigned int fd_flags,
>>> +                       unsigned int heap_flags)
>>> +{
>>> +    struct dma_buf *dmabuf;
>>> +    int fd;
>>> +    struct dma_heap_file heap_file;
>>> +
>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>> +    if (fd)
>>> +        goto error_file;
>>> +
>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
>>> +                           heap_flags);
>>> +    if (IS_ERR(dmabuf)) {
>>> +        fd = PTR_ERR(dmabuf);
>>> +        goto error;
>>> +    }
>>> +
>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>> +    if (fd < 0) {
>>> +        dma_buf_put(dmabuf);
>>> +        /* just return, as put will call release and that will free */
>>> +    }
>>> +
>>> +error:
>>> +    destroy_dma_heap_file(&heap_file);
>>> +error_file:
>>> +    return fd;
>>> +}
>>> +
>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>                    u32 fd_flags,
>>>                    u64 heap_flags)
>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>>> struct file *file)
>>>       return 0;
>>>   }
>>>   +static long dma_heap_ioctl_allocate_read_file(struct file *file, 
>>> void *data)
>>> +{
>>> +    struct dma_heap_allocation_file_data *heap_allocation_file = data;
>>> +    struct dma_heap *heap = file->private_data;
>>> +    int fd;
>>> +
>>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>> +        return -EINVAL;
>>> +
>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>> +        return -EINVAL;
>>> +
>>> +    if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
>>> +        return -EINVAL;
>>> +
>>> +    if (!heap->ops->allocate_read_file)
>>> +        return -EINVAL;
>>> +
>>> +    fd = dma_heap_buffer_alloc_read_file(
>>> +        heap, heap_allocation_file->file_fd,
>>> +        heap_allocation_file->batch ?
>>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>>> +            DEFAULT_ADI_BATCH,
>>> +        heap_allocation_file->fd_flags,
>>> +        heap_allocation_file->heap_flags);
>>> +    if (fd < 0)
>>> +        return fd;
>>> +
>>> +    heap_allocation_file->fd = fd;
>>> +    return 0;
>>> +}
>>> +
>>>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>   {
>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file 
>>> *file, void *data)
>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>       DMA_HEAP_IOCTL_ALLOC,
>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>   };
>>>     static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>>> unsigned int ucmd,
>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>           break;
>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>> +        break;
>>>       default:
>>>           ret = -ENOTTY;
>>>           goto err;
>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>         dma_heap_class = class_create(DEVNAME);
>>>       if (IS_ERR(dma_heap_class)) {
>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>> -        return PTR_ERR(dma_heap_class);
>>> +        ret = PTR_ERR(dma_heap_class);
>>> +        goto fail_class;
>>>       }
>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>> +    if (unlikely(!heap_fctl)) {
>>> +        ret =  -ENOMEM;
>>> +        goto fail_alloc;
>>> +    }
>>> +
>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>> +
>>> +    heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
>>> +                         heap_fctl, "heap_fwork_t");
>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>> +        ret = -ENOMEM;
>>> +        goto fail_thread;
>>> +    }
>>> +
>>> +    heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>> +        ret = -ENOMEM;
>>> +        goto fail_cache;
>>> +    }
>>> +
>>>       return 0;
>>> +
>>> +fail_cache:
>>> +    kthread_stop(heap_fctl->work_thread);
>>> +fail_thread:
>>> +    kfree(heap_fctl);
>>> +fail_alloc:
>>> +    class_destroy(dma_heap_class);
>>> +fail_class:
>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>> +    return ret;
>>>   }
>>>   subsys_initcall(dma_heap_init);
>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>> index 064bad725061..9c25383f816c 100644
>>> --- a/include/linux/dma-heap.h
>>> +++ b/include/linux/dma-heap.h
>>> @@ -12,12 +12,17 @@
>>>   #include <linux/cdev.h>
>>>   #include <linux/types.h>
>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>> +
>>>   struct dma_heap;
>>> +struct dma_heap_file_task;
>>> +struct dma_heap_file;
>>>     /**
>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>    * @allocate:        allocate dmabuf and return struct dma_buf ptr
>>> - *
>>> + * @allocate_read_file: allocate dmabuf and read file, then return 
>>> struct
>>> + * dma_buf ptr.
>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>    */
>>>   struct dma_heap_ops {
>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>                       unsigned long len,
>>>                       u32 fd_flags,
>>>                       u64 heap_flags);
>>> +
>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>> +                          struct dma_heap_file *heap_file,
>>> +                          u32 fd_flags,
>>> +                          u64 heap_flags);
>>>   };
>>>     /**
>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>> *heap);
>>>    */
>>>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>>> *exp_info);
>>>   +/**
>>> + * dma_heap_destroy_file_read - waits for a file read to complete 
>>> then destroy it
>>> + * Returns: true if the file read failed, false otherwise
>>> + */
>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>> *heap_ftask);
>>> +
>>> +/**
>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>> + * Returns: true if the file read failed, false otherwise
>>> + */
>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>> *heap_ftask);
>>> +
>>> +/**
>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>> allocate pages.
>>> + * @heap_file:        target file to read
>>> + *
>>> + * Return NULL if failed, otherwise return a struct pointer.
>>> + */
>>> +struct dma_heap_file_task *
>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>> +
>>> +/**
>>> + * dma_heap_prepare_file_read - cache each allocated page until we 
>>> meet this batch.
>>> + * @heap_ftask:        prepared and need to commit's work.
>>> + * @page:        current allocated page. don't care which order.
>>> + *
>>> + * Returns true if reach to batch, false so go on prepare.
>>> + */
>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>>> +                struct page *page);
>>> +
>>> +/**
>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>> going to trigger IO
>>> + * @heap_ftask:            info that current IO needs
>>> + *
>>> + * This commit will also check if reach to tail read.
>>> + * For direct I/O submissions, it is necessary to pay attention to 
>>> file reads
>>> + * that are not page-aligned. For the unaligned portion of the 
>>> read, buffer IO
>>> + * needs to be triggered.
>>> + * Returns:
>>> + *   0 if all right, -errno if something wrong
>>> + */
>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>> +
>>>   #endif /* _DMA_HEAPS_H */
>>> diff --git a/include/uapi/linux/dma-heap.h 
>>> b/include/uapi/linux/dma-heap.h
>>> index a4cf716a49fa..8c20e8b74eed 100644
>>> --- a/include/uapi/linux/dma-heap.h
>>> +++ b/include/uapi/linux/dma-heap.h
>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>       __u64 heap_flags;
>>>   };
>>>   +/**
>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>> userspace for
>>> + *                                      allocations and read file
>>> + * @fd:            will be populated with a fd which provides the
>>> + *            handle to the allocated dma-buf
>>> + * @file_fd:        file descriptor to read from(suggested to use 
>>> O_DIRECT open file)
>>> + * @batch:        how many memory alloced then file read(bytes), 
>>> default 128MB
>>> + *            will auto aligned to PAGE_SIZE
>>> + * @fd_flags:        file descriptor flags used when allocating
>>> + * @heap_flags:        flags passed to heap
>>> + *
>>> + * Provided by userspace as an argument to the ioctl
>>> + */
>>> +struct dma_heap_allocation_file_data {
>>> +    __u32 fd;
>>> +    __u32 file_fd;
>>> +    __u32 batch;
>>> +    __u32 fd_flags;
>>> +    __u64 heap_flags;
>>> +};
>>> +
>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>     /**
>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>   #define DMA_HEAP_IOCTL_ALLOC    _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>                         struct dma_heap_allocation_data)
>>>   +/**
>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool 
>>> and both
>>> + *                    read file when allocate memory.
>>> + *
>>> + * Takes a dma_heap_allocation_file_data struct and returns it with 
>>> the fd field
>>> + * populated with the dmabuf handle of the allocation. When return, 
>>> the dma-buf
>>> + * content is read from file.
>>> + */
>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>> dma_heap_allocation_file_data)
>>> +
>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>
Huan Yang July 12, 2024, 1:59 a.m. UTC | #4
Hi Christian,

在 2024/7/11 19:39, Christian König 写道:
> Am 11.07.24 um 11:18 schrieb Huan Yang:
>> Hi Christian,
>>
>> Thanks for your reply.
>>
>> 在 2024/7/11 17:00, Christian König 写道:
>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>> Some user may need load file into dma-buf, current
>>>> way is:
>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>    2. mmap dma-buf fd into vaddr
>>>>    3. read(file_fd, vaddr, fsz)
>>>> This is too heavy if fsz reached to GB.
>>>
>>> You need to describe a bit more why that is to heavy. I can only 
>>> assume you need to save memory bandwidth and avoid the extra copy 
>>> with the CPU.
>>
>> Sorry for the oversimplified explanation. But, yes, you're right, we 
>> want to avoid this.
>>
>> As we are dealing with embedded devices, the available memory and 
>> computing power for users are usually limited.(The maximum available 
>> memory is currently
>>
>> 24GB, typically ranging from 8-12GB. )
>>
>> Also, the CPU computing power is also usually in short supply, due to 
>> limited battery capacity and limited heat dissipation capabilities.
>>
>> So, we hope to avoid ineffective paths as much as possible.
>>
>>>
>>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>> User need to offer a file_fd which you want to load into dma-buf, 
>>>> then,
>>>> it promise if you got a dma-buf fd, it will contains the file content.
>>>
>>> Interesting idea, that has at least more potential than trying to 
>>> enable direct I/O on mmap()ed DMA-bufs.
>>>
>>> The approach with the new IOCTL might not work because it is a very 
>>> specialized use case.
>>
>> Thank you for your advice. maybe the "read file" behavior can be 
>> attached to an existing allocation?
>
> The point is there are already system calls to do something like that.
>
> See copy_file_range() 
> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
> and send_file() 
> (https://man7.org/linux/man-pages/man2/sendfile.2.html).

That's helpful to know, thanks.

If the only goal were to let DMA-BUF support direct I/O, 
copy_file_range()/sendfile() could help achieve that.

However, my patchset also aims to achieve parallel copying of file 
contents while allocating the DMA-BUF, which is something that the 
current set of calls may not be able to accomplish.

Perhaps this could be achieved by simply returning the DMA-BUF file 
descriptor and then implementing copy_file_range, populating the memory 
and its content during the copy? At present, that looks quite complex: 
we would need to ensure that operations on the returned DMA-BUF file 
descriptor (mmap, vmap, attach, and so on) fail while the memory has 
not yet been filled.

>
> What we probably could do is to internally optimize those.
>
>> I am currently creating a new ioctl to remind the user that memory is 
>> being allocated and read, and I am also unsure
>>
>> whether it is appropriate to add additional parameters to the 
>> existing allocate behavior.
>>
>> Please, give me more suggestion. Thanks.
>>
>>>
>>> But IIRC there was a copy_file_range callback in the file_operations 
>>> structure you could use for that. I'm just not sure when and how 
>>> that's used with the copy_file_range() system call.
>>
>> Sorry, I'm not familiar with this, but I will look into it. However, 
>> this type of callback function is not currently implemented when 
>> exporting
>>
>> the dma_buf file, which means that I need to implement the callback 
>> for it?
>
> If I'm not completely mistaken the copy_file_range, splice_read and 
> splice_write callbacks on the struct file_operations 
> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>
> Can be used to implement what you want to do.
Yes.
>
> Regards,
> Christian.
>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Notice, file_fd depends on user how to open this file. So, both buffer
>>>> I/O and Direct I/O is supported.
>>>>
>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>> ---
>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>> +++++++++++++++++++++++++++++++++-
>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>>> index 2298ca5e112e..abe17281adb8 100644
>>>> --- a/drivers/dma-buf/dma-heap.c
>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>> @@ -15,9 +15,11 @@
>>>>   #include <linux/list.h>
>>>>   #include <linux/slab.h>
>>>>   #include <linux/nospec.h>
>>>> +#include <linux/highmem.h>
>>>>   #include <linux/uaccess.h>
>>>>   #include <linux/syscalls.h>
>>>>   #include <linux/dma-heap.h>
>>>> +#include <linux/vmalloc.h>
>>>>   #include <uapi/linux/dma-heap.h>
>>>>     #define DEVNAME "dma_heap"
>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>       struct cdev heap_cdev;
>>>>   };
>>>>   +/**
>>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>>> allocate use.
>>>> + * @file:        file to read from.
>>>> + *
>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>> + *
>>>> + * @max_batch:        maximum batch size to read, if collect match 
>>>> batch,
>>>> + *            trigger read, default 128MB, must below file size.
>>>> + *
>>>> + * @fsz:        file size.
>>>> + *
>>>> + * @direct:        use direct IO?
>>>> + */
>>>> +struct dma_heap_file {
>>>> +    struct file *file;
>>>> +    struct cred *cred;
>>>> +    size_t max_batch;
>>>> +    size_t fsz;
>>>> +    bool direct;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>> real work.
>>>> + * @vaddr:        contigous virtual address alloc by vmap, file 
>>>> read need.
>>>> + *
>>>> + * @start_size:        file read start offset, same to 
>>>> @dma_heap_file_task->roffset.
>>>> + *
>>>> + * @need_size:        file read need size, same to 
>>>> @dma_heap_file_task->rsize.
>>>> + *
>>>> + * @heap_file:        file wrapper.
>>>> + *
>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>> + *
>>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, 
>>>> put ref.
>>>> + *
>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>> @dma_heap_file_task->fail.
>>>> + */
>>>> +struct dma_heap_file_work {
>>>> +    void *vaddr;
>>>> +    ssize_t start_size;
>>>> +    ssize_t need_size;
>>>> +    struct dma_heap_file *heap_file;
>>>> +    struct list_head list;
>>>> +    atomic_t *refp;
>>>> +    bool *failp;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>> process
>>>> + * @ref:        current file work counter, if zero, allocate and read
>>>> + *            done.
>>>> + *
>>>> + * @roffset:        last read offset, current prepared work' begin 
>>>> file
>>>> + *            start offset.
>>>> + *
>>>> + * @rsize:        current allocated page size use to read, if 
>>>> reach rbatch,
>>>> + *            trigger commit.
>>>> + *
>>>> + * @rbatch:        current prepared work's batch, below 
>>>> @dma_heap_file's
>>>> + *            batch.
>>>> + *
>>>> + * @heap_file:        current dma_heap_file
>>>> + *
>>>> + * @parray:        used for vmap, size is @dma_heap_file's batch's 
>>>> number
>>>> + *            pages.(this is maximum). Due to single thread file 
>>>> read,
>>>> + *            one page array reuse each work prepare is OK.
>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>> + *
>>>> + * @pindex:        current allocated page filled in @parray's index.
>>>> + *
>>>> + * @fail:        any work failed when file read?
>>>> + *
>>>> + * dma_heap_file_task is the production of file read, will prepare 
>>>> each work
>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>> trigger commit
>>>> + * and prepare next work. After all batch queued, user going on 
>>>> prepare dma_buf
>>>> + * and so on, but before return dma_buf fd, need to wait file read 
>>>> end and
>>>> + * check read result.
>>>> + */
>>>> +struct dma_heap_file_task {
>>>> +    atomic_t ref;
>>>> +    size_t roffset;
>>>> +    size_t rsize;
>>>> +    size_t rbatch;
>>>> +    struct dma_heap_file *heap_file;
>>>> +    struct page **parray;
>>>> +    unsigned int pindex;
>>>> +    bool fail;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_control - global control of dma_heap file 
>>>> read.
>>>> + * @works:        @dma_heap_file_work's list head.
>>>> + *
>>>> + * @lock:        only lock for @works.
>>>> + *
>>>> + * @threadwq:        wait queue for @work_thread, if commit work, 
>>>> @work_thread
>>>> + *            wakeup and read this work's file contains.
>>>> + *
>>>> + * @workwq:        used for main thread wait for file read end, if 
>>>> allocation
>>>> + *            end before file read. @dma_heap_file_task ref effect 
>>>> this.
>>>> + *
>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>> work's consumer.
>>>> + *
>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>> alloc/free frequently.
>>>> + *
>>>> + * @nr_work:        global number of how many work committed.
>>>> + */
>>>> +struct dma_heap_file_control {
>>>> +    struct list_head works;
>>>> +    spinlock_t lock;
>>>> +    wait_queue_head_t threadwq;
>>>> +    wait_queue_head_t workwq;
>>>> +    struct task_struct *work_thread;
>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>> +    atomic_t nr_work;
>>>> +};
>>>> +
>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>   static LIST_HEAD(heap_list);
>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>   static dev_t dma_heap_devt;
>>>>   static struct class *dma_heap_class;
>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>   +/**
>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>> virtual address.
>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>> + *
>>>> + * Cached pages need to trigger file read, this function map each 
>>>> scatter page
>>>> + * into contiguous virtual address, so that file read can easy use.
>>>> + * Now that we get vaddr page, cached pages can return to original 
>>>> user, so we
>>>> + * will not effect dma-buf export even if file read not end.
>>>> + */
>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>> +            PAGE_KERNEL);
>>>> +}
>>>> +
>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>> *heap_ftask,
>>>> +                struct page *page)
>>>> +{
>>>> +    struct page **array = heap_ftask->parray;
>>>> +    int index = heap_ftask->pindex;
>>>> +    int num = compound_nr(page), i;
>>>> +    unsigned long sz = page_size(page);
>>>> +
>>>> +    heap_ftask->rsize += sz;
>>>> +    for (i = 0; i < num; ++i)
>>>> +        array[index++] = &page[i];
>>>> +    heap_ftask->pindex = index;
>>>> +
>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>> +}
>>>> +
>>>> +static struct dma_heap_file_work *
>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>> +{
>>>> +    struct dma_heap_file_work *heap_fwork;
>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>> +
>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>> +        return NULL;
>>>> +
>>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>>> GFP_KERNEL);
>>>> +    if (unlikely(!heap_fwork))
>>>> +        return NULL;
>>>> +
>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>> +        kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    heap_fwork->heap_file = heap_file;
>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>> +    atomic_inc(&heap_ftask->ref);
>>>> +    return heap_fwork;
>>>> +}
>>>> +
>>>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>>>> +{
>>>> +    vunmap(heap_fwork->vaddr);
>>>> +    atomic_dec(heap_fwork->refp);
>>>> +    wake_up(&heap_fctl->workwq);
>>>> +
>>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>> +}
>>>> +
>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>>>> +{
>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>> init_file_work(heap_ftask);
>>>> +    struct page *last = NULL;
>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>> +    size_t start = heap_ftask->roffset;
>>>> +    struct file *file = heap_file->file;
>>>> +    size_t fsz = heap_file->fsz;
>>>> +
>>>> +    if (unlikely(!heap_fwork))
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /**
>>>> +     * If file size is not page aligned, direct io can't process 
>>>> the tail.
>>>> +     * So, if reach to tail, remain the last page use buffer read.
>>>> +     */
>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>> +    }
>>>> +
>>>> +    spin_lock(&heap_fctl->lock);
>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>> +    spin_unlock(&heap_fctl->lock);
>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>> +
>>>> +    wake_up(&heap_fctl->threadwq);
>>>> +
>>>> +    if (last) {
>>>> +        char *buf, *pathp;
>>>> +        ssize_t err;
>>>> +        void *buffer;
>>>> +
>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>> +        if (unlikely(!buf))
>>>> +            return -ENOMEM;
>>>> +
>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>> +
>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>> +        if (IS_ERR(pathp)) {
>>>> +            kfree(buf);
>>>> +            return PTR_ERR(pathp);
>>>> +        }
>>>> +
>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>> +                         fsz - start, &fsz,
>>>> +                         READING_POLICY);
>>>> +        kunmap_local(buffer);
>>>> +        kfree(buf);
>>>> +        if (err < 0) {
>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>> +                   pathp, err, start, fsz, fsz);
>>>> +
>>>> +            return err;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>> +    heap_ftask->rsize = 0;
>>>> +    heap_ftask->pindex = 0;
>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>> +                   heap_ftask->rbatch);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>>> +    return heap_ftask->fail;
>>>> +}
>>>> +
>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    bool fail;
>>>> +
>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>> +    fail = heap_ftask->fail;
>>>> +    kvfree(heap_ftask->parray);
>>>> +    kfree(heap_ftask);
>>>> +    return fail;
>>>> +}
>>>> +
>>>> +struct dma_heap_file_task *
>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    struct dma_heap_file_task *heap_ftask =
>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>> +    if (unlikely(!heap_ftask))
>>>> +        return NULL;
>>>> +
>>>> +    /**
>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>> +     * So, direct alloc this number's page array is OK.
>>>> +     */
>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>>> PAGE_SHIFT,
>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>> +    if (unlikely(!heap_ftask->parray))
>>>> +        goto put;
>>>> +
>>>> +    heap_ftask->heap_file = heap_file;
>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>> +    return heap_ftask;
>>>> +put:
>>>> +    kfree(heap_ftask);
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>> +{
>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>> +    struct file *file = heap_file->file;
>>>> +    ssize_t start = heap_fwork->start_size;
>>>> +    ssize_t size = heap_fwork->need_size;
>>>> +    void *buffer = heap_fwork->vaddr;
>>>> +    const struct cred *old_cred;
>>>> +    ssize_t err;
>>>> +
>>>> +    // use real task's cred to read this file.
>>>> +    old_cred = override_creds(heap_file->cred);
>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>> &heap_file->fsz,
>>>> +                   READING_POLICY);
>>>> +    if (err < 0) {
>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>> f_sz=%ld\n",
>>>> +               err, start, (start + size), heap_file->fsz);
>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>> +    }
>>>> +    // recovery to my cred.
>>>> +    revert_creds(old_cred);
>>>> +}
>>>> +
>>>> +static int dma_heap_file_control_thread(void *data)
>>>> +{
>>>> +    struct dma_heap_file_control *heap_fctl =
>>>> +        (struct dma_heap_file_control *)data;
>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>> +    int nr_work;
>>>> +
>>>> +    LIST_HEAD(pages);
>>>> +    LIST_HEAD(workers);
>>>> +
>>>> +    while (true) {
>>>> +        wait_event_freezable(heap_fctl->threadwq,
>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>> +recheck:
>>>> +        spin_lock(&heap_fctl->lock);
>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>> +        spin_unlock(&heap_fctl->lock);
>>>> +
>>>> +        if (unlikely(kthread_should_stop())) {
>>>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>> +                list_del(&worker->list);
>>>> +                destroy_file_work(worker);
>>>> +            }
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        nr_work = 0;
>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>> +            ++nr_work;
>>>> +            list_del(&worker->list);
>>>> +            __work_this_io(worker);
>>>> +
>>>> +            destroy_file_work(worker);
>>>> +        }
>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>> +
>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>> +            goto recheck;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    return heap_file->fsz;
>>>> +}
>>>> +
>>>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, 
>>>> int file_fd,
>>>> +                 size_t batch)
>>>> +{
>>>> +    struct file *file;
>>>> +    size_t fsz;
>>>> +    int ret;
>>>> +
>>>> +    file = fget(file_fd);
>>>> +    if (!file)
>>>> +        return -EINVAL;
>>>> +
>>>> +    fsz = i_size_read(file_inode(file));
>>>> +    if (fsz < batch) {
>>>> +        ret = -EINVAL;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /**
>>>> +     * Selinux block our read, but actually we are reading the 
>>>> stand-in
>>>> +     * for this file.
>>>> +     * So save current's cred and when going to read, override 
>>>> mine, and
>>>> +     * end of read, revert.
>>>> +     */
>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>> +    if (unlikely(!heap_file->cred)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    heap_file->file = file;
>>>> +    heap_file->max_batch = batch;
>>>> +    heap_file->fsz = fsz;
>>>> +
>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>> +
>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>> +    if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>>> larget file\n");
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err:
>>>> +    fput(file);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    fput(heap_file->file);
>>>> +    put_cred(heap_file->cred);
>>>> +}
>>>> +
>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, 
>>>> int file_fd,
>>>> +                       size_t batch, unsigned int fd_flags,
>>>> +                       unsigned int heap_flags)
>>>> +{
>>>> +    struct dma_buf *dmabuf;
>>>> +    int fd;
>>>> +    struct dma_heap_file heap_file;
>>>> +
>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>> +    if (fd)
>>>> +        goto error_file;
>>>> +
>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>> fd_flags,
>>>> +                           heap_flags);
>>>> +    if (IS_ERR(dmabuf)) {
>>>> +        fd = PTR_ERR(dmabuf);
>>>> +        goto error;
>>>> +    }
>>>> +
>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>> +    if (fd < 0) {
>>>> +        dma_buf_put(dmabuf);
>>>> +        /* just return, as put will call release and that will 
>>>> free */
>>>> +    }
>>>> +
>>>> +error:
>>>> +    destroy_dma_heap_file(&heap_file);
>>>> +error_file:
>>>> +    return fd;
>>>> +}
>>>> +
>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>>                    u32 fd_flags,
>>>>                    u64 heap_flags)
>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>>>> struct file *file)
>>>>       return 0;
>>>>   }
>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file *file, 
>>>> void *data)
>>>> +{
>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file = 
>>>> data;
>>>> +    struct dma_heap *heap = file->private_data;
>>>> +    int fd;
>>>> +
>>>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (heap_allocation_file->heap_flags & 
>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (!heap->ops->allocate_read_file)
>>>> +        return -EINVAL;
>>>> +
>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>> +        heap, heap_allocation_file->file_fd,
>>>> +        heap_allocation_file->batch ?
>>>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>>>> +            DEFAULT_ADI_BATCH,
>>>> +        heap_allocation_file->fd_flags,
>>>> +        heap_allocation_file->heap_flags);
>>>> +    if (fd < 0)
>>>> +        return fd;
>>>> +
>>>> +    heap_allocation_file->fd = fd;
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>   {
>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file 
>>>> *file, void *data)
>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>   };
>>>>     static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>>>> unsigned int ucmd,
>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>           break;
>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>> +        break;
>>>>       default:
>>>>           ret = -ENOTTY;
>>>>           goto err;
>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>         dma_heap_class = class_create(DEVNAME);
>>>>       if (IS_ERR(dma_heap_class)) {
>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>> -        return PTR_ERR(dma_heap_class);
>>>> +        ret = PTR_ERR(dma_heap_class);
>>>> +        goto fail_class;
>>>>       }
>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>> +    if (unlikely(!heap_fctl)) {
>>>> +        ret =  -ENOMEM;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +
>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>> +
>>>> +    heap_fctl->work_thread = 
>>>> kthread_run(dma_heap_file_control_thread,
>>>> +                         heap_fctl, "heap_fwork_t");
>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto fail_thread;
>>>> +    }
>>>> +
>>>> +    heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto fail_cache;
>>>> +    }
>>>> +
>>>>       return 0;
>>>> +
>>>> +fail_cache:
>>>> +    kthread_stop(heap_fctl->work_thread);
>>>> +fail_thread:
>>>> +    kfree(heap_fctl);
>>>> +fail_alloc:
>>>> +    class_destroy(dma_heap_class);
>>>> +fail_class:
>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>> +    return ret;
>>>>   }
>>>>   subsys_initcall(dma_heap_init);
>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>> index 064bad725061..9c25383f816c 100644
>>>> --- a/include/linux/dma-heap.h
>>>> +++ b/include/linux/dma-heap.h
>>>> @@ -12,12 +12,17 @@
>>>>   #include <linux/cdev.h>
>>>>   #include <linux/types.h>
>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>> +
>>>>   struct dma_heap;
>>>> +struct dma_heap_file_task;
>>>> +struct dma_heap_file;
>>>>     /**
>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>    * @allocate:        allocate dmabuf and return struct dma_buf ptr
>>>> - *
>>>> + * @allocate_read_file: allocate dmabuf and read file, then return 
>>>> struct
>>>> + * dma_buf ptr.
>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>    */
>>>>   struct dma_heap_ops {
>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>                       unsigned long len,
>>>>                       u32 fd_flags,
>>>>                       u64 heap_flags);
>>>> +
>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>> +                          struct dma_heap_file *heap_file,
>>>> +                          u32 fd_flags,
>>>> +                          u64 heap_flags);
>>>>   };
>>>>     /**
>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>>> *heap);
>>>>    */
>>>>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>>>> *exp_info);
>>>>   +/**
>>>> + * dma_heap_destroy_file_read - waits for a file read to complete 
>>>> then destroy it
>>>> + * Returns: true if the file read failed, false otherwise
>>>> + */
>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>> *heap_ftask);
>>>> +
>>>> +/**
>>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>>> + * Returns: true if the file read failed, false otherwise
>>>> + */
>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>> *heap_ftask);
>>>> +
>>>> +/**
>>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>>> allocate pages.
>>>> + * @heap_file:        target file to read
>>>> + *
>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>> + */
>>>> +struct dma_heap_file_task *
>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>> +
>>>> +/**
>>>> + * dma_heap_prepare_file_read - cache each allocated page until we 
>>>> meet this batch.
>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>> + * @page:        current allocated page. don't care which order.
>>>> + *
>>>> + * Returns true if reach to batch, false so go on prepare.
>>>> + */
>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>> *heap_ftask,
>>>> +                struct page *page);
>>>> +
>>>> +/**
>>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>>> going to trigger IO
>>>> + * @heap_ftask:            info that current IO needs
>>>> + *
>>>> + * This commit will also check if reach to tail read.
>>>> + * For direct I/O submissions, it is necessary to pay attention to 
>>>> file reads
>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>> read, buffer IO
>>>> + * needs to be triggered.
>>>> + * Returns:
>>>> + *   0 if all right, -errno if something wrong
>>>> + */
>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>> +
>>>>   #endif /* _DMA_HEAPS_H */
>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>> b/include/uapi/linux/dma-heap.h
>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>> --- a/include/uapi/linux/dma-heap.h
>>>> +++ b/include/uapi/linux/dma-heap.h
>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>       __u64 heap_flags;
>>>>   };
>>>>   +/**
>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>> userspace for
>>>> + *                                      allocations and read file
>>>> + * @fd:            will be populated with a fd which provides the
>>>> + *            handle to the allocated dma-buf
>>>> + * @file_fd:        file descriptor to read from(suggested to use 
>>>> O_DIRECT open file)
>>>> + * @batch:        how many memory alloced then file read(bytes), 
>>>> default 128MB
>>>> + *            will auto aligned to PAGE_SIZE
>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>> + * @heap_flags:        flags passed to heap
>>>> + *
>>>> + * Provided by userspace as an argument to the ioctl
>>>> + */
>>>> +struct dma_heap_allocation_file_data {
>>>> +    __u32 fd;
>>>> +    __u32 file_fd;
>>>> +    __u32 batch;
>>>> +    __u32 fd_flags;
>>>> +    __u64 heap_flags;
>>>> +};
>>>> +
>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>     /**
>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>   #define DMA_HEAP_IOCTL_ALLOC    _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>                         struct dma_heap_allocation_data)
>>>>   +/**
>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool 
>>>> and both
>>>> + *                    read file when allocate memory.
>>>> + *
>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>> with the fd field
>>>> + * populated with the dmabuf handle of the allocation. When 
>>>> return, the dma-buf
>>>> + * content is read from file.
>>>> + */
>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>> dma_heap_allocation_file_data)
>>>> +
>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>
>
Huan Yang July 12, 2024, 2:14 a.m. UTC | #5
在 2024/7/12 9:59, Huan Yang 写道:
> Hi Christian,
>
> 在 2024/7/11 19:39, Christian König 写道:
>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>> Hi Christian,
>>>
>>> Thanks for your reply.
>>>
>>> 在 2024/7/11 17:00, Christian König 写道:
>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>> Some user may need load file into dma-buf, current
>>>>> way is:
>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>    2. mmap dma-buf fd into vaddr
>>>>>    3. read(file_fd, vaddr, fsz)
>>>>> This is too heavy if fsz reached to GB.
>>>>
>>>> You need to describe a bit more why that is to heavy. I can only 
>>>> assume you need to save memory bandwidth and avoid the extra copy 
>>>> with the CPU.
>>>
>>> Sorry for the oversimplified explanation. But, yes, you're right, we 
>>> want to avoid this.
>>>
>>> As we are dealing with embedded devices, the available memory and 
>>> computing power for users are usually limited.(The maximum available 
>>> memory is currently
>>>
>>> 24GB, typically ranging from 8-12GB. )
>>>
>>> Also, the CPU computing power is also usually in short supply, due 
>>> to limited battery capacity and limited heat dissipation capabilities.
>>>
>>> So, we hope to avoid ineffective paths as much as possible.
>>>
>>>>
>>>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>> User need to offer a file_fd which you want to load into dma-buf, 
>>>>> then,
>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>> content.
>>>>
>>>> Interesting idea, that has at least more potential than trying to 
>>>> enable direct I/O on mmap()ed DMA-bufs.
>>>>
>>>> The approach with the new IOCTL might not work because it is a very 
>>>> specialized use case.
>>>
>>> Thank you for your advice. maybe the "read file" behavior can be 
>>> attached to an existing allocation?
>>
>> The point is there are already system calls to do something like that.
>>
>> See copy_file_range() 
>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
>> and send_file() 
>> (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>
> That's helpfull to learn it, thanks.
>
> In terms of only DMA-BUF supporting direct I/O, 
> copy_file_range/send_file may help to achieve this functionality.
>
> However, my patchset also aims to achieve parallel copying of file 
> contents while allocating the DMA-BUF, which is something that the 
> current set of calls may not be able to accomplish.

As shown in the cover letter, here is a comparison between the normal
path and this ioctl under memory pressure. Even with buffered I/O, this
ioctl gives about a 50% improvement thanks to the parallel read.

Test setup: a 3GB file created with dd, a phone with 12GB of RAM and UFS 4.0 storage, and stressapptest applying 4GB of memory pressure.

1. original
```shell
# create a model file
dd if=/dev/zero of=./model.txt bs=1M count=3072
# drop page cache
echo 3 > /proc/sys/vm/drop_caches
./dmabuf-heap-file-read mtk_mm-uncached normal

> result is total cost 13087213847ns

```

2. DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
```shell
# create a model file
dd if=/dev/zero of=./model.txt bs=1M count=3072
# drop page cache
echo 3 > /proc/sys/vm/drop_caches
./dmabuf-heap-file-read mtk_mm-uncached direct_io

> result is total cost 2902386846ns

# use direct_io_check to verify that the buffer content matches the file.
```

3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
```shell
# create a model file
dd if=/dev/zero of=./model.txt bs=1M count=3072
# drop page cache
echo 3 > /proc/sys/vm/drop_caches
./dmabuf-heap-file-read mtk_mm-uncached normal_io

> result is total cost 5735579385ns

```

>
> Perhaps simply returning the DMA-BUF file descriptor and then 
> implementing copy_file_range, while populating the memory and content 
> during the copy process, could achieve this? At present, it seems that 
> it will be quite complex - We need to ensure that only the returned 
> DMA-BUF file descriptor will fail in case of memory not fill, like 
> mmap, vmap, attach, and so on.
>
>>
>> What we probably could do is to internally optimize those.
>>
>>> I am currently creating a new ioctl to remind the user that memory 
>>> is being allocated and read, and I am also unsure
>>>
>>> whether it is appropriate to add additional parameters to the 
>>> existing allocate behavior.
>>>
>>> Please, give me more suggestion. Thanks.
>>>
>>>>
>>>> But IIRC there was a copy_file_range callback in the 
>>>> file_operations structure you could use for that. I'm just not sure 
>>>> when and how that's used with the copy_file_range() system call.
>>>
>>> Sorry, I'm not familiar with this, but I will look into it. However, 
>>> this type of callback function is not currently implemented when 
>>> exporting
>>>
>>> the dma_buf file, which means that I need to implement the callback 
>>> for it?
>>
>> If I'm not completely mistaken the copy_file_range, splice_read and 
>> splice_write callbacks on the struct file_operations 
>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>
>> Can be used to implement what you want to do.
> Yes.
>>
>> Regards,
>> Christian.
>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> Notice, file_fd depends on user how to open this file. So, both 
>>>>> buffer
>>>>> I/O and Direct I/O is supported.
>>>>>
>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>> ---
>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>> +++++++++++++++++++++++++++++++++-
>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>> @@ -15,9 +15,11 @@
>>>>>   #include <linux/list.h>
>>>>>   #include <linux/slab.h>
>>>>>   #include <linux/nospec.h>
>>>>> +#include <linux/highmem.h>
>>>>>   #include <linux/uaccess.h>
>>>>>   #include <linux/syscalls.h>
>>>>>   #include <linux/dma-heap.h>
>>>>> +#include <linux/vmalloc.h>
>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>     #define DEVNAME "dma_heap"
>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>       struct cdev heap_cdev;
>>>>>   };
>>>>>   +/**
>>>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>>>> allocate use.
>>>>> + * @file:        file to read from.
>>>>> + *
>>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>>> + *
>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>> match batch,
>>>>> + *            trigger read, default 128MB, must below file size.
>>>>> + *
>>>>> + * @fsz:        file size.
>>>>> + *
>>>>> + * @direct:        use direct IO?
>>>>> + */
>>>>> +struct dma_heap_file {
>>>>> +    struct file *file;
>>>>> +    struct cred *cred;
>>>>> +    size_t max_batch;
>>>>> +    size_t fsz;
>>>>> +    bool direct;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>>> real work.
>>>>> + * @vaddr:        contigous virtual address alloc by vmap, file 
>>>>> read need.
>>>>> + *
>>>>> + * @start_size:        file read start offset, same to 
>>>>> @dma_heap_file_task->roffset.
>>>>> + *
>>>>> + * @need_size:        file read need size, same to 
>>>>> @dma_heap_file_task->rsize.
>>>>> + *
>>>>> + * @heap_file:        file wrapper.
>>>>> + *
>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>> + *
>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, 
>>>>> put ref.
>>>>> + *
>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>> @dma_heap_file_task->fail.
>>>>> + */
>>>>> +struct dma_heap_file_work {
>>>>> +    void *vaddr;
>>>>> +    ssize_t start_size;
>>>>> +    ssize_t need_size;
>>>>> +    struct dma_heap_file *heap_file;
>>>>> +    struct list_head list;
>>>>> +    atomic_t *refp;
>>>>> +    bool *failp;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>>> process
>>>>> + * @ref:        current file work counter, if zero, allocate and 
>>>>> read
>>>>> + *            done.
>>>>> + *
>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>> begin file
>>>>> + *            start offset.
>>>>> + *
>>>>> + * @rsize:        current allocated page size use to read, if 
>>>>> reach rbatch,
>>>>> + *            trigger commit.
>>>>> + *
>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>> @dma_heap_file's
>>>>> + *            batch.
>>>>> + *
>>>>> + * @heap_file:        current dma_heap_file
>>>>> + *
>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>> batch's number
>>>>> + *            pages.(this is maximum). Due to single thread file 
>>>>> read,
>>>>> + *            one page array reuse each work prepare is OK.
>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>> + *
>>>>> + * @pindex:        current allocated page filled in @parray's index.
>>>>> + *
>>>>> + * @fail:        any work failed when file read?
>>>>> + *
>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>> prepare each work
>>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>>> trigger commit
>>>>> + * and prepare next work. After all batch queued, user going on 
>>>>> prepare dma_buf
>>>>> + * and so on, but before return dma_buf fd, need to wait file 
>>>>> read end and
>>>>> + * check read result.
>>>>> + */
>>>>> +struct dma_heap_file_task {
>>>>> +    atomic_t ref;
>>>>> +    size_t roffset;
>>>>> +    size_t rsize;
>>>>> +    size_t rbatch;
>>>>> +    struct dma_heap_file *heap_file;
>>>>> +    struct page **parray;
>>>>> +    unsigned int pindex;
>>>>> +    bool fail;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct dma_heap_file_control - global control of dma_heap file 
>>>>> read.
>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>> + *
>>>>> + * @lock:        only lock for @works.
>>>>> + *
>>>>> + * @threadwq:        wait queue for @work_thread, if commit work, 
>>>>> @work_thread
>>>>> + *            wakeup and read this work's file contains.
>>>>> + *
>>>>> + * @workwq:        used for main thread wait for file read end, 
>>>>> if allocation
>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>> effect this.
>>>>> + *
>>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>>> work's consumer.
>>>>> + *
>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>> alloc/free frequently.
>>>>> + *
>>>>> + * @nr_work:        global number of how many work committed.
>>>>> + */
>>>>> +struct dma_heap_file_control {
>>>>> +    struct list_head works;
>>>>> +    spinlock_t lock;
>>>>> +    wait_queue_head_t threadwq;
>>>>> +    wait_queue_head_t workwq;
>>>>> +    struct task_struct *work_thread;
>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>> +    atomic_t nr_work;
>>>>> +};
>>>>> +
>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>   static LIST_HEAD(heap_list);
>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>   static dev_t dma_heap_devt;
>>>>>   static struct class *dma_heap_class;
>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>   +/**
>>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>>> virtual address.
>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>> + *
>>>>> + * Cached pages need to trigger file read, this function map each 
>>>>> scatter page
>>>>> + * into contiguous virtual address, so that file read can easy use.
>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>> original user, so we
>>>>> + * will not effect dma-buf export even if file read not end.
>>>>> + */
>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>> *heap_ftask)
>>>>> +{
>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>> +            PAGE_KERNEL);
>>>>> +}
>>>>> +
>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask,
>>>>> +                struct page *page)
>>>>> +{
>>>>> +    struct page **array = heap_ftask->parray;
>>>>> +    int index = heap_ftask->pindex;
>>>>> +    int num = compound_nr(page), i;
>>>>> +    unsigned long sz = page_size(page);
>>>>> +
>>>>> +    heap_ftask->rsize += sz;
>>>>> +    for (i = 0; i < num; ++i)
>>>>> +        array[index++] = &page[i];
>>>>> +    heap_ftask->pindex = index;
>>>>> +
>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>> +}
>>>>> +
>>>>> +static struct dma_heap_file_work *
>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>> +{
>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>> +
>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>> +        return NULL;
>>>>> +
>>>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>>>> GFP_KERNEL);
>>>>> +    if (unlikely(!heap_fwork))
>>>>> +        return NULL;
>>>>> +
>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>> +        kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>> +        return NULL;
>>>>> +    }
>>>>> +
>>>>> +    heap_fwork->heap_file = heap_file;
>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>> +    return heap_fwork;
>>>>> +}
>>>>> +
>>>>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>>>>> +{
>>>>> +    vunmap(heap_fwork->vaddr);
>>>>> +    atomic_dec(heap_fwork->refp);
>>>>> +    wake_up(&heap_fctl->workwq);
>>>>> +
>>>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>> +}
>>>>> +
>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>>>>> +{
>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>> init_file_work(heap_ftask);
>>>>> +    struct page *last = NULL;
>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>> +    size_t start = heap_ftask->roffset;
>>>>> +    struct file *file = heap_file->file;
>>>>> +    size_t fsz = heap_file->fsz;
>>>>> +
>>>>> +    if (unlikely(!heap_fwork))
>>>>> +        return -ENOMEM;
>>>>> +
>>>>> +    /**
>>>>> +     * If file size is not page aligned, direct io can't process 
>>>>> the tail.
>>>>> +     * So, if reach to tail, remain the last page use buffer read.
>>>>> +     */
>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>> +    }
>>>>> +
>>>>> +    spin_lock(&heap_fctl->lock);
>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>> +
>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>> +
>>>>> +    if (last) {
>>>>> +        char *buf, *pathp;
>>>>> +        ssize_t err;
>>>>> +        void *buffer;
>>>>> +
>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>> +        if (unlikely(!buf))
>>>>> +            return -ENOMEM;
>>>>> +
>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>> +
>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>> +        if (IS_ERR(pathp)) {
>>>>> +            kfree(buf);
>>>>> +            return PTR_ERR(pathp);
>>>>> +        }
>>>>> +
>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>> +                         fsz - start, &fsz,
>>>>> +                         READING_POLICY);
>>>>> +        kunmap_local(buffer);
>>>>> +        kfree(buf);
>>>>> +        if (err < 0) {
>>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>> +                   pathp, err, start, fsz, fsz);
>>>>> +
>>>>> +            return err;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>> +    heap_ftask->rsize = 0;
>>>>> +    heap_ftask->pindex = 0;
>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>> +                   heap_ftask->rbatch);
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask)
>>>>> +{
>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>>>> +    return heap_ftask->fail;
>>>>> +}
>>>>> +
>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask)
>>>>> +{
>>>>> +    bool fail;
>>>>> +
>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>> +    fail = heap_ftask->fail;
>>>>> +    kvfree(heap_ftask->parray);
>>>>> +    kfree(heap_ftask);
>>>>> +    return fail;
>>>>> +}
>>>>> +
>>>>> +struct dma_heap_file_task *
>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>> +{
>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>> +    if (unlikely(!heap_ftask))
>>>>> +        return NULL;
>>>>> +
>>>>> +    /**
>>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>>> +     * So, direct alloc this number's page array is OK.
>>>>> +     */
>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>>>> PAGE_SHIFT,
>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>> +        goto put;
>>>>> +
>>>>> +    heap_ftask->heap_file = heap_file;
>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>> +    return heap_ftask;
>>>>> +put:
>>>>> +    kfree(heap_ftask);
>>>>> +    return NULL;
>>>>> +}
>>>>> +
>>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>>> +{
>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>> +    struct file *file = heap_file->file;
>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>> +    const struct cred *old_cred;
>>>>> +    ssize_t err;
>>>>> +
>>>>> +    // use real task's cred to read this file.
>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>> &heap_file->fsz,
>>>>> +                   READING_POLICY);
>>>>> +    if (err < 0) {
>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>> f_sz=%ld\n",
>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>> +    }
>>>>> +    // recovery to my cred.
>>>>> +    revert_creds(old_cred);
>>>>> +}
>>>>> +
>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>> +{
>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>> +        (struct dma_heap_file_control *)data;
>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>> +    int nr_work;
>>>>> +
>>>>> +    LIST_HEAD(pages);
>>>>> +    LIST_HEAD(workers);
>>>>> +
>>>>> +    while (true) {
>>>>> +        wait_event_freezable(heap_fctl->threadwq,
>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>> +recheck:
>>>>> +        spin_lock(&heap_fctl->lock);
>>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>> +
>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>> +                list_del(&worker->list);
>>>>> +                destroy_file_work(worker);
>>>>> +            }
>>>>> +            break;
>>>>> +        }
>>>>> +
>>>>> +        nr_work = 0;
>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>> +            ++nr_work;
>>>>> +            list_del(&worker->list);
>>>>> +            __work_this_io(worker);
>>>>> +
>>>>> +            destroy_file_work(worker);
>>>>> +        }
>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>> +
>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>> +            goto recheck;
>>>>> +    }
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>> +{
>>>>> +    return heap_file->fsz;
>>>>> +}
>>>>> +
>>>>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, 
>>>>> int file_fd,
>>>>> +                 size_t batch)
>>>>> +{
>>>>> +    struct file *file;
>>>>> +    size_t fsz;
>>>>> +    int ret;
>>>>> +
>>>>> +    file = fget(file_fd);
>>>>> +    if (!file)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    fsz = i_size_read(file_inode(file));
>>>>> +    if (fsz < batch) {
>>>>> +        ret = -EINVAL;
>>>>> +        goto err;
>>>>> +    }
>>>>> +
>>>>> +    /**
>>>>> +     * Selinux block our read, but actually we are reading the 
>>>>> stand-in
>>>>> +     * for this file.
>>>>> +     * So save current's cred and when going to read, override 
>>>>> mine, and
>>>>> +     * end of read, revert.
>>>>> +     */
>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>> +        ret = -ENOMEM;
>>>>> +        goto err;
>>>>> +    }
>>>>> +
>>>>> +    heap_file->file = file;
>>>>> +    heap_file->max_batch = batch;
>>>>> +    heap_file->fsz = fsz;
>>>>> +
>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>> +
>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>> +    if (!heap_file->direct && fsz >= 
>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>>>> larget file\n");
>>>>> +
>>>>> +    return 0;
>>>>> +
>>>>> +err:
>>>>> +    fput(file);
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>>>> +{
>>>>> +    fput(heap_file->file);
>>>>> +    put_cred(heap_file->cred);
>>>>> +}
>>>>> +
>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, 
>>>>> int file_fd,
>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>> +                       unsigned int heap_flags)
>>>>> +{
>>>>> +    struct dma_buf *dmabuf;
>>>>> +    int fd;
>>>>> +    struct dma_heap_file heap_file;
>>>>> +
>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>> +    if (fd)
>>>>> +        goto error_file;
>>>>> +
>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>> fd_flags,
>>>>> +                           heap_flags);
>>>>> +    if (IS_ERR(dmabuf)) {
>>>>> +        fd = PTR_ERR(dmabuf);
>>>>> +        goto error;
>>>>> +    }
>>>>> +
>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>> +    if (fd < 0) {
>>>>> +        dma_buf_put(dmabuf);
>>>>> +        /* just return, as put will call release and that will 
>>>>> free */
>>>>> +    }
>>>>> +
>>>>> +error:
>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>> +error_file:
>>>>> +    return fd;
>>>>> +}
>>>>> +
>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>>>                    u32 fd_flags,
>>>>>                    u64 heap_flags)
>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>>>>> struct file *file)
>>>>>       return 0;
>>>>>   }
>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>> *file, void *data)
>>>>> +{
>>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file = 
>>>>> data;
>>>>> +    struct dma_heap *heap = file->private_data;
>>>>> +    int fd;
>>>>> +
>>>>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (!heap->ops->allocate_read_file)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>> +        heap, heap_allocation_file->file_fd,
>>>>> +        heap_allocation_file->batch ?
>>>>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>>>>> +            DEFAULT_ADI_BATCH,
>>>>> +        heap_allocation_file->fd_flags,
>>>>> +        heap_allocation_file->heap_flags);
>>>>> +    if (fd < 0)
>>>>> +        return fd;
>>>>> +
>>>>> +    heap_allocation_file->fd = fd;
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>>   {
>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct 
>>>>> file *file, void *data)
>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>   };
>>>>>     static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>>>>> unsigned int ucmd,
>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>           break;
>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>> +        break;
>>>>>       default:
>>>>>           ret = -ENOTTY;
>>>>>           goto err;
>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>> -        return PTR_ERR(dma_heap_class);
>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>> +        goto fail_class;
>>>>>       }
>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>> +    if (unlikely(!heap_fctl)) {
>>>>> +        ret =  -ENOMEM;
>>>>> +        goto fail_alloc;
>>>>> +    }
>>>>> +
>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>>> +
>>>>> +    heap_fctl->work_thread = 
>>>>> kthread_run(dma_heap_file_control_thread,
>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>> +        ret = -ENOMEM;
>>>>> +        goto fail_thread;
>>>>> +    }
>>>>> +
>>>>> +    heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 
>>>>> 0);
>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>> +        ret = -ENOMEM;
>>>>> +        goto fail_cache;
>>>>> +    }
>>>>> +
>>>>>       return 0;
>>>>> +
>>>>> +fail_cache:
>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>> +fail_thread:
>>>>> +    kfree(heap_fctl);
>>>>> +fail_alloc:
>>>>> +    class_destroy(dma_heap_class);
>>>>> +fail_class:
>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>> +    return ret;
>>>>>   }
>>>>>   subsys_initcall(dma_heap_init);
>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>> index 064bad725061..9c25383f816c 100644
>>>>> --- a/include/linux/dma-heap.h
>>>>> +++ b/include/linux/dma-heap.h
>>>>> @@ -12,12 +12,17 @@
>>>>>   #include <linux/cdev.h>
>>>>>   #include <linux/types.h>
>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>> +
>>>>>   struct dma_heap;
>>>>> +struct dma_heap_file_task;
>>>>> +struct dma_heap_file;
>>>>>     /**
>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>    * @allocate:        allocate dmabuf and return struct dma_buf ptr
>>>>> - *
>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>> return struct
>>>>> + * dma_buf ptr.
>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>>    */
>>>>>   struct dma_heap_ops {
>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>                       unsigned long len,
>>>>>                       u32 fd_flags,
>>>>>                       u64 heap_flags);
>>>>> +
>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>>> +                          struct dma_heap_file *heap_file,
>>>>> +                          u32 fd_flags,
>>>>> +                          u64 heap_flags);
>>>>>   };
>>>>>     /**
>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>>>> *heap);
>>>>>    */
>>>>>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>>>>> *exp_info);
>>>>>   +/**
>>>>> + * dma_heap_destroy_file_read - waits for a file read to complete 
>>>>> then destroy it
>>>>> + * Returns: true if the file read failed, false otherwise
>>>>> + */
>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask);
>>>>> +
>>>>> +/**
>>>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>>>> + * Returns: true if the file read failed, false otherwise
>>>>> + */
>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask);
>>>>> +
>>>>> +/**
>>>>> + * dma_heap_declare_file_read - Declare a task to read file when 
>>>>> allocate pages.
>>>>> + * @heap_file:        target file to read
>>>>> + *
>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>> + */
>>>>> +struct dma_heap_file_task *
>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>> +
>>>>> +/**
>>>>> + * dma_heap_prepare_file_read - cache each allocated page until 
>>>>> we meet this batch.
>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>> + * @page:        current allocated page. don't care which order.
>>>>> + *
>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>> + */
>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask,
>>>>> +                struct page *page);
>>>>> +
>>>>> +/**
>>>>> + * dma_heap_submit_file_read -  prepare collect enough memory, 
>>>>> going to trigger IO
>>>>> + * @heap_ftask:            info that current IO needs
>>>>> + *
>>>>> + * This commit will also check if reach to tail read.
>>>>> + * For direct I/O submissions, it is necessary to pay attention 
>>>>> to file reads
>>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>>> read, buffer IO
>>>>> + * needs to be triggered.
>>>>> + * Returns:
>>>>> + *   0 if all right, -errno if something wrong
>>>>> + */
>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>> *heap_ftask);
>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>> +
>>>>>   #endif /* _DMA_HEAPS_H */
>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>> b/include/uapi/linux/dma-heap.h
>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>       __u64 heap_flags;
>>>>>   };
>>>>>   +/**
>>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>>> userspace for
>>>>> + *                                      allocations and read file
>>>>> + * @fd:            will be populated with a fd which provides the
>>>>> + *            handle to the allocated dma-buf
>>>>> + * @file_fd:        file descriptor to read from(suggested to use 
>>>>> O_DIRECT open file)
>>>>> + * @batch:        how many memory alloced then file read(bytes), 
>>>>> default 128MB
>>>>> + *            will auto aligned to PAGE_SIZE
>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>> + * @heap_flags:        flags passed to heap
>>>>> + *
>>>>> + * Provided by userspace as an argument to the ioctl
>>>>> + */
>>>>> +struct dma_heap_allocation_file_data {
>>>>> +    __u32 fd;
>>>>> +    __u32 file_fd;
>>>>> +    __u32 batch;
>>>>> +    __u32 fd_flags;
>>>>> +    __u64 heap_flags;
>>>>> +};
>>>>> +
>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>     /**
>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>                         struct dma_heap_allocation_data)
>>>>>   +/**
>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool 
>>>>> and both
>>>>> + *                    read file when allocate memory.
>>>>> + *
>>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>>> with the fd field
>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>> return, the dma-buf
>>>>> + * content is read from file.
>>>>> + */
>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>> dma_heap_allocation_file_data)
>>>>> +
>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>
>>
Christian König July 12, 2024, 7:10 a.m. UTC | #6
Am 12.07.24 um 04:14 schrieb Huan Yang:
> 在 2024/7/12 9:59, Huan Yang 写道:
>> Hi Christian,
>>
>> 在 2024/7/11 19:39, Christian König 写道:
>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>> Hi Christian,
>>>>
>>>> Thanks for your reply.
>>>>
>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>> Some user may need load file into dma-buf, current
>>>>>> way is:
>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>> This is too heavy if fsz reached to GB.
>>>>>
>>>>> You need to describe a bit more why that is too heavy. I can only 
>>>>> assume you need to save memory bandwidth and avoid the extra copy 
>>>>> with the CPU.
>>>>
>>>> Sorry for the oversimplified explanation. But, yes, you're right, 
>>>> we want to avoid this.
>>>>
>>>> As we are dealing with embedded devices, the available memory and 
>>>> computing power for users are usually limited. (The maximum 
>>>> available memory is currently 24GB, typically ranging from 8-12GB.)
>>>>
>>>> Also, the CPU computing power is also usually in short supply, due 
>>>> to limited battery capacity and limited heat dissipation capabilities.
>>>>
>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>
>>>>>
>>>>>> This patch implement a feature called 
>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>> User need to offer a file_fd which you want to load into dma-buf, 
>>>>>> then,
>>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>>> content.
>>>>>
>>>>> Interesting idea, that has at least more potential than trying to 
>>>>> enable direct I/O on mmap()ed DMA-bufs.
>>>>>
>>>>> The approach with the new IOCTL might not work because it is a 
>>>>> very specialized use case.
>>>>
>>>> Thank you for your advice. maybe the "read file" behavior can be 
>>>> attached to an existing allocation?
>>>
>>> The point is there are already system calls to do something like that.
>>>
>>> See copy_file_range() 
>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) and 
>>> sendfile() (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>
>> That's helpful to know, thanks.
>>
>> In terms of only DMA-BUF supporting direct I/O, 
>> copy_file_range/sendfile may help to achieve this functionality.
>>
>> However, my patchset also aims to achieve parallel copying of file 
>> contents while allocating the DMA-BUF, which is something that the 
>> current set of calls may not be able to accomplish.

And exactly that is a no-go. Use the existing IOCTLs and system calls 
instead; they should have similar performance when done right.

Regards,
Christian.

>
> As you can see in the cover letter, here is a comparison between the 
> normal test and this IOCTL under memory pressure; even buffered I/O 
> with this ioctl gets about a 50% improvement from the parallelism.
>
> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G memory 
> pressure.
>
> 1. original
> ```shell
> # create a model file
> dd if=/dev/zero of=./model.txt bs=1M count=3072
> # drop page cache
> echo 3 > /proc/sys/vm/drop_caches
> ./dmabuf-heap-file-read mtk_mm-uncached normal
>
>> result is total cost 13087213847ns
>
> ```
>
> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
> ```shell
> # create a model file
> dd if=/dev/zero of=./model.txt bs=1M count=3072
> # drop page cache
> echo 3 > /proc/sys/vm/drop_caches
> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>
>> result is total cost 2902386846ns
>
> # use direct_io_check can check the content if is same to file.
> ```
>
> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
> ```shell
> # create a model file
> dd if=/dev/zero of=./model.txt bs=1M count=3072
> # drop page cache
> echo 3 > /proc/sys/vm/drop_caches
> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>
>> result is total cost 5735579385ns
>
> ```
>
>>
>> Perhaps simply returning the DMA-BUF file descriptor and then 
>> implementing copy_file_range, while populating the memory and content 
>> during the copy process, could achieve this? At present, it seems 
>> that it will be quite complex - we need to ensure that operations on 
>> the returned DMA-BUF file descriptor (mmap, vmap, attach, and so on) 
>> fail if the memory has not been filled yet.
>>
>>>
>>> What we probably could do is to internally optimize those.
>>>
>>>> I am currently creating a new ioctl to remind the user that memory 
>>>> is being allocated and read, and I am also unsure whether it is 
>>>> appropriate to add additional parameters to the existing allocate 
>>>> behavior.
>>>>
>>>> Please, give me more suggestion. Thanks.
>>>>
>>>>>
>>>>> But IIRC there was a copy_file_range callback in the 
>>>>> file_operations structure you could use for that. I'm just not 
>>>>> sure when and how that's used with the copy_file_range() system call.
>>>>
>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>> However, this type of callback function is not currently 
>>>> implemented when exporting the dma_buf file, which means that I 
>>>> need to implement the callback for it?
>>>
>>> If I'm not completely mistaken, the copy_file_range, splice_read and 
>>> splice_write callbacks on the struct file_operations 
>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999) 
>>> can be used to implement what you want to do.
>> Yes.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Notice, file_fd depends on user how to open this file. So, both 
>>>>>> buffer
>>>>>> I/O and Direct I/O is supported.
>>>>>>
>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>> ---
>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>> @@ -15,9 +15,11 @@
>>>>>>   #include <linux/list.h>
>>>>>>   #include <linux/slab.h>
>>>>>>   #include <linux/nospec.h>
>>>>>> +#include <linux/highmem.h>
>>>>>>   #include <linux/uaccess.h>
>>>>>>   #include <linux/syscalls.h>
>>>>>>   #include <linux/dma-heap.h>
>>>>>> +#include <linux/vmalloc.h>
>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>     #define DEVNAME "dma_heap"
>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>       struct cdev heap_cdev;
>>>>>>   };
>>>>>>   +/**
>>>>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>>>>> allocate use.
>>>>>> + * @file:        file to read from.
>>>>>> + *
>>>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>>>> + *
>>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>>> match batch,
>>>>>> + *            trigger read, default 128MB, must below file size.
>>>>>> + *
>>>>>> + * @fsz:        file size.
>>>>>> + *
>>>>>> + * @direct:        use direct IO?
>>>>>> + */
>>>>>> +struct dma_heap_file {
>>>>>> +    struct file *file;
>>>>>> +    struct cred *cred;
>>>>>> +    size_t max_batch;
>>>>>> +    size_t fsz;
>>>>>> +    bool direct;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>>>> real work.
>>>>>> + * @vaddr:        contiguous virtual address alloc by vmap, file 
>>>>>> read need.
>>>>>> + *
>>>>>> + * @start_size:        file read start offset, same to 
>>>>>> @dma_heap_file_task->roffset.
>>>>>> + *
>>>>>> + * @need_size:        file read need size, same to 
>>>>>> @dma_heap_file_task->rsize.
>>>>>> + *
>>>>>> + * @heap_file:        file wrapper.
>>>>>> + *
>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>> + *
>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, 
>>>>>> put ref.
>>>>>> + *
>>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>>> @dma_heap_file_task->fail.
>>>>>> + */
>>>>>> +struct dma_heap_file_work {
>>>>>> +    void *vaddr;
>>>>>> +    ssize_t start_size;
>>>>>> +    ssize_t need_size;
>>>>>> +    struct dma_heap_file *heap_file;
>>>>>> +    struct list_head list;
>>>>>> +    atomic_t *refp;
>>>>>> +    bool *failp;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>>>> process
>>>>>> + * @ref:        current file work counter, if zero, allocate and 
>>>>>> read
>>>>>> + *            done.
>>>>>> + *
>>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>>> begin file
>>>>>> + *            start offset.
>>>>>> + *
>>>>>> + * @rsize:        current allocated page size use to read, if 
>>>>>> reach rbatch,
>>>>>> + *            trigger commit.
>>>>>> + *
>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>> @dma_heap_file's
>>>>>> + *            batch.
>>>>>> + *
>>>>>> + * @heap_file:        current dma_heap_file
>>>>>> + *
>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>> batch's number
>>>>>> + *            pages.(this is maximum). Due to single thread file 
>>>>>> read,
>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>> + *
>>>>>> + * @pindex:        current allocated page filled in @parray's 
>>>>>> index.
>>>>>> + *
>>>>>> + * @fail:        any work failed when file read?
>>>>>> + *
>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>> prepare each work
>>>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>>>> trigger commit
>>>>>> + * and prepare next work. After all batch queued, user going on 
>>>>>> prepare dma_buf
>>>>>> + * and so on, but before return dma_buf fd, need to wait file 
>>>>>> read end and
>>>>>> + * check read result.
>>>>>> + */
>>>>>> +struct dma_heap_file_task {
>>>>>> +    atomic_t ref;
>>>>>> +    size_t roffset;
>>>>>> +    size_t rsize;
>>>>>> +    size_t rbatch;
>>>>>> +    struct dma_heap_file *heap_file;
>>>>>> +    struct page **parray;
>>>>>> +    unsigned int pindex;
>>>>>> +    bool fail;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct dma_heap_file_control - global control of dma_heap 
>>>>>> file read.
>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>> + *
>>>>>> + * @lock:        only lock for @works.
>>>>>> + *
>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>> work, @work_thread
>>>>>> + *            wakeup and read this work's file contains.
>>>>>> + *
>>>>>> + * @workwq:        used for main thread wait for file read end, 
>>>>>> if allocation
>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>> effect this.
>>>>>> + *
>>>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>>>> work's consumer.
>>>>>> + *
>>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>>> alloc/free frequently.
>>>>>> + *
>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>> + */
>>>>>> +struct dma_heap_file_control {
>>>>>> +    struct list_head works;
>>>>>> +    spinlock_t lock;
>>>>>> +    wait_queue_head_t threadwq;
>>>>>> +    wait_queue_head_t workwq;
>>>>>> +    struct task_struct *work_thread;
>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>> +    atomic_t nr_work;
>>>>>> +};
>>>>>> +
>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>   static LIST_HEAD(heap_list);
>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>   static dev_t dma_heap_devt;
>>>>>>   static struct class *dma_heap_class;
>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>   +/**
>>>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>>>> virtual address.
>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>> + *
>>>>>> + * Cached pages need to trigger file read, this function map 
>>>>>> each scatter page
>>>>>> + * into contiguous virtual address, so that file read can easy use.
>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>> original user, so we
>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>> + */
>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>> *heap_ftask)
>>>>>> +{
>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>>> +            PAGE_KERNEL);
>>>>>> +}
>>>>>> +
>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask,
>>>>>> +                struct page *page)
>>>>>> +{
>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>> +    int index = heap_ftask->pindex;
>>>>>> +    int num = compound_nr(page), i;
>>>>>> +    unsigned long sz = page_size(page);
>>>>>> +
>>>>>> +    heap_ftask->rsize += sz;
>>>>>> +    for (i = 0; i < num; ++i)
>>>>>> +        array[index++] = &page[i];
>>>>>> +    heap_ftask->pindex = index;
>>>>>> +
>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>> +}
>>>>>> +
>>>>>> +static struct dma_heap_file_work *
>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>> +{
>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>> +
>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>> +        return NULL;
>>>>>> +
>>>>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>>>>> GFP_KERNEL);
>>>>>> +    if (unlikely(!heap_fwork))
>>>>>> +        return NULL;
>>>>>> +
>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>> +        return NULL;
>>>>>> +    }
>>>>>> +
>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>> +    return heap_fwork;
>>>>>> +}
>>>>>> +
>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>> *heap_fwork)
>>>>>> +{
>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>> +
>>>>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>> +}
>>>>>> +
>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask)
>>>>>> +{
>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>> init_file_work(heap_ftask);
>>>>>> +    struct page *last = NULL;
>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>> +    struct file *file = heap_file->file;
>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>> +
>>>>>> +    if (unlikely(!heap_fwork))
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    /**
>>>>>> +     * If file size is not page aligned, direct io can't process 
>>>>>> the tail.
>>>>>> +     * So, if reach to tail, remain the last page use buffer read.
>>>>>> +     */
>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>> +    }
>>>>>> +
>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>> +
>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>> +
>>>>>> +    if (last) {
>>>>>> +        char *buf, *pathp;
>>>>>> +        ssize_t err;
>>>>>> +        void *buffer;
>>>>>> +
>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>> +        if (unlikely(!buf))
>>>>>> +            return -ENOMEM;
>>>>>> +
>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>> +
>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>> +        if (IS_ERR(pathp)) {
>>>>>> +            kfree(buf);
>>>>>> +            return PTR_ERR(pathp);
>>>>>> +        }
>>>>>> +
>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>>> +                         fsz - start, &fsz,
>>>>>> +                         READING_POLICY);
>>>>>> +        kunmap_local(buffer);
>>>>>> +        kfree(buf);
>>>>>> +        if (err < 0) {
>>>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>> +
>>>>>> +            return err;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>> +    heap_ftask->rsize = 0;
>>>>>> +    heap_ftask->pindex = 0;
>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>> +                   heap_ftask->rbatch);
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask)
>>>>>> +{
>>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>>>>> +    return heap_ftask->fail;
>>>>>> +}
>>>>>> +
>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask)
>>>>>> +{
>>>>>> +    bool fail;
>>>>>> +
>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>> +    fail = heap_ftask->fail;
>>>>>> +    kvfree(heap_ftask->parray);
>>>>>> +    kfree(heap_ftask);
>>>>>> +    return fail;
>>>>>> +}
>>>>>> +
>>>>>> +struct dma_heap_file_task *
>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>> +{
>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>> +    if (unlikely(!heap_ftask))
>>>>>> +        return NULL;
>>>>>> +
>>>>>> +    /**
>>>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>> +     */
>>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>>>>> PAGE_SHIFT,
>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>> +        goto put;
>>>>>> +
>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>> +    return heap_ftask;
>>>>>> +put:
>>>>>> +    kfree(heap_ftask);
>>>>>> +    return NULL;
>>>>>> +}
>>>>>> +
>>>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>>>> +{
>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>> +    struct file *file = heap_file->file;
>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>> +    const struct cred *old_cred;
>>>>>> +    ssize_t err;
>>>>>> +
>>>>>> +    // use real task's cred to read this file.
>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>> &heap_file->fsz,
>>>>>> +                   READING_POLICY);
>>>>>> +    if (err < 0) {
>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>> f_sz=%ld\n",
>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>> +    }
>>>>>> +    // recovery to my cred.
>>>>>> +    revert_creds(old_cred);
>>>>>> +}
>>>>>> +
>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>> +{
>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>> +    int nr_work;
>>>>>> +
>>>>>> +    LIST_HEAD(pages);
>>>>>> +    LIST_HEAD(workers);
>>>>>> +
>>>>>> +    while (true) {
>>>>>> +        wait_event_freezable(heap_fctl->threadwq,
>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>> +recheck:
>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>> +
>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>> +                list_del(&worker->list);
>>>>>> +                destroy_file_work(worker);
>>>>>> +            }
>>>>>> +            break;
>>>>>> +        }
>>>>>> +
>>>>>> +        nr_work = 0;
>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>> +            ++nr_work;
>>>>>> +            list_del(&worker->list);
>>>>>> +            __work_this_io(worker);
>>>>>> +
>>>>>> +            destroy_file_work(worker);
>>>>>> +        }
>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>> +
>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>> +            goto recheck;
>>>>>> +    }
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>> +{
>>>>>> +    return heap_file->fsz;
>>>>>> +}
>>>>>> +
>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>> *heap_file, int file_fd,
>>>>>> +                 size_t batch)
>>>>>> +{
>>>>>> +    struct file *file;
>>>>>> +    size_t fsz;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    file = fget(file_fd);
>>>>>> +    if (!file)
>>>>>> +        return -EINVAL;
>>>>>> +
>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>> +    if (fsz < batch) {
>>>>>> +        ret = -EINVAL;
>>>>>> +        goto err;
>>>>>> +    }
>>>>>> +
>>>>>> +    /**
>>>>>> +     * Selinux block our read, but actually we are reading the 
>>>>>> stand-in
>>>>>> +     * for this file.
>>>>>> +     * So save current's cred and when going to read, override 
>>>>>> mine, and
>>>>>> +     * end of read, revert.
>>>>>> +     */
>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>> +        ret = -ENOMEM;
>>>>>> +        goto err;
>>>>>> +    }
>>>>>> +
>>>>>> +    heap_file->file = file;
>>>>>> +    heap_file->max_batch = batch;
>>>>>> +    heap_file->fsz = fsz;
>>>>>> +
>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>> +
>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>>>>> larget file\n");
>>>>>> +
>>>>>> +    return 0;
>>>>>> +
>>>>>> +err:
>>>>>> +    fput(file);
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>>>>> +{
>>>>>> +    fput(heap_file->file);
>>>>>> +    put_cred(heap_file->cred);
>>>>>> +}
>>>>>> +
>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>> *heap, int file_fd,
>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>> +                       unsigned int heap_flags)
>>>>>> +{
>>>>>> +    struct dma_buf *dmabuf;
>>>>>> +    int fd;
>>>>>> +    struct dma_heap_file heap_file;
>>>>>> +
>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>> +    if (fd)
>>>>>> +        goto error_file;
>>>>>> +
>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>>> fd_flags,
>>>>>> +                           heap_flags);
>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>> +        goto error;
>>>>>> +    }
>>>>>> +
>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>> +    if (fd < 0) {
>>>>>> +        dma_buf_put(dmabuf);
>>>>>> +        /* just return, as put will call release and that will 
>>>>>> free */
>>>>>> +    }
>>>>>> +
>>>>>> +error:
>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>> +error_file:
>>>>>> +    return fd;
>>>>>> +}
>>>>>> +
>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t 
>>>>>> len,
>>>>>>                    u32 fd_flags,
>>>>>>                    u64 heap_flags)
>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>>>>>> struct file *file)
>>>>>>       return 0;
>>>>>>   }
>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>> *file, void *data)
>>>>>> +{
>>>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file = 
>>>>>> data;
>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>> +    int fd;
>>>>>> +
>>>>>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>>>>> +        return -EINVAL;
>>>>>> +
>>>>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>> +        return -EINVAL;
>>>>>> +
>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>> +        return -EINVAL;
>>>>>> +
>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>> +        return -EINVAL;
>>>>>> +
>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>> +        heap_allocation_file->batch ?
>>>>>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>> +        heap_allocation_file->fd_flags,
>>>>>> +        heap_allocation_file->heap_flags);
>>>>>> +    if (fd < 0)
>>>>>> +        return fd;
>>>>>> +
>>>>>> +    heap_allocation_file->fd = fd;
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>>>   {
>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct 
>>>>>> file *file, void *data)
>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>   };
>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>>>>>> unsigned int ucmd,
>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>           break;
>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>> +        break;
>>>>>>       default:
>>>>>>           ret = -ENOTTY;
>>>>>>           goto err;
>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>> +        goto fail_class;
>>>>>>       }
>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>> +        ret =  -ENOMEM;
>>>>>> +        goto fail_alloc;
>>>>>> +    }
>>>>>> +
>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>>>> +
>>>>>> +    heap_fctl->work_thread = 
>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>> +        ret = -ENOMEM;
>>>>>> +        goto fail_thread;
>>>>>> +    }
>>>>>> +
>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>> +        ret = -ENOMEM;
>>>>>> +        goto fail_cache;
>>>>>> +    }
>>>>>> +
>>>>>>       return 0;
>>>>>> +
>>>>>> +fail_cache:
>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>> +fail_thread:
>>>>>> +    kfree(heap_fctl);
>>>>>> +fail_alloc:
>>>>>> +    class_destroy(dma_heap_class);
>>>>>> +fail_class:
>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>> +    return ret;
>>>>>>   }
>>>>>>   subsys_initcall(dma_heap_init);
>>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>> --- a/include/linux/dma-heap.h
>>>>>> +++ b/include/linux/dma-heap.h
>>>>>> @@ -12,12 +12,17 @@
>>>>>>   #include <linux/cdev.h>
>>>>>>   #include <linux/types.h>
>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>> +
>>>>>>   struct dma_heap;
>>>>>> +struct dma_heap_file_task;
>>>>>> +struct dma_heap_file;
>>>>>>     /**
>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>    * @allocate:        allocate dmabuf and return struct dma_buf ptr
>>>>>> - *
>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>> return struct
>>>>>> + * dma_buf ptr.
>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>>>    */
>>>>>>   struct dma_heap_ops {
>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>                       unsigned long len,
>>>>>>                       u32 fd_flags,
>>>>>>                       u64 heap_flags);
>>>>>> +
>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>> +                          u32 fd_flags,
>>>>>> +                          u64 heap_flags);
>>>>>>   };
>>>>>>     /**
>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>>>>> *heap);
>>>>>>    */
>>>>>>   struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>>>>>> *exp_info);
>>>>>>   +/**
>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>> complete then destroy it
>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>> + */
>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask);
>>>>>> +
>>>>>> +/**
>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>> + */
>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask);
>>>>>> +
>>>>>> +/**
>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>>>>> allocate pages.
>>>>>> + * @heap_file:        target file to read
>>>>>> + *
>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>> + */
>>>>>> +struct dma_heap_file_task *
>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>> +
>>>>>> +/**
>>>>>> + * dma_heap_prepare_file_read - cache each allocated page until 
>>>>>> we meet this batch.
>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>> + * @page:        current allocated page. don't care which order.
>>>>>> + *
>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>> + */
>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask,
>>>>>> +                struct page *page);
>>>>>> +
>>>>>> +/**
>>>>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>>>>> going to trigger IO
>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>> + *
>>>>>> + * This commit will also check if reach to tail read.
>>>>>> + * For direct I/O submissions, it is necessary to pay attention 
>>>>>> to file reads
>>>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>>>> read, buffer IO
>>>>>> + * needs to be triggered.
>>>>>> + * Returns:
>>>>>> + *   0 if all right, -errno if something wrong
>>>>>> + */
>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>> *heap_ftask);
>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>> +
>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>       __u64 heap_flags;
>>>>>>   };
>>>>>>   +/**
>>>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>>>> userspace for
>>>>>> + *                                      allocations and read file
>>>>>> + * @fd:            will be populated with a fd which provides the
>>>>>> + *            handle to the allocated dma-buf
>>>>>> + * @file_fd:        file descriptor to read from(suggested to 
>>>>>> use O_DIRECT open file)
>>>>>> + * @batch:        how many memory alloced then file read(bytes), 
>>>>>> default 128MB
>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>>> + * @heap_flags:        flags passed to heap
>>>>>> + *
>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>> + */
>>>>>> +struct dma_heap_allocation_file_data {
>>>>>> +    __u32 fd;
>>>>>> +    __u32 file_fd;
>>>>>> +    __u32 batch;
>>>>>> +    __u32 fd_flags;
>>>>>> +    __u64 heap_flags;
>>>>>> +};
>>>>>> +
>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>     /**
>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>                         struct dma_heap_allocation_data)
>>>>>>   +/**
>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from 
>>>>>> pool and both
>>>>>> + *                    read file when allocate memory.
>>>>>> + *
>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>>>> with the fd field
>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>> return, the dma-buf
>>>>>> + * content is read from file.
>>>>>> + */
>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>> dma_heap_allocation_file_data)
>>>>>> +
>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>
>>>
Huan Yang July 12, 2024, 7:29 a.m. UTC | #7
Hi Christian,

在 2024/7/12 15:10, Christian König 写道:
> Am 12.07.24 um 04:14 schrieb Huan Yang:
>> 在 2024/7/12 9:59, Huan Yang 写道:
>>> Hi Christian,
>>>
>>> 在 2024/7/11 19:39, Christian König 写道:
>>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>>> Hi Christian,
>>>>>
>>>>> Thanks for your reply.
>>>>>
>>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>> Some user may need load file into dma-buf, current
>>>>>>> way is:
>>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>>> This is too heavy if fsz reached to GB.
>>>>>>
>>>>>> You need to describe a bit more why that is to heavy. I can only 
>>>>>> assume you need to save memory bandwidth and avoid the extra copy 
>>>>>> with the CPU.
>>>>>
>>>>> Sorry for the oversimplified explanation. But, yes, you're right, 
>>>>> we want to avoid this.
>>>>>
>>>>> As we are dealing with embedded devices, the available memory and 
>>>>> computing power for users are usually limited.(The maximum 
>>>>> available memory is currently
>>>>>
>>>>> 24GB, typically ranging from 8-12GB. )
>>>>>
>>>>> Also, the CPU computing power is also usually in short supply, due 
>>>>> to limited battery capacity and limited heat dissipation 
>>>>> capabilities.
>>>>>
>>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>>
>>>>>>
>>>>>>> This patch implement a feature called 
>>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>> User need to offer a file_fd which you want to load into 
>>>>>>> dma-buf, then,
>>>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>>>> content.
>>>>>>
>>>>>> Interesting idea, that has at least more potential than trying to 
>>>>>> enable direct I/O on mmap()ed DMA-bufs.
>>>>>>
>>>>>> The approach with the new IOCTL might not work because it is a 
>>>>>> very specialized use case.
>>>>>
>>>>> Thank you for your advice. maybe the "read file" behavior can be 
>>>>> attached to an existing allocation?
>>>>
>>>> The point is there are already system calls to do something like that.
>>>>
>>>> See copy_file_range() 
>>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
>>>> and send_file() 
>>>> (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>>
>>> That's helpfull to learn it, thanks.
>>>
>>> In terms of only DMA-BUF supporting direct I/O, 
>>> copy_file_range/send_file may help to achieve this functionality.
>>>
>>> However, my patchset also aims to achieve parallel copying of file 
>>> contents while allocating the DMA-BUF, which is something that the 
>>> current set of calls may not be able to accomplish.
>
> And exactly that is a no-go. Use the existing IOCTLs and system calls 
> instead they should have similar performance when done right.

Got it. But in my testing, even without memory pressure, it takes about 
60ms to allocate a 3GB DMA-BUF. Under significant memory pressure, the 
allocation time for a 3GB DMA-BUF can increase to 300ms-1s. (The above 
test times also demonstrate the difference.)

But talk is cheap: I agree to investigate implementing this with the 
existing interfaces and to test it.

I'll share the results once I'm done.

Thanks for your suggestions.

>
> Regards,
> Christian.
>
>>
>> You can see cover-letter, here are the normal test and this IOCTL's 
>> compare in memory pressure, even if buffered I/O in this ioctl can 
>> have 50% improve by  parallel.
>>
>> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G 
>> memory pressure.
>>
>> 1. original
>> ```shel
>> # create a model file
>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>> # drop page cache
>> echo 3 > /proc/sys/vm/drop_caches
>> ./dmabuf-heap-file-read mtk_mm-uncached normal
>>
>>> result is total cost 13087213847ns
>>
>> ```
>>
>> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
>> ```shel
>> # create a model file
>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>> # drop page cache
>> echo 3 > /proc/sys/vm/drop_caches
>> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>>
>>> result is total cost 2902386846ns
>>
>> # use direct_io_check can check the content if is same to file.
>> ```
>>
>> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
>> ```shel
>> # create a model file
>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>> # drop page cache
>> echo 3 > /proc/sys/vm/drop_caches
>> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>>
>>> result is total cost 5735579385ns
>>
>> ```
>>
>>>
>>> Perhaps simply returning the DMA-BUF file descriptor and then 
>>> implementing copy_file_range, while populating the memory and 
>>> content during the copy process, could achieve this? At present, it 
>>> seems that it will be quite complex - We need to ensure that only 
>>> the returned DMA-BUF file descriptor will fail in case of memory not 
>>> fill, like mmap, vmap, attach, and so on.
>>>
>>>>
>>>> What we probably could do is to internally optimize those.
>>>>
>>>>> I am currently creating a new ioctl to remind the user that memory 
>>>>> is being allocated and read, and I am also unsure
>>>>>
>>>>> whether it is appropriate to add additional parameters to the 
>>>>> existing allocate behavior.
>>>>>
>>>>> Please, give me more suggestion. Thanks.
>>>>>
>>>>>>
>>>>>> But IIRC there was a copy_file_range callback in the 
>>>>>> file_operations structure you could use for that. I'm just not 
>>>>>> sure when and how that's used with the copy_file_range() system 
>>>>>> call.
>>>>>
>>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>>> However, this type of callback function is not currently 
>>>>> implemented when exporting
>>>>>
>>>>> the dma_buf file, which means that I need to implement the 
>>>>> callback for it?
>>>>
>>>> If I'm not completely mistaken the copy_file_range, splice_read and 
>>>> splice_write callbacks on the struct file_operations 
>>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>>>
>>>> Can be used to implement what you want to do.
>>> Yes.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Notice, file_fd depends on user how to open this file. So, both 
>>>>>>> buffer
>>>>>>> I/O and Direct I/O is supported.
>>>>>>>
>>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>>> ---
>>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dma-buf/dma-heap.c 
>>>>>>> b/drivers/dma-buf/dma-heap.c
>>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>>> @@ -15,9 +15,11 @@
>>>>>>>   #include <linux/list.h>
>>>>>>>   #include <linux/slab.h>
>>>>>>>   #include <linux/nospec.h>
>>>>>>> +#include <linux/highmem.h>
>>>>>>>   #include <linux/uaccess.h>
>>>>>>>   #include <linux/syscalls.h>
>>>>>>>   #include <linux/dma-heap.h>
>>>>>>> +#include <linux/vmalloc.h>
>>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>>     #define DEVNAME "dma_heap"
>>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>>       struct cdev heap_cdev;
>>>>>>>   };
>>>>>>>   +/**
>>>>>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>>>>>> allocate use.
>>>>>>> + * @file:        file to read from.
>>>>>>> + *
>>>>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>>>>> + *
>>>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>>>> match batch,
>>>>>>> + *            trigger read, default 128MB, must below file size.
>>>>>>> + *
>>>>>>> + * @fsz:        file size.
>>>>>>> + *
>>>>>>> + * @direct:        use direct IO?
>>>>>>> + */
>>>>>>> +struct dma_heap_file {
>>>>>>> +    struct file *file;
>>>>>>> +    struct cred *cred;
>>>>>>> +    size_t max_batch;
>>>>>>> +    size_t fsz;
>>>>>>> +    bool direct;
>>>>>>> +};
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>>>>> real work.
>>>>>>> + * @vaddr:        contigous virtual address alloc by vmap, file 
>>>>>>> read need.
>>>>>>> + *
>>>>>>> + * @start_size:        file read start offset, same to 
>>>>>>> @dma_heap_file_task->roffset.
>>>>>>> + *
>>>>>>> + * @need_size:        file read need size, same to 
>>>>>>> @dma_heap_file_task->rsize.
>>>>>>> + *
>>>>>>> + * @heap_file:        file wrapper.
>>>>>>> + *
>>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>>> + *
>>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, 
>>>>>>> put ref.
>>>>>>> + *
>>>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>>>> @dma_heap_file_task->fail.
>>>>>>> + */
>>>>>>> +struct dma_heap_file_work {
>>>>>>> +    void *vaddr;
>>>>>>> +    ssize_t start_size;
>>>>>>> +    ssize_t need_size;
>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>> +    struct list_head list;
>>>>>>> +    atomic_t *refp;
>>>>>>> +    bool *failp;
>>>>>>> +};
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>>>>> process
>>>>>>> + * @ref:        current file work counter, if zero, allocate 
>>>>>>> and read
>>>>>>> + *            done.
>>>>>>> + *
>>>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>>>> begin file
>>>>>>> + *            start offset.
>>>>>>> + *
>>>>>>> + * @rsize:        current allocated page size use to read, if 
>>>>>>> reach rbatch,
>>>>>>> + *            trigger commit.
>>>>>>> + *
>>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>>> @dma_heap_file's
>>>>>>> + *            batch.
>>>>>>> + *
>>>>>>> + * @heap_file:        current dma_heap_file
>>>>>>> + *
>>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>>> batch's number
>>>>>>> + *            pages.(this is maximum). Due to single thread 
>>>>>>> file read,
>>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>>> + *
>>>>>>> + * @pindex:        current allocated page filled in @parray's 
>>>>>>> index.
>>>>>>> + *
>>>>>>> + * @fail:        any work failed when file read?
>>>>>>> + *
>>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>>> prepare each work
>>>>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>>>>> trigger commit
>>>>>>> + * and prepare next work. After all batch queued, user going on 
>>>>>>> prepare dma_buf
>>>>>>> + * and so on, but before return dma_buf fd, need to wait file 
>>>>>>> read end and
>>>>>>> + * check read result.
>>>>>>> + */
>>>>>>> +struct dma_heap_file_task {
>>>>>>> +    atomic_t ref;
>>>>>>> +    size_t roffset;
>>>>>>> +    size_t rsize;
>>>>>>> +    size_t rbatch;
>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>> +    struct page **parray;
>>>>>>> +    unsigned int pindex;
>>>>>>> +    bool fail;
>>>>>>> +};
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * struct dma_heap_file_control - global control of dma_heap 
>>>>>>> file read.
>>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>>> + *
>>>>>>> + * @lock:        only lock for @works.
>>>>>>> + *
>>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>>> work, @work_thread
>>>>>>> + *            wakeup and read this work's file contains.
>>>>>>> + *
>>>>>>> + * @workwq:        used for main thread wait for file read end, 
>>>>>>> if allocation
>>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>>> effect this.
>>>>>>> + *
>>>>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>>>>> work's consumer.
>>>>>>> + *
>>>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>>>> alloc/free frequently.
>>>>>>> + *
>>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>>> + */
>>>>>>> +struct dma_heap_file_control {
>>>>>>> +    struct list_head works;
>>>>>>> +    spinlock_t lock;
>>>>>>> +    wait_queue_head_t threadwq;
>>>>>>> +    wait_queue_head_t workwq;
>>>>>>> +    struct task_struct *work_thread;
>>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>>> +    atomic_t nr_work;
>>>>>>> +};
>>>>>>> +
>>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>>   static LIST_HEAD(heap_list);
>>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>>   static dev_t dma_heap_devt;
>>>>>>>   static struct class *dma_heap_class;
>>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>>   +/**
>>>>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>>>>> virtual address.
>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>> + *
>>>>>>> + * Cached pages need to trigger file read, this function map 
>>>>>>> each scatter page
>>>>>>> + * into contiguous virtual address, so that file read can easy 
>>>>>>> use.
>>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>>> original user, so we
>>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>>> + */
>>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>>> *heap_ftask)
>>>>>>> +{
>>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>>>> +            PAGE_KERNEL);
>>>>>>> +}
>>>>>>> +
>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask,
>>>>>>> +                struct page *page)
>>>>>>> +{
>>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>>> +    int index = heap_ftask->pindex;
>>>>>>> +    int num = compound_nr(page), i;
>>>>>>> +    unsigned long sz = page_size(page);
>>>>>>> +
>>>>>>> +    heap_ftask->rsize += sz;
>>>>>>> +    for (i = 0; i < num; ++i)
>>>>>>> +        array[index++] = &page[i];
>>>>>>> +    heap_ftask->pindex = index;
>>>>>>> +
>>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static struct dma_heap_file_work *
>>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>>> +{
>>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>> +
>>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>>> +        return NULL;
>>>>>>> +
>>>>>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>>>>>> GFP_KERNEL);
>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>> +        return NULL;
>>>>>>> +
>>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>> +        return NULL;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>>> +    return heap_fwork;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>>> *heap_fwork)
>>>>>>> +{
>>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>>> +
>>>>>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>> +}
>>>>>>> +
>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask)
>>>>>>> +{
>>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>>> init_file_work(heap_ftask);
>>>>>>> +    struct page *last = NULL;
>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>>> +    struct file *file = heap_file->file;
>>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>>> +
>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>> +        return -ENOMEM;
>>>>>>> +
>>>>>>> +    /**
>>>>>>> +     * If file size is not page aligned, direct io can't 
>>>>>>> process the tail.
>>>>>>> +     * So, if reach to tail, remain the last page use buffer read.
>>>>>>> +     */
>>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>>> +
>>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>>> +
>>>>>>> +    if (last) {
>>>>>>> +        char *buf, *pathp;
>>>>>>> +        ssize_t err;
>>>>>>> +        void *buffer;
>>>>>>> +
>>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>>> +        if (unlikely(!buf))
>>>>>>> +            return -ENOMEM;
>>>>>>> +
>>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>>> +
>>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>>> +        if (IS_ERR(pathp)) {
>>>>>>> +            kfree(buf);
>>>>>>> +            return PTR_ERR(pathp);
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>>>> +                         fsz - start, &fsz,
>>>>>>> +                         READING_POLICY);
>>>>>>> +        kunmap_local(buffer);
>>>>>>> +        kfree(buf);
>>>>>>> +        if (err < 0) {
>>>>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>>> +
>>>>>>> +            return err;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>>> +    heap_ftask->rsize = 0;
>>>>>>> +    heap_ftask->pindex = 0;
>>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>>> +                   heap_ftask->rbatch);
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask)
>>>>>>> +{
>>>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>>>>>> +    return heap_ftask->fail;
>>>>>>> +}
>>>>>>> +
>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask)
>>>>>>> +{
>>>>>>> +    bool fail;
>>>>>>> +
>>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>>> +    fail = heap_ftask->fail;
>>>>>>> +    kvfree(heap_ftask->parray);
>>>>>>> +    kfree(heap_ftask);
>>>>>>> +    return fail;
>>>>>>> +}
>>>>>>> +
>>>>>>> +struct dma_heap_file_task *
>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>>> +{
>>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>>> +    if (unlikely(!heap_ftask))
>>>>>>> +        return NULL;
>>>>>>> +
>>>>>>> +    /**
>>>>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>>> +     */
>>>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>>>>>> PAGE_SHIFT,
>>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>>> +        goto put;
>>>>>>> +
>>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>>> +    return heap_ftask;
>>>>>>> +put:
>>>>>>> +    kfree(heap_ftask);
>>>>>>> +    return NULL;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>>>>> +{
>>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>>> +    struct file *file = heap_file->file;
>>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>>> +    const struct cred *old_cred;
>>>>>>> +    ssize_t err;
>>>>>>> +
>>>>>>> +    // use real task's cred to read this file.
>>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>>> &heap_file->fsz,
>>>>>>> +                   READING_POLICY);
>>>>>>> +    if (err < 0) {
>>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>>> f_sz=%ld\n",
>>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>>> +    }
>>>>>>> +    // recovery to my cred.
>>>>>>> +    revert_creds(old_cred);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>>> +{
>>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>>> +    int nr_work;
>>>>>>> +
>>>>>>> +    LIST_HEAD(pages);
>>>>>>> +    LIST_HEAD(workers);
>>>>>>> +
>>>>>>> +    while (true) {
>>>>>>> +        wait_event_freezable(heap_fctl->threadwq,
>>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>>> +recheck:
>>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>>> +
>>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>> list) {
>>>>>>> +                list_del(&worker->list);
>>>>>>> +                destroy_file_work(worker);
>>>>>>> +            }
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        nr_work = 0;
>>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>>> +            ++nr_work;
>>>>>>> +            list_del(&worker->list);
>>>>>>> +            __work_this_io(worker);
>>>>>>> +
>>>>>>> +            destroy_file_work(worker);
>>>>>>> +        }
>>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>>> +
>>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>>> +            goto recheck;
>>>>>>> +    }
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>>> +{
>>>>>>> +    return heap_file->fsz;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>>> *heap_file, int file_fd,
>>>>>>> +                 size_t batch)
>>>>>>> +{
>>>>>>> +    struct file *file;
>>>>>>> +    size_t fsz;
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    file = fget(file_fd);
>>>>>>> +    if (!file)
>>>>>>> +        return -EINVAL;
>>>>>>> +
>>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>>> +    if (fsz < batch) {
>>>>>>> +        ret = -EINVAL;
>>>>>>> +        goto err;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /**
>>>>>>> +     * Selinux block our read, but actually we are reading the 
>>>>>>> stand-in
>>>>>>> +     * for this file.
>>>>>>> +     * So save current's cred and when going to read, override 
>>>>>>> mine, and
>>>>>>> +     * end of read, revert.
>>>>>>> +     */
>>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>>> +        ret = -ENOMEM;
>>>>>>> +        goto err;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    heap_file->file = file;
>>>>>>> +    heap_file->max_batch = batch;
>>>>>>> +    heap_file->fsz = fsz;
>>>>>>> +
>>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>>> +
>>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>>>>>> larget file\n");
>>>>>>> +
>>>>>>> +    return 0;
>>>>>>> +
>>>>>>> +err:
>>>>>>> +    fput(file);
>>>>>>> +    return ret;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>>>>>> +{
>>>>>>> +    fput(heap_file->file);
>>>>>>> +    put_cred(heap_file->cred);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>>> *heap, int file_fd,
>>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>>> +                       unsigned int heap_flags)
>>>>>>> +{
>>>>>>> +    struct dma_buf *dmabuf;
>>>>>>> +    int fd;
>>>>>>> +    struct dma_heap_file heap_file;
>>>>>>> +
>>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>>> +    if (fd)
>>>>>>> +        goto error_file;
>>>>>>> +
>>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>>>> fd_flags,
>>>>>>> +                           heap_flags);
>>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>>> +        goto error;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>>> +    if (fd < 0) {
>>>>>>> +        dma_buf_put(dmabuf);
>>>>>>> +        /* just return, as put will call release and that will 
>>>>>>> free */
>>>>>>> +    }
>>>>>>> +
>>>>>>> +error:
>>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>>> +error_file:
>>>>>>> +    return fd;
>>>>>>> +}
>>>>>>> +
>>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t 
>>>>>>> len,
>>>>>>>                    u32 fd_flags,
>>>>>>>                    u64 heap_flags)
>>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode 
>>>>>>> *inode, struct file *file)
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>>> *file, void *data)
>>>>>>> +{
>>>>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file 
>>>>>>> = data;
>>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>>> +    int fd;
>>>>>>> +
>>>>>>> +    if (heap_allocation_file->fd || 
>>>>>>> !heap_allocation_file->file_fd)
>>>>>>> +        return -EINVAL;
>>>>>>> +
>>>>>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>>> +        return -EINVAL;
>>>>>>> +
>>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>>> +        return -EINVAL;
>>>>>>> +
>>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>>> +        return -EINVAL;
>>>>>>> +
>>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>>> +        heap_allocation_file->batch ?
>>>>>>> + PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>>> +        heap_allocation_file->fd_flags,
>>>>>>> +        heap_allocation_file->heap_flags);
>>>>>>> +    if (fd < 0)
>>>>>>> +        return fd;
>>>>>>> +
>>>>>>> +    heap_allocation_file->fd = fd;
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void 
>>>>>>> *data)
>>>>>>>   {
>>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct 
>>>>>>> file *file, void *data)
>>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>>   };
>>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned int 
>>>>>>> ucmd,
>>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file 
>>>>>>> *file, unsigned int ucmd,
>>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>>           break;
>>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>>> +        break;
>>>>>>>       default:
>>>>>>>           ret = -ENOTTY;
>>>>>>>           goto err;
>>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>>> +        goto fail_class;
>>>>>>>       }
>>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>>> +        ret =  -ENOMEM;
>>>>>>> +        goto fail_alloc;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>>>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>>>>> +
>>>>>>> +    heap_fctl->work_thread = 
>>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>>> +        ret = -ENOMEM;
>>>>>>> +        goto fail_thread;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>>> +        ret = -ENOMEM;
>>>>>>> +        goto fail_cache;
>>>>>>> +    }
>>>>>>> +
>>>>>>>       return 0;
>>>>>>> +
>>>>>>> +fail_cache:
>>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>>> +fail_thread:
>>>>>>> +    kfree(heap_fctl);
>>>>>>> +fail_alloc:
>>>>>>> +    class_destroy(dma_heap_class);
>>>>>>> +fail_class:
>>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>> +    return ret;
>>>>>>>   }
>>>>>>>   subsys_initcall(dma_heap_init);
>>>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>>> --- a/include/linux/dma-heap.h
>>>>>>> +++ b/include/linux/dma-heap.h
>>>>>>> @@ -12,12 +12,17 @@
>>>>>>>   #include <linux/cdev.h>
>>>>>>>   #include <linux/types.h>
>>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>>> +
>>>>>>>   struct dma_heap;
>>>>>>> +struct dma_heap_file_task;
>>>>>>> +struct dma_heap_file;
>>>>>>>     /**
>>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>>    * @allocate:        allocate dmabuf and return struct dma_buf 
>>>>>>> ptr
>>>>>>> - *
>>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>>> return struct
>>>>>>> + * dma_buf ptr.
>>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>>>>    */
>>>>>>>   struct dma_heap_ops {
>>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>>                       unsigned long len,
>>>>>>>                       u32 fd_flags,
>>>>>>>                       u64 heap_flags);
>>>>>>> +
>>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>>> +                          u32 fd_flags,
>>>>>>> +                          u64 heap_flags);
>>>>>>>   };
>>>>>>>     /**
>>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>>>>>> *heap);
>>>>>>>    */
>>>>>>>   struct dma_heap *dma_heap_add(const struct 
>>>>>>> dma_heap_export_info *exp_info);
>>>>>>>   +/**
>>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>>> complete then destroy it
>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>> + */
>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>> + */
>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>>>>>> allocate pages.
>>>>>>> + * @heap_file:        target file to read
>>>>>>> + *
>>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>>> + */
>>>>>>> +struct dma_heap_file_task *
>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * dma_heap_prepare_file_read - cache each allocated page until 
>>>>>>> we meet this batch.
>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>> + * @page:        current allocated page. don't care which order.
>>>>>>> + *
>>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>>> + */
>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask,
>>>>>>> +                struct page *page);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>>>>>> going to trigger IO
>>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>>> + *
>>>>>>> + * This commit will also check if reach to tail read.
>>>>>>> + * For direct I/O submissions, it is necessary to pay attention 
>>>>>>> to file reads
>>>>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>>>>> read, buffer IO
>>>>>>> + * needs to be triggered.
>>>>>>> + * Returns:
>>>>>>> + *   0 if all right, -errno if something wrong
>>>>>>> + */
>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>> *heap_ftask);
>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>>> +
>>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>>       __u64 heap_flags;
>>>>>>>   };
>>>>>>>   +/**
>>>>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>>>>> userspace for
>>>>>>> + *                                      allocations and read file
>>>>>>> + * @fd:            will be populated with a fd which provides the
>>>>>>> + *             handle to the allocated dma-buf
>>>>>>> + * @file_fd:        file descriptor to read from(suggested to 
>>>>>>> use O_DIRECT open file)
>>>>>>> + * @batch:        how many memory alloced then file 
>>>>>>> read(bytes), default 128MB
>>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>>>> + * @heap_flags:        flags passed to heap
>>>>>>> + *
>>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>>> + */
>>>>>>> +struct dma_heap_allocation_file_data {
>>>>>>> +    __u32 fd;
>>>>>>> +    __u32 file_fd;
>>>>>>> +    __u32 batch;
>>>>>>> +    __u32 fd_flags;
>>>>>>> +    __u64 heap_flags;
>>>>>>> +};
>>>>>>> +
>>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>>     /**
>>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>>                         struct dma_heap_allocation_data)
>>>>>>>   +/**
>>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from 
>>>>>>> pool and both
>>>>>>> + *                    read file when allocate memory.
>>>>>>> + *
>>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>>>>> with the fd field
>>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>>> return, the dma-buf
>>>>>>> + * content is read from file.
>>>>>>> + */
>>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>>> dma_heap_allocation_file_data)
>>>>>>> +
>>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>>
>>>>
>
Christian König July 12, 2024, 7:41 a.m. UTC | #8
Am 12.07.24 um 09:29 schrieb Huan Yang:
> Hi Christian,
>
> 在 2024/7/12 15:10, Christian König 写道:
>> Am 12.07.24 um 04:14 schrieb Huan Yang:
>>> 在 2024/7/12 9:59, Huan Yang 写道:
>>>> Hi Christian,
>>>>
>>>> 在 2024/7/11 19:39, Christian König 写道:
>>>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>>>> Hi Christian,
>>>>>>
>>>>>> Thanks for your reply.
>>>>>>
>>>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>>> Some user may need load file into dma-buf, current
>>>>>>>> way is:
>>>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>>>> This is too heavy if fsz reached to GB.
>>>>>>>
>>>>>>> You need to describe a bit more why that is too heavy. I can only 
>>>>>>> assume you need to save memory bandwidth and avoid the extra 
>>>>>>> copy with the CPU.
>>>>>>
>>>>>> Sorry for the oversimplified explanation. But, yes, you're right, 
>>>>>> we want to avoid this.
>>>>>>
>>>>>> As we are dealing with embedded devices, the available memory and 
>>>>>> computing power for users are usually limited.(The maximum 
>>>>>> available memory is currently
>>>>>>
>>>>>> 24GB, typically ranging from 8-12GB. )
>>>>>>
>>>>>> Also, the CPU computing power is also usually in short supply, 
>>>>>> due to limited battery capacity and limited heat dissipation 
>>>>>> capabilities.
>>>>>>
>>>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>>>
>>>>>>>
>>>>>>>> This patch implement a feature called 
>>>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>>> User need to offer a file_fd which you want to load into 
>>>>>>>> dma-buf, then,
>>>>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>>>>> content.
>>>>>>>
>>>>>>> Interesting idea, that has at least more potential than trying 
>>>>>>> to enable direct I/O on mmap()ed DMA-bufs.
>>>>>>>
>>>>>>> The approach with the new IOCTL might not work because it is a 
>>>>>>> very specialized use case.
>>>>>>
>>>>>> Thank you for your advice. maybe the "read file" behavior can be 
>>>>>> attached to an existing allocation?
>>>>>
>>>>> The point is there are already system calls to do something like 
>>>>> that.
>>>>>
>>>>> See copy_file_range() 
>>>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) and 
>>>>> sendfile() (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>>>
>>>> That's helpful to know, thanks.
>>>>
>>>> In terms of only DMA-BUF supporting direct I/O, 
>>>> copy_file_range/send_file may help to achieve this functionality.
>>>>
>>>> However, my patchset also aims to achieve parallel copying of file 
>>>> contents while allocating the DMA-BUF, which is something that the 
>>>> current set of calls may not be able to accomplish.
>>
>> And exactly that is a no-go. Use the existing IOCTLs and system calls 
>> instead they should have similar performance when done right.
>
> Got it, but in my testing process, even without memory pressure, it 
> takes about 60ms to allocate a 3GB DMA-BUF. When there is significant 
> memory pressure, the allocation time for a 3GB

Well exactly that doesn't make sense. Even if you read the content of 
the DMA-buf from a file you still need to allocate it first.

So the question is why should reading and allocating it at the same time 
be better in any way?

Regards,
Christian.

>
>
> DMA-BUF can increase to 300ms-1s. (The above test times can also 
> demonstrate the difference.)
>
> But talk is cheap: I agree to investigate implementing this with the 
> existing mechanisms and will test it.
>
> I'll share the results when I'm done.
>
> Thanks for your suggestions.
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> As shown in the cover letter, here is a comparison between the normal 
>>> path and this IOCTL under memory pressure; even buffered I/O through 
>>> this ioctl can improve by 50% thanks to the parallelism.
>>>
>>> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G 
>>> memory pressure.
>>>
>>> 1. original
>>> ```shell
>>> # create a model file
>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>> # drop page cache
>>> echo 3 > /proc/sys/vm/drop_caches
>>> ./dmabuf-heap-file-read mtk_mm-uncached normal
>>>
>>>> result is total cost 13087213847ns
>>>
>>> ```
>>>
>>> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
>>> ```shell
>>> # create a model file
>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>> # drop page cache
>>> echo 3 > /proc/sys/vm/drop_caches
>>> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>>>
>>>> result is total cost 2902386846ns
>>>
>>> # use direct_io_check can check the content if is same to file.
>>> ```
>>>
>>> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
>>> ```shell
>>> # create a model file
>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>> # drop page cache
>>> echo 3 > /proc/sys/vm/drop_caches
>>> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>>>
>>>> result is total cost 5735579385ns
>>>
>>> ```
>>>
>>>>
>>>> Perhaps simply returning the DMA-BUF file descriptor and then 
>>>> implementing copy_file_range, while populating the memory and 
>>>> content during the copy process, could achieve this? At present, it 
>>>> seems that it will be quite complex - We need to ensure that only 
>>>> the returned DMA-BUF file descriptor will fail in case of memory 
>>>> not fill, like mmap, vmap, attach, and so on.
>>>>
>>>>>
>>>>> What we probably could do is to internally optimize those.
>>>>>
>>>>>> I am currently creating a new ioctl to remind the user that 
>>>>>> memory is being allocated and read, and I am also unsure
>>>>>>
>>>>>> whether it is appropriate to add additional parameters to the 
>>>>>> existing allocate behavior.
>>>>>>
>>>>>> Please, give me more suggestion. Thanks.
>>>>>>
>>>>>>>
>>>>>>> But IIRC there was a copy_file_range callback in the 
>>>>>>> file_operations structure you could use for that. I'm just not 
>>>>>>> sure when and how that's used with the copy_file_range() system 
>>>>>>> call.
>>>>>>
>>>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>>>> However, this type of callback function is not currently 
>>>>>> implemented when exporting
>>>>>>
>>>>>> the dma_buf file, which means that I need to implement the 
>>>>>> callback for it?
>>>>>
>>>>> If I'm not completely mistaken the copy_file_range, splice_read 
>>>>> and splice_write callbacks on the struct file_operations 
>>>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>>>>
>>>>> Can be used to implement what you want to do.
>>>> Yes.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Notice, file_fd depends on user how to open this file. So, both 
>>>>>>>> buffer
>>>>>>>> I/O and Direct I/O is supported.
>>>>>>>>
>>>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>>>> ---
>>>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/dma-buf/dma-heap.c 
>>>>>>>> b/drivers/dma-buf/dma-heap.c
>>>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>>>> @@ -15,9 +15,11 @@
>>>>>>>>   #include <linux/list.h>
>>>>>>>>   #include <linux/slab.h>
>>>>>>>>   #include <linux/nospec.h>
>>>>>>>> +#include <linux/highmem.h>
>>>>>>>>   #include <linux/uaccess.h>
>>>>>>>>   #include <linux/syscalls.h>
>>>>>>>>   #include <linux/dma-heap.h>
>>>>>>>> +#include <linux/vmalloc.h>
>>>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>>>     #define DEVNAME "dma_heap"
>>>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>>>       struct cdev heap_cdev;
>>>>>>>>   };
>>>>>>>>   +/**
>>>>>>>> + * struct dma_heap_file - wrap the file, read task for 
>>>>>>>> dma_heap allocate use.
>>>>>>>> + * @file:        file to read from.
>>>>>>>> + *
>>>>>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>>>>>> + *
>>>>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>>>>> match batch,
>>>>>>>> + *            trigger read, default 128MB, must below file size.
>>>>>>>> + *
>>>>>>>> + * @fsz:        file size.
>>>>>>>> + *
>>>>>>>> + * @direct:        use direct IO?
>>>>>>>> + */
>>>>>>>> +struct dma_heap_file {
>>>>>>>> +    struct file *file;
>>>>>>>> +    struct cred *cred;
>>>>>>>> +    size_t max_batch;
>>>>>>>> +    size_t fsz;
>>>>>>>> +    bool direct;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>>>>>> real work.
>>>>>>>> + * @vaddr:        contigous virtual address alloc by vmap, 
>>>>>>>> file read need.
>>>>>>>> + *
>>>>>>>> + * @start_size:        file read start offset, same to 
>>>>>>>> @dma_heap_file_task->roffset.
>>>>>>>> + *
>>>>>>>> + * @need_size:        file read need size, same to 
>>>>>>>> @dma_heap_file_task->rsize.
>>>>>>>> + *
>>>>>>>> + * @heap_file:        file wrapper.
>>>>>>>> + *
>>>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>>>> + *
>>>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of 
>>>>>>>> read, put ref.
>>>>>>>> + *
>>>>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>>>>> @dma_heap_file_task->fail.
>>>>>>>> + */
>>>>>>>> +struct dma_heap_file_work {
>>>>>>>> +    void *vaddr;
>>>>>>>> +    ssize_t start_size;
>>>>>>>> +    ssize_t need_size;
>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>> +    struct list_head list;
>>>>>>>> +    atomic_t *refp;
>>>>>>>> +    bool *failp;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>>>>>> process
>>>>>>>> + * @ref:        current file work counter, if zero, allocate 
>>>>>>>> and read
>>>>>>>> + *            done.
>>>>>>>> + *
>>>>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>>>>> begin file
>>>>>>>> + *            start offset.
>>>>>>>> + *
>>>>>>>> + * @rsize:        current allocated page size use to read, if 
>>>>>>>> reach rbatch,
>>>>>>>> + *            trigger commit.
>>>>>>>> + *
>>>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>>>> @dma_heap_file's
>>>>>>>> + *            batch.
>>>>>>>> + *
>>>>>>>> + * @heap_file:        current dma_heap_file
>>>>>>>> + *
>>>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>>>> batch's number
>>>>>>>> + *            pages.(this is maximum). Due to single thread 
>>>>>>>> file read,
>>>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>>>> + *
>>>>>>>> + * @pindex:        current allocated page filled in @parray's 
>>>>>>>> index.
>>>>>>>> + *
>>>>>>>> + * @fail:        any work failed when file read?
>>>>>>>> + *
>>>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>>>> prepare each work
>>>>>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>>>>>> trigger commit
>>>>>>>> + * and prepare next work. After all batch queued, user going 
>>>>>>>> on prepare dma_buf
>>>>>>>> + * and so on, but before return dma_buf fd, need to wait file 
>>>>>>>> read end and
>>>>>>>> + * check read result.
>>>>>>>> + */
>>>>>>>> +struct dma_heap_file_task {
>>>>>>>> +    atomic_t ref;
>>>>>>>> +    size_t roffset;
>>>>>>>> +    size_t rsize;
>>>>>>>> +    size_t rbatch;
>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>> +    struct page **parray;
>>>>>>>> +    unsigned int pindex;
>>>>>>>> +    bool fail;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * struct dma_heap_file_control - global control of dma_heap 
>>>>>>>> file read.
>>>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>>>> + *
>>>>>>>> + * @lock:        only lock for @works.
>>>>>>>> + *
>>>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>>>> work, @work_thread
>>>>>>>> + *            wakeup and read this work's file contains.
>>>>>>>> + *
>>>>>>>> + * @workwq:        used for main thread wait for file read 
>>>>>>>> end, if allocation
>>>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>>>> effect this.
>>>>>>>> + *
>>>>>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>>>>>> work's consumer.
>>>>>>>> + *
>>>>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>>>>> alloc/free frequently.
>>>>>>>> + *
>>>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>>>> + */
>>>>>>>> +struct dma_heap_file_control {
>>>>>>>> +    struct list_head works;
>>>>>>>> +    spinlock_t lock;
>>>>>>>> +    wait_queue_head_t threadwq;
>>>>>>>> +    wait_queue_head_t workwq;
>>>>>>>> +    struct task_struct *work_thread;
>>>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>>>> +    atomic_t nr_work;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>>>   static LIST_HEAD(heap_list);
>>>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>>>   static dev_t dma_heap_devt;
>>>>>>>>   static struct class *dma_heap_class;
>>>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>>>   +/**
>>>>>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>>>>>> virtual address.
>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>> + *
>>>>>>>> + * Cached pages need to trigger file read, this function map 
>>>>>>>> each scatter page
>>>>>>>> + * into contiguous virtual address, so that file read can easy 
>>>>>>>> use.
>>>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>>>> original user, so we
>>>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>>>> + */
>>>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>>>> *heap_ftask)
>>>>>>>> +{
>>>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>>>>> +            PAGE_KERNEL);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask,
>>>>>>>> +                struct page *page)
>>>>>>>> +{
>>>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>>>> +    int index = heap_ftask->pindex;
>>>>>>>> +    int num = compound_nr(page), i;
>>>>>>>> +    unsigned long sz = page_size(page);
>>>>>>>> +
>>>>>>>> +    heap_ftask->rsize += sz;
>>>>>>>> +    for (i = 0; i < num; ++i)
>>>>>>>> +        array[index++] = &page[i];
>>>>>>>> +    heap_ftask->pindex = index;
>>>>>>>> +
>>>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct dma_heap_file_work *
>>>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>> +
>>>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>>>> +        return NULL;
>>>>>>>> +
>>>>>>>> +    heap_fwork = 
>>>>>>>> kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>> +        return NULL;
>>>>>>>> +
>>>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>> +        return NULL;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>>>> +    return heap_fwork;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>>>> *heap_fwork)
>>>>>>>> +{
>>>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>>>> +
>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>>>> init_file_work(heap_ftask);
>>>>>>>> +    struct page *last = NULL;
>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>>>> +
>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    /**
>>>>>>>> +     * If file size is not page aligned, direct io can't 
>>>>>>>> process the tail.
>>>>>>>> +     * So, if reach to tail, remain the last page use buffer 
>>>>>>>> read.
>>>>>>>> +     */
>>>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>>>> +
>>>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>>>> +
>>>>>>>> +    if (last) {
>>>>>>>> +        char *buf, *pathp;
>>>>>>>> +        ssize_t err;
>>>>>>>> +        void *buffer;
>>>>>>>> +
>>>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>>>> +        if (unlikely(!buf))
>>>>>>>> +            return -ENOMEM;
>>>>>>>> +
>>>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>>>> +
>>>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>>>> +        if (IS_ERR(pathp)) {
>>>>>>>> +            kfree(buf);
>>>>>>>> +            return PTR_ERR(pathp);
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>>>>> +                         fsz - start, &fsz,
>>>>>>>> +                         READING_POLICY);
>>>>>>>> +        kunmap_local(buffer);
>>>>>>>> +        kfree(buf);
>>>>>>>> +        if (err < 0) {
>>>>>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>>>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>>>> +
>>>>>>>> +            return err;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>>>> +    heap_ftask->rsize = 0;
>>>>>>>> +    heap_ftask->pindex = 0;
>>>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>>>> +                   heap_ftask->rbatch);
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask)
>>>>>>>> +{
>>>>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>>>>> + atomic_read(&heap_ftask->ref) == 0);
>>>>>>>> +    return heap_ftask->fail;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask)
>>>>>>>> +{
>>>>>>>> +    bool fail;
>>>>>>>> +
>>>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>>>> +    fail = heap_ftask->fail;
>>>>>>>> +    kvfree(heap_ftask->parray);
>>>>>>>> +    kfree(heap_ftask);
>>>>>>>> +    return fail;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +struct dma_heap_file_task *
>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>>>> +    if (unlikely(!heap_ftask))
>>>>>>>> +        return NULL;
>>>>>>>> +
>>>>>>>> +    /**
>>>>>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>>>> +     */
>>>>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch 
>>>>>>>> >> PAGE_SHIFT,
>>>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>>>> +        goto put;
>>>>>>>> +
>>>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>>>> +    return heap_ftask;
>>>>>>>> +put:
>>>>>>>> +    kfree(heap_ftask);
>>>>>>>> +    return NULL;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>>>> +    const struct cred *old_cred;
>>>>>>>> +    ssize_t err;
>>>>>>>> +
>>>>>>>> +    // use real task's cred to read this file.
>>>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>>>> &heap_file->fsz,
>>>>>>>> +                   READING_POLICY);
>>>>>>>> +    if (err < 0) {
>>>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>>>> f_sz=%ld\n",
>>>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>>>> +    }
>>>>>>>> +    // recovery to my cred.
>>>>>>>> +    revert_creds(old_cred);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>>>> +    int nr_work;
>>>>>>>> +
>>>>>>>> +    LIST_HEAD(pages);
>>>>>>>> +    LIST_HEAD(workers);
>>>>>>>> +
>>>>>>>> +    while (true) {
>>>>>>>> + wait_event_freezable(heap_fctl->threadwq,
>>>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>>>> +recheck:
>>>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>>>> +
>>>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>>> list) {
>>>>>>>> +                list_del(&worker->list);
>>>>>>>> +                destroy_file_work(worker);
>>>>>>>> +            }
>>>>>>>> +            break;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        nr_work = 0;
>>>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>>>> +            ++nr_work;
>>>>>>>> +            list_del(&worker->list);
>>>>>>>> +            __work_this_io(worker);
>>>>>>>> +
>>>>>>>> +            destroy_file_work(worker);
>>>>>>>> +        }
>>>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>>>> +
>>>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>>>> +            goto recheck;
>>>>>>>> +    }
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>>>> +{
>>>>>>>> +    return heap_file->fsz;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>>>> *heap_file, int file_fd,
>>>>>>>> +                 size_t batch)
>>>>>>>> +{
>>>>>>>> +    struct file *file;
>>>>>>>> +    size_t fsz;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    file = fget(file_fd);
>>>>>>>> +    if (!file)
>>>>>>>> +        return -EINVAL;
>>>>>>>> +
>>>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>>>> +    if (fsz < batch) {
>>>>>>>> +        ret = -EINVAL;
>>>>>>>> +        goto err;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /**
>>>>>>>> +     * Selinux block our read, but actually we are reading the 
>>>>>>>> stand-in
>>>>>>>> +     * for this file.
>>>>>>>> +     * So save current's cred and when going to read, override 
>>>>>>>> mine, and
>>>>>>>> +     * end of read, revert.
>>>>>>>> +     */
>>>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>>>> +        ret = -ENOMEM;
>>>>>>>> +        goto err;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    heap_file->file = file;
>>>>>>>> +    heap_file->max_batch = batch;
>>>>>>>> +    heap_file->fsz = fsz;
>>>>>>>> +
>>>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>>>> +
>>>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to 
>>>>>>>> read larget file\n");
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +
>>>>>>>> +err:
>>>>>>>> +    fput(file);
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file 
>>>>>>>> *heap_file)
>>>>>>>> +{
>>>>>>>> +    fput(heap_file->file);
>>>>>>>> +    put_cred(heap_file->cred);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>>>> *heap, int file_fd,
>>>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>>>> +                       unsigned int heap_flags)
>>>>>>>> +{
>>>>>>>> +    struct dma_buf *dmabuf;
>>>>>>>> +    int fd;
>>>>>>>> +    struct dma_heap_file heap_file;
>>>>>>>> +
>>>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>>>> +    if (fd)
>>>>>>>> +        goto error_file;
>>>>>>>> +
>>>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>>>>> fd_flags,
>>>>>>>> +                           heap_flags);
>>>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>>>> +        goto error;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>>>> +    if (fd < 0) {
>>>>>>>> +        dma_buf_put(dmabuf);
>>>>>>>> +        /* just return, as put will call release and that will 
>>>>>>>> free */
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +error:
>>>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>>>> +error_file:
>>>>>>>> +    return fd;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, 
>>>>>>>> size_t len,
>>>>>>>>                    u32 fd_flags,
>>>>>>>>                    u64 heap_flags)
>>>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode 
>>>>>>>> *inode, struct file *file)
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>>>> *file, void *data)
>>>>>>>> +{
>>>>>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file 
>>>>>>>> = data;
>>>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>>>> +    int fd;
>>>>>>>> +
>>>>>>>> +    if (heap_allocation_file->fd || 
>>>>>>>> !heap_allocation_file->file_fd)
>>>>>>>> +        return -EINVAL;
>>>>>>>> +
>>>>>>>> +    if (heap_allocation_file->fd_flags & 
>>>>>>>> ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>>>> +        return -EINVAL;
>>>>>>>> +
>>>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>>>> +        return -EINVAL;
>>>>>>>> +
>>>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>>>> +        return -EINVAL;
>>>>>>>> +
>>>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>>>> +        heap_allocation_file->batch ?
>>>>>>>> + PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>>>> +        heap_allocation_file->fd_flags,
>>>>>>>> +        heap_allocation_file->heap_flags);
>>>>>>>> +    if (fd < 0)
>>>>>>>> +        return fd;
>>>>>>>> +
>>>>>>>> +    heap_allocation_file->fd = fd;
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void 
>>>>>>>> *data)
>>>>>>>>   {
>>>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct 
>>>>>>>> file *file, void *data)
>>>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>>>   };
>>>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned int 
>>>>>>>> ucmd,
>>>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file 
>>>>>>>> *file, unsigned int ucmd,
>>>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>>>           break;
>>>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>>>> +        break;
>>>>>>>>       default:
>>>>>>>>           ret = -ENOTTY;
>>>>>>>>           goto err;
>>>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>>>> +        goto fail_class;
>>>>>>>>       }
>>>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>>>> +        ret =  -ENOMEM;
>>>>>>>> +        goto fail_alloc;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>>>> + init_waitqueue_head(&heap_fctl->threadwq);
>>>>>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>>>>>> +
>>>>>>>> +    heap_fctl->work_thread = 
>>>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>>>> +        ret = -ENOMEM;
>>>>>>>> +        goto fail_thread;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>>>> +        ret = -ENOMEM;
>>>>>>>> +        goto fail_cache;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>>       return 0;
>>>>>>>> +
>>>>>>>> +fail_cache:
>>>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>>>> +fail_thread:
>>>>>>>> +    kfree(heap_fctl);
>>>>>>>> +fail_alloc:
>>>>>>>> +    class_destroy(dma_heap_class);
>>>>>>>> +fail_class:
>>>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>> +    return ret;
>>>>>>>>   }
>>>>>>>>   subsys_initcall(dma_heap_init);
>>>>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>>>> --- a/include/linux/dma-heap.h
>>>>>>>> +++ b/include/linux/dma-heap.h
>>>>>>>> @@ -12,12 +12,17 @@
>>>>>>>>   #include <linux/cdev.h>
>>>>>>>>   #include <linux/types.h>
>>>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>>>> +
>>>>>>>>   struct dma_heap;
>>>>>>>> +struct dma_heap_file_task;
>>>>>>>> +struct dma_heap_file;
>>>>>>>>     /**
>>>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>>>    * @allocate:        allocate dmabuf and return struct 
>>>>>>>> dma_buf ptr
>>>>>>>> - *
>>>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>>>> return struct
>>>>>>>> + * dma_buf ptr.
>>>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>>>>>    */
>>>>>>>>   struct dma_heap_ops {
>>>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>>>                       unsigned long len,
>>>>>>>>                       u32 fd_flags,
>>>>>>>>                       u64 heap_flags);
>>>>>>>> +
>>>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>>>> +                          u32 fd_flags,
>>>>>>>> +                          u64 heap_flags);
>>>>>>>>   };
>>>>>>>>     /**
>>>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct 
>>>>>>>> dma_heap *heap);
>>>>>>>>    */
>>>>>>>>   struct dma_heap *dma_heap_add(const struct 
>>>>>>>> dma_heap_export_info *exp_info);
>>>>>>>>   +/**
>>>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>>>> complete then destroy it
>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>> + */
>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask);
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to 
>>>>>>>> complete
>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>> + */
>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask);
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>>>>>>> allocate pages.
>>>>>>>> + * @heap_file:        target file to read
>>>>>>>> + *
>>>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>>>> + */
>>>>>>>> +struct dma_heap_file_task *
>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * dma_heap_prepare_file_read - cache each allocated page 
>>>>>>>> until we meet this batch.
>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>> + * @page:        current allocated page. don't care which order.
>>>>>>>> + *
>>>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>>>> + */
>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask,
>>>>>>>> +                struct page *page);
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>>>>>>> going to trigger IO
>>>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>>>> + *
>>>>>>>> + * This commit will also check if reach to tail read.
>>>>>>>> + * For direct I/O submissions, it is necessary to pay 
>>>>>>>> attention to file reads
>>>>>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>>>>>> read, buffer IO
>>>>>>>> + * needs to be triggered.
>>>>>>>> + * Returns:
>>>>>>>> + *   0 if all right, -errno if something wrong
>>>>>>>> + */
>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>> *heap_ftask);
>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>>>> +
>>>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>>>       __u64 heap_flags;
>>>>>>>>   };
>>>>>>>>   +/**
>>>>>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>>>>>> userspace for
>>>>>>>> + *                                      allocations and read file
>>>>>>>> + * @fd:            will be populated with a fd which provides the
>>>>>>>> + *     ��      handle to the allocated dma-buf
>>>>>>>> + * @file_fd:        file descriptor to read from(suggested to 
>>>>>>>> use O_DIRECT open file)
>>>>>>>> + * @batch:        how many memory alloced then file 
>>>>>>>> read(bytes), default 128MB
>>>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>>>>> + * @heap_flags:        flags passed to heap
>>>>>>>> + *
>>>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>>>> + */
>>>>>>>> +struct dma_heap_allocation_file_data {
>>>>>>>> +    __u32 fd;
>>>>>>>> +    __u32 file_fd;
>>>>>>>> +    __u32 batch;
>>>>>>>> +    __u32 fd_flags;
>>>>>>>> +    __u64 heap_flags;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>>>     /**
>>>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>>>                         struct dma_heap_allocation_data)
>>>>>>>>   +/**
>>>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from 
>>>>>>>> pool and both
>>>>>>>> + *                    read file when allocate memory.
>>>>>>>> + *
>>>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>>>>>> with the fd field
>>>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>>>> return, the dma-buf
>>>>>>>> + * content is read from file.
>>>>>>>> + */
>>>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>>>> dma_heap_allocation_file_data)
>>>>>>>> +
>>>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>>>
>>>>>
>>
Huan Yang July 12, 2024, 7:52 a.m. UTC | #9
在 2024/7/12 15:41, Christian König 写道:
> Am 12.07.24 um 09:29 schrieb Huan Yang:
>> Hi Christian,
>>
>> 在 2024/7/12 15:10, Christian König 写道:
>>> Am 12.07.24 um 04:14 schrieb Huan Yang:
>>>> 在 2024/7/12 9:59, Huan Yang 写道:
>>>>> Hi Christian,
>>>>>
>>>>> 在 2024/7/11 19:39, Christian König 写道:
>>>>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>> Thanks for your reply.
>>>>>>>
>>>>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>>>> Some user may need load file into dma-buf, current
>>>>>>>>> way is:
>>>>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>>>>> This is too heavy if fsz reached to GB.
>>>>>>>>
>>>>>>>> You need to describe a bit more why that is to heavy. I can 
>>>>>>>> only assume you need to save memory bandwidth and avoid the 
>>>>>>>> extra copy with the CPU.
>>>>>>>
>>>>>>> Sorry for the oversimplified explanation. But, yes, you're 
>>>>>>> right, we want to avoid this.
>>>>>>>
>>>>>>> As we are dealing with embedded devices, the available memory 
>>>>>>> and computing power for users are usually limited.(The maximum 
>>>>>>> available memory is currently
>>>>>>>
>>>>>>> 24GB, typically ranging from 8-12GB. )
>>>>>>>
>>>>>>> Also, the CPU computing power is also usually in short supply, 
>>>>>>> due to limited battery capacity and limited heat dissipation 
>>>>>>> capabilities.
>>>>>>>
>>>>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>>>>
>>>>>>>>
>>>>>>>>> This patch implement a feature called 
>>>>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>>>> User need to offer a file_fd which you want to load into 
>>>>>>>>> dma-buf, then,
>>>>>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>>>>>> content.
>>>>>>>>
>>>>>>>> Interesting idea, that has at least more potential than trying 
>>>>>>>> to enable direct I/O on mmap()ed DMA-bufs.
>>>>>>>>
>>>>>>>> The approach with the new IOCTL might not work because it is a 
>>>>>>>> very specialized use case.
>>>>>>>
>>>>>>> Thank you for your advice. maybe the "read file" behavior can be 
>>>>>>> attached to an existing allocation?
>>>>>>
>>>>>> The point is there are already system calls to do something like 
>>>>>> that.
>>>>>>
>>>>>> See copy_file_range() 
>>>>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
>>>>>> and send_file() 
>>>>>> (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>>>>
>>>>> That's helpfull to learn it, thanks.
>>>>>
>>>>> In terms of only DMA-BUF supporting direct I/O, 
>>>>> copy_file_range/send_file may help to achieve this functionality.
>>>>>
>>>>> However, my patchset also aims to achieve parallel copying of file 
>>>>> contents while allocating the DMA-BUF, which is something that the 
>>>>> current set of calls may not be able to accomplish.
>>>
>>> And exactly that is a no-go. Use the existing IOCTLs and system 
>>> calls instead they should have similar performance when done right.
>>
>> Get it, but In my testing process, even without memory pressure, it 
>> takes about 60ms to allocate a 3GB DMA-BUF. When there is significant 
>> memory pressure, the allocation time for a 3GB
>
> Well exactly that doesn't make sense. Even if you read the content of 
> the DMA-buf from a file you still need to allocate it first.

Yes, the memory needs to be allocated first, but in kernel space there is 
no need to wait until all of it has been allocated before triggering the file load.

This patchset does this in batches (`batch`, default 128MB): every time 128MB has been 
allocated, it is vmap()ed to get a vaddr, and a read of the file content at the matching offset into that vaddr is triggered.

>
> So the question is why should reading and allocating it at the same 
> time be better in any way?

Memory pressure will trigger reclaim, and the allocation has to wait for it (milliseconds). Assume I 
have already allocated 512MB (of the 3GB needed) without entering the slowpath.

Even if I then need to enter the slowpath to allocate the remaining memory, the 
already-allocated memory is meanwhile being filled with file content. (This saves time compared to 
reading only after the whole allocation is done.)

The time difference between them can be expressed by the formula:

1. Allocate dmabuf time + file load time -- for original

2. first batch prepare time + max(file load time, time to allocate the remaining 
dma-buf) + last batch prepare time -- for new

  When the file reaches the gigabyte level, the significant difference 
between the two can be clearly observed.

>
> Regards,
> Christian.
>
>>
>>
>> DMA-BUF can increase to 300ms-1s. (The above test times can also 
>> demonstrate the difference.)
>>
>> But, talk is cheap, I agree to research use existing way to 
>> implements it and give a test.
>>
>> I'll show this if I done .
>>
>> Thanks for your suggestions.
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> You can see cover-letter, here are the normal test and this IOCTL's 
>>>> compare in memory pressure, even if buffered I/O in this ioctl can 
>>>> have 50% improve by  parallel.
>>>>
>>>> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G 
>>>> memory pressure.
>>>>
>>>> 1. original
>>>> ```shel
>>>> # create a model file
>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>> # drop page cache
>>>> echo 3 > /proc/sys/vm/drop_caches
>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal
>>>>
>>>>> result is total cost 13087213847ns
>>>>
>>>> ```
>>>>
>>>> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
>>>> ```shel
>>>> # create a model file
>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>> # drop page cache
>>>> echo 3 > /proc/sys/vm/drop_caches
>>>> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>>>>
>>>>> result is total cost 2902386846ns
>>>>
>>>> # use direct_io_check can check the content if is same to file.
>>>> ```
>>>>
>>>> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
>>>> ```shel
>>>> # create a model file
>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>> # drop page cache
>>>> echo 3 > /proc/sys/vm/drop_caches
>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>>>>
>>>>> result is total cost 5735579385ns
>>>>
>>>> ```
>>>>
>>>>>
>>>>> Perhaps simply returning the DMA-BUF file descriptor and then 
>>>>> implementing copy_file_range, while populating the memory and 
>>>>> content during the copy process, could achieve this? At present, 
>>>>> it seems that it will be quite complex - We need to ensure that 
>>>>> only the returned DMA-BUF file descriptor will fail in case of 
>>>>> memory not fill, like mmap, vmap, attach, and so on.
>>>>>
>>>>>>
>>>>>> What we probably could do is to internally optimize those.
>>>>>>
>>>>>>> I am currently creating a new ioctl to remind the user that 
>>>>>>> memory is being allocated and read, and I am also unsure
>>>>>>>
>>>>>>> whether it is appropriate to add additional parameters to the 
>>>>>>> existing allocate behavior.
>>>>>>>
>>>>>>> Please, give me more suggestion. Thanks.
>>>>>>>
>>>>>>>>
>>>>>>>> But IIRC there was a copy_file_range callback in the 
>>>>>>>> file_operations structure you could use for that. I'm just not 
>>>>>>>> sure when and how that's used with the copy_file_range() system 
>>>>>>>> call.
>>>>>>>
>>>>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>>>>> However, this type of callback function is not currently 
>>>>>>> implemented when exporting
>>>>>>>
>>>>>>> the dma_buf file, which means that I need to implement the 
>>>>>>> callback for it?
>>>>>>
>>>>>> If I'm not completely mistaken the copy_file_range, splice_read 
>>>>>> and splice_write callbacks on the struct file_operations 
>>>>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>>>>>
>>>>>> Can be used to implement what you want to do.
>>>>> Yes.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Notice, file_fd depends on user how to open this file. So, 
>>>>>>>>> both buffer
>>>>>>>>> I/O and Direct I/O is supported.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/dma-buf/dma-heap.c 
>>>>>>>>> b/drivers/dma-buf/dma-heap.c
>>>>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>>>>> @@ -15,9 +15,11 @@
>>>>>>>>>   #include <linux/list.h>
>>>>>>>>>   #include <linux/slab.h>
>>>>>>>>>   #include <linux/nospec.h>
>>>>>>>>> +#include <linux/highmem.h>
>>>>>>>>>   #include <linux/uaccess.h>
>>>>>>>>>   #include <linux/syscalls.h>
>>>>>>>>>   #include <linux/dma-heap.h>
>>>>>>>>> +#include <linux/vmalloc.h>
>>>>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>>>>     #define DEVNAME "dma_heap"
>>>>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>>>>       struct cdev heap_cdev;
>>>>>>>>>   };
>>>>>>>>>   +/**
>>>>>>>>> + * struct dma_heap_file - wrap the file, read task for 
>>>>>>>>> dma_heap allocate use.
>>>>>>>>> + * @file:        file to read from.
>>>>>>>>> + *
>>>>>>>>> + * @cred:        kthread use, user cred copy to use for the 
>>>>>>>>> read.
>>>>>>>>> + *
>>>>>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>>>>>> match batch,
>>>>>>>>> + *            trigger read, default 128MB, must below file size.
>>>>>>>>> + *
>>>>>>>>> + * @fsz:        file size.
>>>>>>>>> + *
>>>>>>>>> + * @direct:        use direct IO?
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_file {
>>>>>>>>> +    struct file *file;
>>>>>>>>> +    struct cred *cred;
>>>>>>>>> +    size_t max_batch;
>>>>>>>>> +    size_t fsz;
>>>>>>>>> +    bool direct;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * struct dma_heap_file_work - represents a dma_heap file 
>>>>>>>>> read real work.
>>>>>>>>> + * @vaddr:        contigous virtual address alloc by vmap, 
>>>>>>>>> file read need.
>>>>>>>>> + *
>>>>>>>>> + * @start_size:        file read start offset, same to 
>>>>>>>>> @dma_heap_file_task->roffset.
>>>>>>>>> + *
>>>>>>>>> + * @need_size:        file read need size, same to 
>>>>>>>>> @dma_heap_file_task->rsize.
>>>>>>>>> + *
>>>>>>>>> + * @heap_file:        file wrapper.
>>>>>>>>> + *
>>>>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>>>>> + *
>>>>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of 
>>>>>>>>> read, put ref.
>>>>>>>>> + *
>>>>>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>>>>>> @dma_heap_file_task->fail.
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_file_work {
>>>>>>>>> +    void *vaddr;
>>>>>>>>> +    ssize_t start_size;
>>>>>>>>> +    ssize_t need_size;
>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>> +    struct list_head list;
>>>>>>>>> +    atomic_t *refp;
>>>>>>>>> +    bool *failp;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * struct dma_heap_file_task - represents a dma_heap file 
>>>>>>>>> read process
>>>>>>>>> + * @ref:        current file work counter, if zero, allocate 
>>>>>>>>> and read
>>>>>>>>> + *            done.
>>>>>>>>> + *
>>>>>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>>>>>> begin file
>>>>>>>>> + *            start offset.
>>>>>>>>> + *
>>>>>>>>> + * @rsize:        current allocated page size use to read, if 
>>>>>>>>> reach rbatch,
>>>>>>>>> + *            trigger commit.
>>>>>>>>> + *
>>>>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>>>>> @dma_heap_file's
>>>>>>>>> + *            batch.
>>>>>>>>> + *
>>>>>>>>> + * @heap_file:        current dma_heap_file
>>>>>>>>> + *
>>>>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>>>>> batch's number
>>>>>>>>> + *            pages.(this is maximum). Due to single thread 
>>>>>>>>> file read,
>>>>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>>>>> + *
>>>>>>>>> + * @pindex:        current allocated page filled in @parray's 
>>>>>>>>> index.
>>>>>>>>> + *
>>>>>>>>> + * @fail:        any work failed when file read?
>>>>>>>>> + *
>>>>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>>>>> prepare each work
>>>>>>>>> + * during allocate dma_buf pages, if match current batch, 
>>>>>>>>> then trigger commit
>>>>>>>>> + * and prepare next work. After all batch queued, user going 
>>>>>>>>> on prepare dma_buf
>>>>>>>>> + * and so on, but before return dma_buf fd, need to wait file 
>>>>>>>>> read end and
>>>>>>>>> + * check read result.
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_file_task {
>>>>>>>>> +    atomic_t ref;
>>>>>>>>> +    size_t roffset;
>>>>>>>>> +    size_t rsize;
>>>>>>>>> +    size_t rbatch;
>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>> +    struct page **parray;
>>>>>>>>> +    unsigned int pindex;
>>>>>>>>> +    bool fail;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * struct dma_heap_file_control - global control of dma_heap 
>>>>>>>>> file read.
>>>>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>>>>> + *
>>>>>>>>> + * @lock:        only lock for @works.
>>>>>>>>> + *
>>>>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>>>>> work, @work_thread
>>>>>>>>> + *            wakeup and read this work's file contains.
>>>>>>>>> + *
>>>>>>>>> + * @workwq:        used for main thread wait for file read 
>>>>>>>>> end, if allocation
>>>>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>>>>> effect this.
>>>>>>>>> + *
>>>>>>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>>>>>>> work's consumer.
>>>>>>>>> + *
>>>>>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>>>>>> alloc/free frequently.
>>>>>>>>> + *
>>>>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_file_control {
>>>>>>>>> +    struct list_head works;
>>>>>>>>> +    spinlock_t lock;
>>>>>>>>> +    wait_queue_head_t threadwq;
>>>>>>>>> +    wait_queue_head_t workwq;
>>>>>>>>> +    struct task_struct *work_thread;
>>>>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>>>>> +    atomic_t nr_work;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>>>>   static LIST_HEAD(heap_list);
>>>>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>>>>   static dev_t dma_heap_devt;
>>>>>>>>>   static struct class *dma_heap_class;
>>>>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>>>>   +/**
>>>>>>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>>>>>>> virtual address.
>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>> + *
>>>>>>>>> + * Cached pages need to trigger file read, this function map 
>>>>>>>>> each scatter page
>>>>>>>>> + * into contiguous virtual address, so that file read can 
>>>>>>>>> easy use.
>>>>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>>>>> original user, so we
>>>>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>>>>> + */
>>>>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask)
>>>>>>>>> +{
>>>>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>>>>>> +            PAGE_KERNEL);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask,
>>>>>>>>> +                struct page *page)
>>>>>>>>> +{
>>>>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>>>>> +    int index = heap_ftask->pindex;
>>>>>>>>> +    int num = compound_nr(page), i;
>>>>>>>>> +    unsigned long sz = page_size(page);
>>>>>>>>> +
>>>>>>>>> +    heap_ftask->rsize += sz;
>>>>>>>>> +    for (i = 0; i < num; ++i)
>>>>>>>>> +        array[index++] = &page[i];
>>>>>>>>> +    heap_ftask->pindex = index;
>>>>>>>>> +
>>>>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static struct dma_heap_file_work *
>>>>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>> +
>>>>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>>>>> +        return NULL;
>>>>>>>>> +
>>>>>>>>> +    heap_fwork = 
>>>>>>>>> kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>> +        return NULL;
>>>>>>>>> +
>>>>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>> +        return NULL;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>>>>> +    return heap_fwork;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>>>>> *heap_fwork)
>>>>>>>>> +{
>>>>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>>>>> +
>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>>>>> init_file_work(heap_ftask);
>>>>>>>>> +    struct page *last = NULL;
>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>>>>> +
>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>> +        return -ENOMEM;
>>>>>>>>> +
>>>>>>>>> +    /**
>>>>>>>>> +     * If file size is not page aligned, direct io can't 
>>>>>>>>> process the tail.
>>>>>>>>> +     * So, if reach to tail, remain the last page use buffer 
>>>>>>>>> read.
>>>>>>>>> +     */
>>>>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>>>>> +
>>>>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>>>>> +
>>>>>>>>> +    if (last) {
>>>>>>>>> +        char *buf, *pathp;
>>>>>>>>> +        ssize_t err;
>>>>>>>>> +        void *buffer;
>>>>>>>>> +
>>>>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>>>>> +        if (unlikely(!buf))
>>>>>>>>> +            return -ENOMEM;
>>>>>>>>> +
>>>>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>>>>> +
>>>>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>>>>> +        if (IS_ERR(pathp)) {
>>>>>>>>> +            kfree(buf);
>>>>>>>>> +            return PTR_ERR(pathp);
>>>>>>>>> +        }
>>>>>>>>> +
>>>>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>>>>>> +                         fsz - start, &fsz,
>>>>>>>>> +                         READING_POLICY);
>>>>>>>>> +        kunmap_local(buffer);
>>>>>>>>> +        kfree(buf);
>>>>>>>>> +        if (err < 0) {
>>>>>>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>>>>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>>>>> +
>>>>>>>>> +            return err;
>>>>>>>>> +        }
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>>>>> +    heap_ftask->rsize = 0;
>>>>>>>>> +    heap_ftask->pindex = 0;
>>>>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>>>>> +                   heap_ftask->rbatch);
>>>>>>>>> +    return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask)
>>>>>>>>> +{
>>>>>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>>>>>> + atomic_read(&heap_ftask->ref) == 0);
>>>>>>>>> +    return heap_ftask->fail;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask)
>>>>>>>>> +{
>>>>>>>>> +    bool fail;
>>>>>>>>> +
>>>>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>>>>> +    fail = heap_ftask->fail;
>>>>>>>>> +    kvfree(heap_ftask->parray);
>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>> +    return fail;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>>>>> +    if (unlikely(!heap_ftask))
>>>>>>>>> +        return NULL;
>>>>>>>>> +
>>>>>>>>> +    /**
>>>>>>>>> +     * Batch is the maximum size which we prepare work will 
>>>>>>>>> meet.
>>>>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>>>>> +     */
>>>>>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch 
>>>>>>>>> >> PAGE_SHIFT,
>>>>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>>>>> +        goto put;
>>>>>>>>> +
>>>>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>>>>> +    return heap_ftask;
>>>>>>>>> +put:
>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>> +    return NULL;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void __work_this_io(struct dma_heap_file_work 
>>>>>>>>> *heap_fwork)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>>>>> +    const struct cred *old_cred;
>>>>>>>>> +    ssize_t err;
>>>>>>>>> +
>>>>>>>>> +    // use real task's cred to read this file.
>>>>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>>>>> &heap_file->fsz,
>>>>>>>>> +                   READING_POLICY);
>>>>>>>>> +    if (err < 0) {
>>>>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>>>>> f_sz=%ld\n",
>>>>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>>>>> +    }
>>>>>>>>> +    // recovery to my cred.
>>>>>>>>> +    revert_creds(old_cred);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>>>>> +    int nr_work;
>>>>>>>>> +
>>>>>>>>> +    LIST_HEAD(pages);
>>>>>>>>> +    LIST_HEAD(workers);
>>>>>>>>> +
>>>>>>>>> +    while (true) {
>>>>>>>>> + wait_event_freezable(heap_fctl->threadwq,
>>>>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>>>>> +recheck:
>>>>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>>>>> + list_splice_init(&heap_fctl->works, &workers);
>>>>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>>>>> +
>>>>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>>>> list) {
>>>>>>>>> +                list_del(&worker->list);
>>>>>>>>> +                destroy_file_work(worker);
>>>>>>>>> +            }
>>>>>>>>> +            break;
>>>>>>>>> +        }
>>>>>>>>> +
>>>>>>>>> +        nr_work = 0;
>>>>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>>>>> +            ++nr_work;
>>>>>>>>> +            list_del(&worker->list);
>>>>>>>>> +            __work_this_io(worker);
>>>>>>>>> +
>>>>>>>>> +            destroy_file_work(worker);
>>>>>>>>> +        }
>>>>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>>>>> +
>>>>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>>>>> +            goto recheck;
>>>>>>>>> +    }
>>>>>>>>> +    return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>>>>> +{
>>>>>>>>> +    return heap_file->fsz;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>>>>> *heap_file, int file_fd,
>>>>>>>>> +                 size_t batch)
>>>>>>>>> +{
>>>>>>>>> +    struct file *file;
>>>>>>>>> +    size_t fsz;
>>>>>>>>> +    int ret;
>>>>>>>>> +
>>>>>>>>> +    file = fget(file_fd);
>>>>>>>>> +    if (!file)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>> +
>>>>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>>>>> +    if (fsz < batch) {
>>>>>>>>> +        ret = -EINVAL;
>>>>>>>>> +        goto err;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    /**
>>>>>>>>> +     * Selinux block our read, but actually we are reading 
>>>>>>>>> the stand-in
>>>>>>>>> +     * for this file.
>>>>>>>>> +     * So save current's cred and when going to read, 
>>>>>>>>> override mine, and
>>>>>>>>> +     * end of read, revert.
>>>>>>>>> +     */
>>>>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>> +        goto err;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    heap_file->file = file;
>>>>>>>>> +    heap_file->max_batch = batch;
>>>>>>>>> +    heap_file->fsz = fsz;
>>>>>>>>> +
>>>>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>>>>> +
>>>>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to 
>>>>>>>>> read larget file\n");
>>>>>>>>> +
>>>>>>>>> +    return 0;
>>>>>>>>> +
>>>>>>>>> +err:
>>>>>>>>> +    fput(file);
>>>>>>>>> +    return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file 
>>>>>>>>> *heap_file)
>>>>>>>>> +{
>>>>>>>>> +    fput(heap_file->file);
>>>>>>>>> +    put_cred(heap_file->cred);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>>>>> *heap, int file_fd,
>>>>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>>>>> +                       unsigned int heap_flags)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_buf *dmabuf;
>>>>>>>>> +    int fd;
>>>>>>>>> +    struct dma_heap_file heap_file;
>>>>>>>>> +
>>>>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>>>>> +    if (fd)
>>>>>>>>> +        goto error_file;
>>>>>>>>> +
>>>>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>>>>>> fd_flags,
>>>>>>>>> +                           heap_flags);
>>>>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>>>>> +        goto error;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>>>>> +    if (fd < 0) {
>>>>>>>>> +        dma_buf_put(dmabuf);
>>>>>>>>> +        /* just return, as put will call release and that 
>>>>>>>>> will free */
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +error:
>>>>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>>>>> +error_file:
>>>>>>>>> +    return fd;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, 
>>>>>>>>> size_t len,
>>>>>>>>>                    u32 fd_flags,
>>>>>>>>>                    u64 heap_flags)
>>>>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode 
>>>>>>>>> *inode, struct file *file)
>>>>>>>>>       return 0;
>>>>>>>>>   }
>>>>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>>>>> *file, void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct dma_heap_allocation_file_data 
>>>>>>>>> *heap_allocation_file = data;
>>>>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>>>>> +    int fd;
>>>>>>>>> +
>>>>>>>>> +    if (heap_allocation_file->fd || 
>>>>>>>>> !heap_allocation_file->file_fd)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>> +
>>>>>>>>> +    if (heap_allocation_file->fd_flags & 
>>>>>>>>> ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>> +
>>>>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>> +
>>>>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>> +
>>>>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>>>>> +        heap_allocation_file->batch ?
>>>>>>>>> + PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>>>>> +        heap_allocation_file->fd_flags,
>>>>>>>>> +        heap_allocation_file->heap_flags);
>>>>>>>>> +    if (fd < 0)
>>>>>>>>> +        return fd;
>>>>>>>>> +
>>>>>>>>> +    heap_allocation_file->fd = fd;
>>>>>>>>> +    return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void 
>>>>>>>>> *data)
>>>>>>>>>   {
>>>>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct 
>>>>>>>>> file *file, void *data)
>>>>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>>>>   };
>>>>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned int 
>>>>>>>>> ucmd,
>>>>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file 
>>>>>>>>> *file, unsigned int ucmd,
>>>>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>>>>           break;
>>>>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>>>>> +        break;
>>>>>>>>>       default:
>>>>>>>>>           ret = -ENOTTY;
>>>>>>>>>           goto err;
>>>>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>>>>> -        unregister_chrdev_region(dma_heap_devt, 
>>>>>>>>> NUM_HEAP_MINORS);
>>>>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>>>>> +        goto fail_class;
>>>>>>>>>       }
>>>>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>>>>> +        ret =  -ENOMEM;
>>>>>>>>> +        goto fail_alloc;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>>>>> + init_waitqueue_head(&heap_fctl->threadwq);
>>>>>>>>> + init_waitqueue_head(&heap_fctl->workwq);
>>>>>>>>> +
>>>>>>>>> +    heap_fctl->work_thread = 
>>>>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>> +        goto fail_thread;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>> +        goto fail_cache;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>>       return 0;
>>>>>>>>> +
>>>>>>>>> +fail_cache:
>>>>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>>>>> +fail_thread:
>>>>>>>>> +    kfree(heap_fctl);
>>>>>>>>> +fail_alloc:
>>>>>>>>> +    class_destroy(dma_heap_class);
>>>>>>>>> +fail_class:
>>>>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>>> +    return ret;
>>>>>>>>>   }
>>>>>>>>>   subsys_initcall(dma_heap_init);
>>>>>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>>>>> --- a/include/linux/dma-heap.h
>>>>>>>>> +++ b/include/linux/dma-heap.h
>>>>>>>>> @@ -12,12 +12,17 @@
>>>>>>>>>   #include <linux/cdev.h>
>>>>>>>>>   #include <linux/types.h>
>>>>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>>>>> +
>>>>>>>>>   struct dma_heap;
>>>>>>>>> +struct dma_heap_file_task;
>>>>>>>>> +struct dma_heap_file;
>>>>>>>>>     /**
>>>>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>>>>    * @allocate:        allocate dmabuf and return struct 
>>>>>>>>> dma_buf ptr
>>>>>>>>> - *
>>>>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>>>>> return struct
>>>>>>>>> + * dma_buf ptr.
>>>>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on 
>>>>>>>>> error.
>>>>>>>>>    */
>>>>>>>>>   struct dma_heap_ops {
>>>>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>>>>                       unsigned long len,
>>>>>>>>>                       u32 fd_flags,
>>>>>>>>>                       u64 heap_flags);
>>>>>>>>> +
>>>>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>>>>> +                          u32 fd_flags,
>>>>>>>>> +                          u64 heap_flags);
>>>>>>>>>   };
>>>>>>>>>     /**
>>>>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct 
>>>>>>>>> dma_heap *heap);
>>>>>>>>>    */
>>>>>>>>>   struct dma_heap *dma_heap_add(const struct 
>>>>>>>>> dma_heap_export_info *exp_info);
>>>>>>>>>   +/**
>>>>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>>>>> complete then destroy it
>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>> + */
>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to 
>>>>>>>>> complete
>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>> + */
>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file 
>>>>>>>>> when allocate pages.
>>>>>>>>> + * @heap_file:        target file to read
>>>>>>>>> + *
>>>>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * dma_heap_prepare_file_read - cache each allocated page 
>>>>>>>>> until we meet this batch.
>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>> + * @page:        current allocated page. don't care which order.
>>>>>>>>> + *
>>>>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>>>>> + */
>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask,
>>>>>>>>> +                struct page *page);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * dma_heap_commit_file_read -  prepare collect enough 
>>>>>>>>> memory, going to trigger IO
>>>>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>>>>> + *
>>>>>>>>> + * This commit will also check if reach to tail read.
>>>>>>>>> + * For direct I/O submissions, it is necessary to pay 
>>>>>>>>> attention to file reads
>>>>>>>>> + * that are not page-aligned. For the unaligned portion of 
>>>>>>>>> the read, buffer IO
>>>>>>>>> + * needs to be triggered.
>>>>>>>>> + * Returns:
>>>>>>>>> + *   0 if all right, -errno if something wrong
>>>>>>>>> + */
>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>> *heap_ftask);
>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>>>>> +
>>>>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>>>>       __u64 heap_flags;
>>>>>>>>>   };
>>>>>>>>>   +/**
>>>>>>>>> + * struct dma_heap_allocation_file_data - metadata passed 
>>>>>>>>> from userspace for
>>>>>>>>> + * allocations and read file
>>>>>>>>> + * @fd:            will be populated with a fd which provides 
>>>>>>>>> the
>>>>>>>>> + *            handle to the allocated dma-buf
>>>>>>>>> + * @file_fd:        file descriptor to read from(suggested to 
>>>>>>>>> use O_DIRECT open file)
>>>>>>>>> + * @batch:        how many memory alloced then file 
>>>>>>>>> read(bytes), default 128MB
>>>>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>>>>>> + * @heap_flags:        flags passed to heap
>>>>>>>>> + *
>>>>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>>>>> + */
>>>>>>>>> +struct dma_heap_allocation_file_data {
>>>>>>>>> +    __u32 fd;
>>>>>>>>> +    __u32 file_fd;
>>>>>>>>> +    __u32 batch;
>>>>>>>>> +    __u32 fd_flags;
>>>>>>>>> +    __u64 heap_flags;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>>>>     /**
>>>>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>>>>                         struct dma_heap_allocation_data)
>>>>>>>>>   +/**
>>>>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from 
>>>>>>>>> pool and both
>>>>>>>>> + *                    read file when allocate memory.
>>>>>>>>> + *
>>>>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns 
>>>>>>>>> it with the fd field
>>>>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>>>>> return, the dma-buf
>>>>>>>>> + * content is read from file.
>>>>>>>>> + */
>>>>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>>>>> dma_heap_allocation_file_data)
>>>>>>>>> +
>>>>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>>>>
>>>>>>
>>>
>
Christian König July 12, 2024, 10:59 a.m. UTC | #10
Am 12.07.24 um 09:52 schrieb Huan Yang:
>
> 在 2024/7/12 15:41, Christian König 写道:
>> Am 12.07.24 um 09:29 schrieb Huan Yang:
>>> Hi Christian,
>>>
>>> 在 2024/7/12 15:10, Christian König 写道:
>>>> Am 12.07.24 um 04:14 schrieb Huan Yang:
>>>>> 在 2024/7/12 9:59, Huan Yang 写道:
>>>>>> Hi Christian,
>>>>>>
>>>>>> 在 2024/7/11 19:39, Christian König 写道:
>>>>>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>> Thanks for your reply.
>>>>>>>>
>>>>>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>>>>> Some user may need load file into dma-buf, current
>>>>>>>>>> way is:
>>>>>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>>>>>> This is too heavy if fsz reached to GB.
>>>>>>>>>
>>>>>>>>> You need to describe a bit more why that is to heavy. I can 
>>>>>>>>> only assume you need to save memory bandwidth and avoid the 
>>>>>>>>> extra copy with the CPU.
>>>>>>>>
>>>>>>>> Sorry for the oversimplified explanation. But, yes, you're 
>>>>>>>> right, we want to avoid this.
>>>>>>>>
>>>>>>>> As we are dealing with embedded devices, the available memory 
>>>>>>>> and computing power for users are usually limited.(The maximum 
>>>>>>>> available memory is currently
>>>>>>>>
>>>>>>>> 24GB, typically ranging from 8-12GB. )
>>>>>>>>
>>>>>>>> Also, the CPU computing power is also usually in short supply, 
>>>>>>>> due to limited battery capacity and limited heat dissipation 
>>>>>>>> capabilities.
>>>>>>>>
>>>>>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>> This patch implement a feature called 
>>>>>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>>>>> User need to offer a file_fd which you want to load into 
>>>>>>>>>> dma-buf, then,
>>>>>>>>>> it promise if you got a dma-buf fd, it will contains the file 
>>>>>>>>>> content.
>>>>>>>>>
>>>>>>>>> Interesting idea, that has at least more potential than trying 
>>>>>>>>> to enable direct I/O on mmap()ed DMA-bufs.
>>>>>>>>>
>>>>>>>>> The approach with the new IOCTL might not work because it is a 
>>>>>>>>> very specialized use case.
>>>>>>>>
>>>>>>>> Thank you for your advice. maybe the "read file" behavior can 
>>>>>>>> be attached to an existing allocation?
>>>>>>>
>>>>>>> The point is there are already system calls to do something like 
>>>>>>> that.
>>>>>>>
>>>>>>> See copy_file_range() 
>>>>>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
>>>>>>> and sendfile() 
>>>>>>> (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>>>>>
>>>>>> That's helpfull to learn it, thanks.
>>>>>>
>>>>>> In terms of only DMA-BUF supporting direct I/O, 
>>>>>> copy_file_range/sendfile may help to achieve this functionality.
>>>>>>
>>>>>> However, my patchset also aims to achieve parallel copying of 
>>>>>> file contents while allocating the DMA-BUF, which is something 
>>>>>> that the current set of calls may not be able to accomplish.
>>>>
>>>> And exactly that is a no-go. Use the existing IOCTLs and system 
>>>> calls instead they should have similar performance when done right.
>>>
>>> Get it, but In my testing process, even without memory pressure, it 
>>> takes about 60ms to allocate a 3GB DMA-BUF. When there is 
>>> significant memory pressure, the allocation time for a 3GB
>>
>> Well exactly that doesn't make sense. Even if you read the content of 
>> the DMA-buf from a file you still need to allocate it first.
>
> Yes, need allocate first, but in kernelspace, no need to wait all 
> memory allocated done and then trigger file load.

That doesn't really make sense. Allocating one large chunk of memory is 
more efficient than allocating smaller chunks multiple times, for 
example because of cache locality.

You could of course hide the latency caused by operations that reduce 
memory pressure when you have a specific use case, but you don't need an 
in-kernel implementation for that.

The question is: do you have clear-on-allocation or clear-on-free enabled?

> This patchset use `batch` to done(default 128MB), ever 128MB 
> allocated, vmap and get vaddr, then trigger this vaddr load file's 
> target pos content.

Again, that really doesn't sound ideal to me. Creating the vmap alone is 
completely unnecessary overhead.

>> So the question is why should reading and allocating it at the same 
>> time be better in any way?
>
> Memory pressure will trigger reclaim, it must to wait.(ms) Asume I 
> already allocated 512MB(need 3G) without enter slowpath,
>
> Even I need to enter slowpath to allocated remain memory, the already 
> allocated memory is using load file content.(Save time compare to 
> allocated done and read)
>
> The time difference between them can be expressed by the formula:
>
> 1. Allocate dmabuf time + file load time -- for original
>
> 2. first prepare batch time + Max(file load time, allocate remain 
> dma-buf time) + latest batch prepare time -- for new
>
>  When the file reaches the gigabyte level, the significant difference 
> between the two can be clearly observed.

I have strong doubts about that. The method you describe above is 
actually really inefficient.

First of all, you create a memory mapping just to load data; that is 
superfluous, and TLB flushes are usually extremely costly, both for 
userspace and for the kernel.

I strongly suggest trying copy_file_range() instead. It could be, 
however, that copy_file_range() doesn't even work right now because of 
some restrictions; I have never tried that on a DMA-buf.

Once that works, as far as I can see, the overhead that could still be 
saved is the following:

1. Clearing of memory on allocation. That could potentially be avoided 
by using delayed allocation or clear-on-free instead.

2. CPU copy between the I/O target buffer and the DMA-buf backing pages. 
In theory it should be possible to avoid that by implementing the 
copy_file_range() callback, but I'm not 100% sure.

Regards,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>>
>>>
>>> DMA-BUF can increase to 300ms-1s. (The above test times can also 
>>> demonstrate the difference.)
>>>
>>> But, talk is cheap, I agree to research use existing way to 
>>> implements it and give a test.
>>>
>>> I'll show this if I done .
>>>
>>> Thanks for your suggestions.
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> You can see cover-letter, here are the normal test and this 
>>>>> IOCTL's compare in memory pressure, even if buffered I/O in this 
>>>>> ioctl can have 50% improve by  parallel.
>>>>>
>>>>> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G 
>>>>> memory pressure.
>>>>>
>>>>> 1. original
>>>>> ```shell
>>>>> # create a model file
>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>> # drop page cache
>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal
>>>>>
>>>>>> result is total cost 13087213847ns
>>>>>
>>>>> ```
>>>>>
>>>>> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
>>>>> ```shell
>>>>> # create a model file
>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>> # drop page cache
>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>>>>>
>>>>>> result is total cost 2902386846ns
>>>>>
>>>>> # use direct_io_check can check the content if is same to file.
>>>>> ```
>>>>>
>>>>> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
>>>>> ```shell
>>>>> # create a model file
>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>> # drop page cache
>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>>>>>
>>>>>> result is total cost 5735579385ns
>>>>>
>>>>> ```
>>>>>
>>>>>>
>>>>>> Perhaps simply returning the DMA-BUF file descriptor and then 
>>>>>> implementing copy_file_range, while populating the memory and 
>>>>>> content during the copy process, could achieve this? At present, 
>>>>>> it seems that it will be quite complex - We need to ensure that 
>>>>>> only the returned DMA-BUF file descriptor will fail in case of 
>>>>>> memory not fill, like mmap, vmap, attach, and so on.
>>>>>>
>>>>>>>
>>>>>>> What we probably could do is to internally optimize those.
>>>>>>>
>>>>>>>> I am currently creating a new ioctl to remind the user that 
>>>>>>>> memory is being allocated and read, and I am also unsure
>>>>>>>>
>>>>>>>> whether it is appropriate to add additional parameters to the 
>>>>>>>> existing allocate behavior.
>>>>>>>>
>>>>>>>> Please, give me more suggestion. Thanks.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> But IIRC there was a copy_file_range callback in the 
>>>>>>>>> file_operations structure you could use for that. I'm just not 
>>>>>>>>> sure when and how that's used with the copy_file_range() 
>>>>>>>>> system call.
>>>>>>>>
>>>>>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>>>>>> However, this type of callback function is not currently 
>>>>>>>> implemented when exporting
>>>>>>>>
>>>>>>>> the dma_buf file, which means that I need to implement the 
>>>>>>>> callback for it?
>>>>>>>
>>>>>>> If I'm not completely mistaken the copy_file_range, splice_read 
>>>>>>> and splice_write callbacks on the struct file_operations 
>>>>>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>>>>>>
>>>>>>> Can be used to implement what you want to do.
>>>>>> Yes.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Notice, file_fd depends on user how to open this file. So, 
>>>>>>>>>> both buffer
>>>>>>>>>> I/O and Direct I/O is supported.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>>>>>> ---
>>>>>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/dma-buf/dma-heap.c 
>>>>>>>>>> b/drivers/dma-buf/dma-heap.c
>>>>>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>>>>>> @@ -15,9 +15,11 @@
>>>>>>>>>>   #include <linux/list.h>
>>>>>>>>>>   #include <linux/slab.h>
>>>>>>>>>>   #include <linux/nospec.h>
>>>>>>>>>> +#include <linux/highmem.h>
>>>>>>>>>>   #include <linux/uaccess.h>
>>>>>>>>>>   #include <linux/syscalls.h>
>>>>>>>>>>   #include <linux/dma-heap.h>
>>>>>>>>>> +#include <linux/vmalloc.h>
>>>>>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>>>>>     #define DEVNAME "dma_heap"
>>>>>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>>>>>       struct cdev heap_cdev;
>>>>>>>>>>   };
>>>>>>>>>>   +/**
>>>>>>>>>> + * struct dma_heap_file - wrap the file, read task for 
>>>>>>>>>> dma_heap allocate use.
>>>>>>>>>> + * @file:        file to read from.
>>>>>>>>>> + *
>>>>>>>>>> + * @cred:        kthread use, user cred copy to use for the 
>>>>>>>>>> read.
>>>>>>>>>> + *
>>>>>>>>>> + * @max_batch:        maximum batch size to read, if collect 
>>>>>>>>>> match batch,
>>>>>>>>>> + *            trigger read, default 128MB, must below file 
>>>>>>>>>> size.
>>>>>>>>>> + *
>>>>>>>>>> + * @fsz:        file size.
>>>>>>>>>> + *
>>>>>>>>>> + * @direct:        use direct IO?
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_file {
>>>>>>>>>> +    struct file *file;
>>>>>>>>>> +    struct cred *cred;
>>>>>>>>>> +    size_t max_batch;
>>>>>>>>>> +    size_t fsz;
>>>>>>>>>> +    bool direct;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * struct dma_heap_file_work - represents a dma_heap file 
>>>>>>>>>> read real work.
>>>>>>>>>> + * @vaddr:        contigous virtual address alloc by vmap, 
>>>>>>>>>> file read need.
>>>>>>>>>> + *
>>>>>>>>>> + * @start_size:        file read start offset, same to 
>>>>>>>>>> @dma_heap_file_task->roffset.
>>>>>>>>>> + *
>>>>>>>>>> + * @need_size:        file read need size, same to 
>>>>>>>>>> @dma_heap_file_task->rsize.
>>>>>>>>>> + *
>>>>>>>>>> + * @heap_file:        file wrapper.
>>>>>>>>>> + *
>>>>>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>>>>>> + *
>>>>>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of 
>>>>>>>>>> read, put ref.
>>>>>>>>>> + *
>>>>>>>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>>>>>>>> @dma_heap_file_task->fail.
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_file_work {
>>>>>>>>>> +    void *vaddr;
>>>>>>>>>> +    ssize_t start_size;
>>>>>>>>>> +    ssize_t need_size;
>>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>>> +    struct list_head list;
>>>>>>>>>> +    atomic_t *refp;
>>>>>>>>>> +    bool *failp;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * struct dma_heap_file_task - represents a dma_heap file 
>>>>>>>>>> read process
>>>>>>>>>> + * @ref:        current file work counter, if zero, allocate 
>>>>>>>>>> and read
>>>>>>>>>> + *            done.
>>>>>>>>>> + *
>>>>>>>>>> + * @roffset:        last read offset, current prepared work' 
>>>>>>>>>> begin file
>>>>>>>>>> + *            start offset.
>>>>>>>>>> + *
>>>>>>>>>> + * @rsize:        current allocated page size use to read, 
>>>>>>>>>> if reach rbatch,
>>>>>>>>>> + *            trigger commit.
>>>>>>>>>> + *
>>>>>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>>>>>> @dma_heap_file's
>>>>>>>>>> + *            batch.
>>>>>>>>>> + *
>>>>>>>>>> + * @heap_file:        current dma_heap_file
>>>>>>>>>> + *
>>>>>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>>>>>> batch's number
>>>>>>>>>> + *            pages.(this is maximum). Due to single thread 
>>>>>>>>>> file read,
>>>>>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>>>>>> + *
>>>>>>>>>> + * @pindex:        current allocated page filled in 
>>>>>>>>>> @parray's index.
>>>>>>>>>> + *
>>>>>>>>>> + * @fail:        any work failed when file read?
>>>>>>>>>> + *
>>>>>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>>>>>> prepare each work
>>>>>>>>>> + * during allocate dma_buf pages, if match current batch, 
>>>>>>>>>> then trigger commit
>>>>>>>>>> + * and prepare next work. After all batch queued, user going 
>>>>>>>>>> on prepare dma_buf
>>>>>>>>>> + * and so on, but before return dma_buf fd, need to wait 
>>>>>>>>>> file read end and
>>>>>>>>>> + * check read result.
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_file_task {
>>>>>>>>>> +    atomic_t ref;
>>>>>>>>>> +    size_t roffset;
>>>>>>>>>> +    size_t rsize;
>>>>>>>>>> +    size_t rbatch;
>>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>>> +    struct page **parray;
>>>>>>>>>> +    unsigned int pindex;
>>>>>>>>>> +    bool fail;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * struct dma_heap_file_control - global control of dma_heap 
>>>>>>>>>> file read.
>>>>>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>>>>>> + *
>>>>>>>>>> + * @lock:        only lock for @works.
>>>>>>>>>> + *
>>>>>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>>>>>> work, @work_thread
>>>>>>>>>> + *            wakeup and read this work's file contains.
>>>>>>>>>> + *
>>>>>>>>>> + * @workwq:        used for main thread wait for file read 
>>>>>>>>>> end, if allocation
>>>>>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>>>>>> effect this.
>>>>>>>>>> + *
>>>>>>>>>> + * @work_thread:    file read kthread. the 
>>>>>>>>>> dma_heap_file_task work's consumer.
>>>>>>>>>> + *
>>>>>>>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>>>>>>>> alloc/free frequently.
>>>>>>>>>> + *
>>>>>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_file_control {
>>>>>>>>>> +    struct list_head works;
>>>>>>>>>> +    spinlock_t lock;
>>>>>>>>>> +    wait_queue_head_t threadwq;
>>>>>>>>>> +    wait_queue_head_t workwq;
>>>>>>>>>> +    struct task_struct *work_thread;
>>>>>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>>>>>> +    atomic_t nr_work;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>>>>>   static LIST_HEAD(heap_list);
>>>>>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>>>>>   static dev_t dma_heap_devt;
>>>>>>>>>>   static struct class *dma_heap_class;
>>>>>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>>>>>   +/**
>>>>>>>>>> + * map_pages_to_vaddr - map each scatter page into 
>>>>>>>>>> contiguous virtual address.
>>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>>> + *
>>>>>>>>>> + * Cached pages need to trigger file read, this function map 
>>>>>>>>>> each scatter page
>>>>>>>>>> + * into contiguous virtual address, so that file read can 
>>>>>>>>>> easy use.
>>>>>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>>>>>> original user, so we
>>>>>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>>>>>> + */
>>>>>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask)
>>>>>>>>>> +{
>>>>>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>>>>>>>> +            PAGE_KERNEL);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask,
>>>>>>>>>> +                struct page *page)
>>>>>>>>>> +{
>>>>>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>>>>>> +    int index = heap_ftask->pindex;
>>>>>>>>>> +    int num = compound_nr(page), i;
>>>>>>>>>> +    unsigned long sz = page_size(page);
>>>>>>>>>> +
>>>>>>>>>> +    heap_ftask->rsize += sz;
>>>>>>>>>> +    for (i = 0; i < num; ++i)
>>>>>>>>>> +        array[index++] = &page[i];
>>>>>>>>>> +    heap_ftask->pindex = index;
>>>>>>>>>> +
>>>>>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static struct dma_heap_file_work *
>>>>>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>>> +
>>>>>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>>>>>> +        return NULL;
>>>>>>>>>> +
>>>>>>>>>> +    heap_fwork = 
>>>>>>>>>> kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
>>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>>> +        return NULL;
>>>>>>>>>> +
>>>>>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>>> +        return NULL;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>>>>>> +    return heap_fwork;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>>>>>> *heap_fwork)
>>>>>>>>>> +{
>>>>>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>>>>>> +
>>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>>>>>> init_file_work(heap_ftask);
>>>>>>>>>> +    struct page *last = NULL;
>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>>>>>> +
>>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +    /**
>>>>>>>>>> +     * If file size is not page aligned, direct io can't 
>>>>>>>>>> process the tail.
>>>>>>>>>> +     * So, if reach to tail, remain the last page use buffer 
>>>>>>>>>> read.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>>>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>>>>>> +
>>>>>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>>>>>> +
>>>>>>>>>> +    if (last) {
>>>>>>>>>> +        char *buf, *pathp;
>>>>>>>>>> +        ssize_t err;
>>>>>>>>>> +        void *buffer;
>>>>>>>>>> +
>>>>>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>>>>>> +        if (unlikely(!buf))
>>>>>>>>>> +            return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>>>>>> +
>>>>>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>>>>>> +        if (IS_ERR(pathp)) {
>>>>>>>>>> +            kfree(buf);
>>>>>>>>>> +            return PTR_ERR(pathp);
>>>>>>>>>> +        }
>>>>>>>>>> +
>>>>>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>>>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>>>>>>>> +                         fsz - start, &fsz,
>>>>>>>>>> +                         READING_POLICY);
>>>>>>>>>> +        kunmap_local(buffer);
>>>>>>>>>> +        kfree(buf);
>>>>>>>>>> +        if (err < 0) {
>>>>>>>>>> +            pr_err("failed to use buffer kernel_read_file 
>>>>>>>>>> %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>>>>>> +
>>>>>>>>>> +            return err;
>>>>>>>>>> +        }
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>>>>>> +    heap_ftask->rsize = 0;
>>>>>>>>>> +    heap_ftask->pindex = 0;
>>>>>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>>>>>> +                   heap_ftask->rbatch);
>>>>>>>>>> +    return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask)
>>>>>>>>>> +{
>>>>>>>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>>>>>>>> + atomic_read(&heap_ftask->ref) == 0);
>>>>>>>>>> +    return heap_ftask->fail;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask)
>>>>>>>>>> +{
>>>>>>>>>> +    bool fail;
>>>>>>>>>> +
>>>>>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>>>>>> +    fail = heap_ftask->fail;
>>>>>>>>>> +    kvfree(heap_ftask->parray);
>>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>>> +    return fail;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>>>>>> +    if (unlikely(!heap_ftask))
>>>>>>>>>> +        return NULL;
>>>>>>>>>> +
>>>>>>>>>> +    /**
>>>>>>>>>> +     * Batch is the maximum size which we prepare work will 
>>>>>>>>>> meet.
>>>>>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>>>>>> +     */
>>>>>>>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch 
>>>>>>>>>> >> PAGE_SHIFT,
>>>>>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>>>>>> +        goto put;
>>>>>>>>>> +
>>>>>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>>>>>> +    return heap_ftask;
>>>>>>>>>> +put:
>>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>>> +    return NULL;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void __work_this_io(struct dma_heap_file_work 
>>>>>>>>>> *heap_fwork)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>>>>>> +    const struct cred *old_cred;
>>>>>>>>>> +    ssize_t err;
>>>>>>>>>> +
>>>>>>>>>> +    // use real task's cred to read this file.
>>>>>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>>>>>> &heap_file->fsz,
>>>>>>>>>> +                   READING_POLICY);
>>>>>>>>>> +    if (err < 0) {
>>>>>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>>>>>> f_sz=%ld\n",
>>>>>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>>>>>> +    }
>>>>>>>>>> +    // recovery to my cred.
>>>>>>>>>> +    revert_creds(old_cred);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>>>>>> +    int nr_work;
>>>>>>>>>> +
>>>>>>>>>> +    LIST_HEAD(pages);
>>>>>>>>>> +    LIST_HEAD(workers);
>>>>>>>>>> +
>>>>>>>>>> +    while (true) {
>>>>>>>>>> + wait_event_freezable(heap_fctl->threadwq,
>>>>>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>>>>>> +recheck:
>>>>>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>>>>>> + list_splice_init(&heap_fctl->works, &workers);
>>>>>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>>>>>> +
>>>>>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>>>>> list) {
>>>>>>>>>> +                list_del(&worker->list);
>>>>>>>>>> +                destroy_file_work(worker);
>>>>>>>>>> +            }
>>>>>>>>>> +            break;
>>>>>>>>>> +        }
>>>>>>>>>> +
>>>>>>>>>> +        nr_work = 0;
>>>>>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>>>>>>>> +            ++nr_work;
>>>>>>>>>> +            list_del(&worker->list);
>>>>>>>>>> +            __work_this_io(worker);
>>>>>>>>>> +
>>>>>>>>>> +            destroy_file_work(worker);
>>>>>>>>>> +        }
>>>>>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>>>>>> +
>>>>>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>>>>>> +            goto recheck;
>>>>>>>>>> +    }
>>>>>>>>>> +    return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>>>>>> +{
>>>>>>>>>> +    return heap_file->fsz;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>>>>>> *heap_file, int file_fd,
>>>>>>>>>> +                 size_t batch)
>>>>>>>>>> +{
>>>>>>>>>> +    struct file *file;
>>>>>>>>>> +    size_t fsz;
>>>>>>>>>> +    int ret;
>>>>>>>>>> +
>>>>>>>>>> +    file = fget(file_fd);
>>>>>>>>>> +    if (!file)
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>>>>>> +    if (fsz < batch) {
>>>>>>>>>> +        ret = -EINVAL;
>>>>>>>>>> +        goto err;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    /**
>>>>>>>>>> +     * Selinux block our read, but actually we are reading 
>>>>>>>>>> the stand-in
>>>>>>>>>> +     * for this file.
>>>>>>>>>> +     * So save current's cred and when going to read, 
>>>>>>>>>> override mine, and
>>>>>>>>>> +     * end of read, revert.
>>>>>>>>>> +     */
>>>>>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>> +        goto err;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    heap_file->file = file;
>>>>>>>>>> +    heap_file->max_batch = batch;
>>>>>>>>>> +    heap_file->fsz = fsz;
>>>>>>>>>> +
>>>>>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>>>>>> +
>>>>>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to 
>>>>>>>>>> read larget file\n");
>>>>>>>>>> +
>>>>>>>>>> +    return 0;
>>>>>>>>>> +
>>>>>>>>>> +err:
>>>>>>>>>> +    fput(file);
>>>>>>>>>> +    return ret;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file 
>>>>>>>>>> *heap_file)
>>>>>>>>>> +{
>>>>>>>>>> +    fput(heap_file->file);
>>>>>>>>>> +    put_cred(heap_file->cred);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>>>>>> *heap, int file_fd,
>>>>>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>>>>>> +                       unsigned int heap_flags)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_buf *dmabuf;
>>>>>>>>>> +    int fd;
>>>>>>>>>> +    struct dma_heap_file heap_file;
>>>>>>>>>> +
>>>>>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>>>>>> +    if (fd)
>>>>>>>>>> +        goto error_file;
>>>>>>>>>> +
>>>>>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>>>>>>>> fd_flags,
>>>>>>>>>> +                           heap_flags);
>>>>>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>>>>>> +        goto error;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>>>>>> +    if (fd < 0) {
>>>>>>>>>> +        dma_buf_put(dmabuf);
>>>>>>>>>> +        /* just return, as put will call release and that 
>>>>>>>>>> will free */
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +error:
>>>>>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>>>>>> +error_file:
>>>>>>>>>> +    return fd;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, 
>>>>>>>>>> size_t len,
>>>>>>>>>>                    u32 fd_flags,
>>>>>>>>>>                    u64 heap_flags)
>>>>>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode 
>>>>>>>>>> *inode, struct file *file)
>>>>>>>>>>       return 0;
>>>>>>>>>>   }
>>>>>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>>>>>> *file, void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dma_heap_allocation_file_data 
>>>>>>>>>> *heap_allocation_file = data;
>>>>>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>>>>>> +    int fd;
>>>>>>>>>> +
>>>>>>>>>> +    if (heap_allocation_file->fd || 
>>>>>>>>>> !heap_allocation_file->file_fd)
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +    if (heap_allocation_file->fd_flags & 
>>>>>>>>>> ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>>>>>> +        heap_allocation_file->batch ?
>>>>>>>>>> + PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>>>>>> +        heap_allocation_file->fd_flags,
>>>>>>>>>> +        heap_allocation_file->heap_flags);
>>>>>>>>>> +    if (fd < 0)
>>>>>>>>>> +        return fd;
>>>>>>>>>> +
>>>>>>>>>> +    heap_allocation_file->fd = fd;
>>>>>>>>>> +    return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, void 
>>>>>>>>>> *data)
>>>>>>>>>>   {
>>>>>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>>>>>> @@ -121,6 +605,7 @@ static long 
>>>>>>>>>> dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>>>>>   };
>>>>>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned 
>>>>>>>>>> int ucmd,
>>>>>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file 
>>>>>>>>>> *file, unsigned int ucmd,
>>>>>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>>>>>           break;
>>>>>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>>>>>> +        break;
>>>>>>>>>>       default:
>>>>>>>>>>           ret = -ENOTTY;
>>>>>>>>>>           goto err;
>>>>>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>>>>>> -        unregister_chrdev_region(dma_heap_devt, 
>>>>>>>>>> NUM_HEAP_MINORS);
>>>>>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>>>>>> +        goto fail_class;
>>>>>>>>>>       }
>>>>>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>>>>>> +        ret =  -ENOMEM;
>>>>>>>>>> +        goto fail_alloc;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>>>>>> + init_waitqueue_head(&heap_fctl->threadwq);
>>>>>>>>>> + init_waitqueue_head(&heap_fctl->workwq);
>>>>>>>>>> +
>>>>>>>>>> +    heap_fctl->work_thread = 
>>>>>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>> +        goto fail_thread;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>> +        goto fail_cache;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>>       return 0;
>>>>>>>>>> +
>>>>>>>>>> +fail_cache:
>>>>>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>>>>>> +fail_thread:
>>>>>>>>>> +    kfree(heap_fctl);
>>>>>>>>>> +fail_alloc:
>>>>>>>>>> +    class_destroy(dma_heap_class);
>>>>>>>>>> +fail_class:
>>>>>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>>>> +    return ret;
>>>>>>>>>>   }
>>>>>>>>>>   subsys_initcall(dma_heap_init);
>>>>>>>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>>>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>>>>>> --- a/include/linux/dma-heap.h
>>>>>>>>>> +++ b/include/linux/dma-heap.h
>>>>>>>>>> @@ -12,12 +12,17 @@
>>>>>>>>>>   #include <linux/cdev.h>
>>>>>>>>>>   #include <linux/types.h>
>>>>>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>>>>>> +
>>>>>>>>>>   struct dma_heap;
>>>>>>>>>> +struct dma_heap_file_task;
>>>>>>>>>> +struct dma_heap_file;
>>>>>>>>>>     /**
>>>>>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>>>>>    * @allocate:        allocate dmabuf and return struct 
>>>>>>>>>> dma_buf ptr
>>>>>>>>>> - *
>>>>>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>>>>>> return struct
>>>>>>>>>> + * dma_buf ptr.
>>>>>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on 
>>>>>>>>>> error.
>>>>>>>>>>    */
>>>>>>>>>>   struct dma_heap_ops {
>>>>>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>>>>>                       unsigned long len,
>>>>>>>>>>                       u32 fd_flags,
>>>>>>>>>>                       u64 heap_flags);
>>>>>>>>>> +
>>>>>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap 
>>>>>>>>>> *heap,
>>>>>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>>>>>> +                          u32 fd_flags,
>>>>>>>>>> +                          u64 heap_flags);
>>>>>>>>>>   };
>>>>>>>>>>     /**
>>>>>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct 
>>>>>>>>>> dma_heap *heap);
>>>>>>>>>>    */
>>>>>>>>>>   struct dma_heap *dma_heap_add(const struct 
>>>>>>>>>> dma_heap_export_info *exp_info);
>>>>>>>>>>   +/**
>>>>>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>>>>>> complete then destroy it
>>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>>> + */
>>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to 
>>>>>>>>>> complete
>>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>>> + */
>>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file 
>>>>>>>>>> when allocate pages.
>>>>>>>>>> + * @heap_file:        target file to read
>>>>>>>>>> + *
>>>>>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * dma_heap_prepare_file_read - cache each allocated page 
>>>>>>>>>> until we meet this batch.
>>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>>> + * @page:        current allocated page. don't care which 
>>>>>>>>>> order.
>>>>>>>>>> + *
>>>>>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>>>>>> + */
>>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask,
>>>>>>>>>> +                struct page *page);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * dma_heap_commit_file_read -  prepare collect enough 
>>>>>>>>>> memory, going to trigger IO
>>>>>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>>>>>> + *
>>>>>>>>>> + * This commit will also check if reach to tail read.
>>>>>>>>>> + * For direct I/O submissions, it is necessary to pay 
>>>>>>>>>> attention to file reads
>>>>>>>>>> + * that are not page-aligned. For the unaligned portion of 
>>>>>>>>>> the read, buffer IO
>>>>>>>>>> + * needs to be triggered.
>>>>>>>>>> + * Returns:
>>>>>>>>>> + *   0 if all right, -errno if something wrong
>>>>>>>>>> + */
>>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>>> *heap_ftask);
>>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>>>>>> +
>>>>>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>>>>>       __u64 heap_flags;
>>>>>>>>>>   };
>>>>>>>>>>   +/**
>>>>>>>>>> + * struct dma_heap_allocation_file_data - metadata passed 
>>>>>>>>>> from userspace for
>>>>>>>>>> + * allocations and read file
>>>>>>>>>> + * @fd:            will be populated with a fd which 
>>>>>>>>>> provides the
>>>>>>>>>> + *            handle to the allocated dma-buf
>>>>>>>>>> + * @file_fd:        file descriptor to read from(suggested 
>>>>>>>>>> to use O_DIRECT open file)
>>>>>>>>>> + * @batch:        how many memory alloced then file 
>>>>>>>>>> read(bytes), default 128MB
>>>>>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>>>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>>>>>>>> + * @heap_flags:        flags passed to heap
>>>>>>>>>> + *
>>>>>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>>>>>> + */
>>>>>>>>>> +struct dma_heap_allocation_file_data {
>>>>>>>>>> +    __u32 fd;
>>>>>>>>>> +    __u32 file_fd;
>>>>>>>>>> +    __u32 batch;
>>>>>>>>>> +    __u32 fd_flags;
>>>>>>>>>> +    __u64 heap_flags;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>>>>>     /**
>>>>>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>>>>>                         struct dma_heap_allocation_data)
>>>>>>>>>>   +/**
>>>>>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from 
>>>>>>>>>> pool and both
>>>>>>>>>> + *                    read file when allocate memory.
>>>>>>>>>> + *
>>>>>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns 
>>>>>>>>>> it with the fd field
>>>>>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>>>>>> return, the dma-buf
>>>>>>>>>> + * content is read from file.
>>>>>>>>>> + */
>>>>>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>>>>>> dma_heap_allocation_file_data)
>>>>>>>>>> +
>>>>>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>>>>>
>>>>>>>
>>>>
>>
Huan Yang July 12, 2024, 11:12 a.m. UTC | #11
在 2024/7/12 18:59, Christian König 写道:
> Am 12.07.24 um 09:52 schrieb Huan Yang:
>>
>> 在 2024/7/12 15:41, Christian König 写道:
>>> Am 12.07.24 um 09:29 schrieb Huan Yang:
>>>> Hi Christian,
>>>>
>>>> 在 2024/7/12 15:10, Christian König 写道:
>>>>> Am 12.07.24 um 04:14 schrieb Huan Yang:
>>>>>> 在 2024/7/12 9:59, Huan Yang 写道:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>> 在 2024/7/11 19:39, Christian König 写道:
>>>>>>>> Am 11.07.24 um 11:18 schrieb Huan Yang:
>>>>>>>>> Hi Christian,
>>>>>>>>>
>>>>>>>>> Thanks for your reply.
>>>>>>>>>
>>>>>>>>> 在 2024/7/11 17:00, Christian König 写道:
>>>>>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>>>>>> Some user may need load file into dma-buf, current
>>>>>>>>>>> way is:
>>>>>>>>>>>    1. allocate a dma-buf, get dma-buf fd
>>>>>>>>>>>    2. mmap dma-buf fd into vaddr
>>>>>>>>>>>    3. read(file_fd, vaddr, fsz)
>>>>>>>>>>> This is too heavy if fsz reached to GB.
>>>>>>>>>>
>>>>>>>>>> You need to describe a bit more why that is to heavy. I can 
>>>>>>>>>> only assume you need to save memory bandwidth and avoid the 
>>>>>>>>>> extra copy with the CPU.
>>>>>>>>>
>>>>>>>>> Sorry for the oversimplified explanation. But, yes, you're 
>>>>>>>>> right, we want to avoid this.
>>>>>>>>>
>>>>>>>>> As we are dealing with embedded devices, the available memory 
>>>>>>>>> and computing power for users are usually limited.(The maximum 
>>>>>>>>> available memory is currently
>>>>>>>>>
>>>>>>>>> 24GB, typically ranging from 8-12GB. )
>>>>>>>>>
>>>>>>>>> Also, the CPU computing power is also usually in short supply, 
>>>>>>>>> due to limited battery capacity and limited heat dissipation 
>>>>>>>>> capabilities.
>>>>>>>>>
>>>>>>>>> So, we hope to avoid ineffective paths as much as possible.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> This patch implement a feature called 
>>>>>>>>>>> DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>>>>>> User need to offer a file_fd which you want to load into 
>>>>>>>>>>> dma-buf, then,
>>>>>>>>>>> it promise if you got a dma-buf fd, it will contains the 
>>>>>>>>>>> file content.
>>>>>>>>>>
>>>>>>>>>> Interesting idea, that has at least more potential than 
>>>>>>>>>> trying to enable direct I/O on mmap()ed DMA-bufs.
>>>>>>>>>>
>>>>>>>>>> The approach with the new IOCTL might not work because it is 
>>>>>>>>>> a very specialized use case.
>>>>>>>>>
>>>>>>>>> Thank you for your advice. maybe the "read file" behavior can 
>>>>>>>>> be attached to an existing allocation?
>>>>>>>>
>>>>>>>> The point is there are already system calls to do something 
>>>>>>>> like that.
>>>>>>>>
>>>>>>>> See copy_file_range() 
>>>>>>>> (https://man7.org/linux/man-pages/man2/copy_file_range.2.html) 
>>>>>>>> and send_file() 
>>>>>>>> (https://man7.org/linux/man-pages/man2/sendfile.2.html).
>>>>>>>
>>>>>>> That's helpfull to learn it, thanks.
>>>>>>>
>>>>>>> In terms of only DMA-BUF supporting direct I/O, 
>>>>>>> copy_file_range/send_file may help to achieve this functionality.
>>>>>>>
>>>>>>> However, my patchset also aims to achieve parallel copying of 
>>>>>>> file contents while allocating the DMA-BUF, which is something 
>>>>>>> that the current set of calls may not be able to accomplish.
>>>>>
>>>>> And exactly that is a no-go. Use the existing IOCTLs and system 
>>>>> calls instead they should have similar performance when done right.
>>>>
>>>> Get it, but In my testing process, even without memory pressure, it 
>>>> takes about 60ms to allocate a 3GB DMA-BUF. When there is 
>>>> significant memory pressure, the allocation time for a 3GB
>>>
>>> Well exactly that doesn't make sense. Even if you read the content 
>>> of the DMA-buf from a file you still need to allocate it first.
>>
>> Yes, need allocate first, but in kernelspace, no need to wait all 
>> memory allocated done and then trigger file load.
>
> That doesn't really make sense. Allocating a large bunch of memory is 
> more efficient than allocating less multiple times because of cache 
> locality for example.
No, this patchset does not change the allocation behavior. The heap can go on
allocating, but as soon as the allocated pages reach the batch size, we map that
batch of (already allocated) pages into the vmalloc area and then trigger the I/O.
>
> You could of course hide latency caused by operations to reduce memory 
> pressure when you have a specific use case, but you don't need to use 
> an in kernel implementation for that.
>
> Question is do you have clear on allocation or clear on free enabled?
We clear on free, so allocating and then loading the file is OK.
>
>> This patchset use `batch` to done(default 128MB), ever 128MB 
>> allocated, vmap and get vaddr, then trigger this vaddr load file's 
>> target pos content.
>
> Again that sounds really not ideal to me. Creating the vmap alone is 
> complete unnecessary overhead.
Hmm, maybe you could give it a try? I also provided the test program in the
cover letter.
>
>>> So the question is why should reading and allocating it at the same 
>>> time be better in any way?
>>
>> Memory pressure will trigger reclaim, it must to wait.(ms) Asume I 
>> already allocated 512MB(need 3G) without enter slowpath,
>>
>> Even I need to enter slowpath to allocated remain memory, the already 
>> allocated memory is using load file content.(Save time compare to 
>> allocated done and read)
>>
>> The time difference between them can be expressed by the formula:
>>
>> 1. Allocate dmabuf time + file load time -- for original
>>
>> 2. first prepare batch time + Max(file load time, allocate remain 
>> dma-buf time) + latest batch prepare time -- for new
>>
>>  When the file reaches the gigabyte level, the significant difference 
>> between the two can be clearly observed.
>
> I have strong doubts about that. The method you describe above is 
> actually really inefficient.

Also, maybe you could test it yourself: create a large file with dd, then compare?

I tested all of this on both my phone and an Arch Linux PC, and both show some improvement.

>
> First of all you create a memory mapping just to load data, that is 
> superfluous and TLB flushes are usually extremely costly. Both for 
> userspace as well as kernel.
>
> I strongly suggest to try to use copy_file_range() instead. But could 
> be that copy_file_range() doesn't even work right now because of some 
> restrictions, never tried that on a DMA-buf.
I agree, I'm starting to look into this.
>
> When that works as far as I can see what could still be saved on 
> overhead is the following:
>
> 1. Clearing of memory on allocation. That could potentially be done 
> with delayed allocation or clear on free instead.
>
> 2. CPU copy between the I/O target buffer and the DMA-buf backing 
> pages. In theory it should be possible to avoid that by implementing 
> the copy_file_range() callback, but I'm not 100% sure.
Everything you mentioned above makes sense. :)
>
> Regards,
> Christian.
>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>
>>>> DMA-BUF can increase to 300ms-1s. (The above test times can also 
>>>> demonstrate the difference.)
>>>>
>>>> But, talk is cheap, I agree to research use existing way to 
>>>> implements it and give a test.
>>>>
>>>> I'll show this if I done .
>>>>
>>>> Thanks for your suggestions.
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> You can see cover-letter, here are the normal test and this 
>>>>>> IOCTL's compare in memory pressure, even if buffered I/O in this 
>>>>>> ioctl can have 50% improve by parallel.
>>>>>>
>>>>>> dd a 3GB file for test, 12G RAM phone, UFS4.0, stressapptest 4G 
>>>>>> memory pressure.
>>>>>>
>>>>>> 1. original
>>>>>> ```shel
>>>>>> # create a model file
>>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>>> # drop page cache
>>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal
>>>>>>
>>>>>>> result is total cost 13087213847ns
>>>>>>
>>>>>> ```
>>>>>>
>>>>>> 2.DMA_HEAP_IOCTL_ALLOC_AND_READ O_DIRECT
>>>>>> ```shel
>>>>>> # create a model file
>>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>>> # drop page cache
>>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>>> ./dmabuf-heap-file-read mtk_mm-uncached direct_io
>>>>>>
>>>>>>> result is total cost 2902386846ns
>>>>>>
>>>>>> # use direct_io_check can check the content if is same to file.
>>>>>> ```
>>>>>>
>>>>>> 3. DMA_HEAP_IOCTL_ALLOC_AND_READ BUFFER I/O
>>>>>> ```shel
>>>>>> # create a model file
>>>>>> dd if=/dev/zero of=./model.txt bs=1M count=3072
>>>>>> # drop page cache
>>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>>> ./dmabuf-heap-file-read mtk_mm-uncached normal_io
>>>>>>
>>>>>>> result is total cost 5735579385ns
>>>>>>
>>>>>> ```
>>>>>>
>>>>>>>
>>>>>>> Perhaps simply returning the DMA-BUF file descriptor and then 
>>>>>>> implementing copy_file_range, while populating the memory and 
>>>>>>> content during the copy process, could achieve this? At present, 
>>>>>>> it seems that it will be quite complex - We need to ensure that 
>>>>>>> only the returned DMA-BUF file descriptor will fail in case of 
>>>>>>> memory not fill, like mmap, vmap, attach, and so on.
>>>>>>>
>>>>>>>>
>>>>>>>> What we probably could do is to internally optimize those.
>>>>>>>>
>>>>>>>>> I am currently creating a new ioctl to remind the user that 
>>>>>>>>> memory is being allocated and read, and I am also unsure
>>>>>>>>>
>>>>>>>>> whether it is appropriate to add additional parameters to the 
>>>>>>>>> existing allocate behavior.
>>>>>>>>>
>>>>>>>>> Please, give me more suggestion. Thanks.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> But IIRC there was a copy_file_range callback in the 
>>>>>>>>>> file_operations structure you could use for that. I'm just 
>>>>>>>>>> not sure when and how that's used with the copy_file_range() 
>>>>>>>>>> system call.
>>>>>>>>>
>>>>>>>>> Sorry, I'm not familiar with this, but I will look into it. 
>>>>>>>>> However, this type of callback function is not currently 
>>>>>>>>> implemented when exporting
>>>>>>>>>
>>>>>>>>> the dma_buf file, which means that I need to implement the 
>>>>>>>>> callback for it?
>>>>>>>>
>>>>>>>> If I'm not completely mistaken the copy_file_range, splice_read 
>>>>>>>> and splice_write callbacks on the struct file_operations 
>>>>>>>> (https://elixir.bootlin.com/linux/v6.10-rc7/source/include/linux/fs.h#L1999).
>>>>>>>>
>>>>>>>> Can be used to implement what you want to do.
>>>>>>> Yes.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Notice, file_fd depends on user how to open this file. So, 
>>>>>>>>>>> both buffer
>>>>>>>>>>> I/O and Direct I/O is supported.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>>>>>>>>> ---
>>>>>>>>>>>   drivers/dma-buf/dma-heap.c    | 525 
>>>>>>>>>>> +++++++++++++++++++++++++++++++++-
>>>>>>>>>>>   include/linux/dma-heap.h      |  57 +++-
>>>>>>>>>>>   include/uapi/linux/dma-heap.h |  32 +++
>>>>>>>>>>>   3 files changed, 611 insertions(+), 3 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/dma-buf/dma-heap.c 
>>>>>>>>>>> b/drivers/dma-buf/dma-heap.c
>>>>>>>>>>> index 2298ca5e112e..abe17281adb8 100644
>>>>>>>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>>>>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>>>>>>>> @@ -15,9 +15,11 @@
>>>>>>>>>>>   #include <linux/list.h>
>>>>>>>>>>>   #include <linux/slab.h>
>>>>>>>>>>>   #include <linux/nospec.h>
>>>>>>>>>>> +#include <linux/highmem.h>
>>>>>>>>>>>   #include <linux/uaccess.h>
>>>>>>>>>>>   #include <linux/syscalls.h>
>>>>>>>>>>>   #include <linux/dma-heap.h>
>>>>>>>>>>> +#include <linux/vmalloc.h>
>>>>>>>>>>>   #include <uapi/linux/dma-heap.h>
>>>>>>>>>>>     #define DEVNAME "dma_heap"
>>>>>>>>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>>>>>>>>       struct cdev heap_cdev;
>>>>>>>>>>>   };
>>>>>>>>>>>   +/**
>>>>>>>>>>> + * struct dma_heap_file - wrap the file, read task for 
>>>>>>>>>>> dma_heap allocate use.
>>>>>>>>>>> + * @file:        file to read from.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @cred:        kthread use, user cred copy to use for the 
>>>>>>>>>>> read.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @max_batch:        maximum batch size to read, if 
>>>>>>>>>>> collect match batch,
>>>>>>>>>>> + *            trigger read, default 128MB, must below file 
>>>>>>>>>>> size.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @fsz:        file size.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @direct:        use direct IO?
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_file {
>>>>>>>>>>> +    struct file *file;
>>>>>>>>>>> +    struct cred *cred;
>>>>>>>>>>> +    size_t max_batch;
>>>>>>>>>>> +    size_t fsz;
>>>>>>>>>>> +    bool direct;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * struct dma_heap_file_work - represents a dma_heap file 
>>>>>>>>>>> read real work.
>>>>>>>>>>> + * @vaddr:        contigous virtual address alloc by vmap, 
>>>>>>>>>>> file read need.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @start_size:        file read start offset, same to 
>>>>>>>>>>> @dma_heap_file_task->roffset.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @need_size:        file read need size, same to 
>>>>>>>>>>> @dma_heap_file_task->rsize.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @heap_file:        file wrapper.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @refp:        same @dma_heap_file_task->ref, if end of 
>>>>>>>>>>> read, put ref.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @failp:        if any work io failed, set it true, 
>>>>>>>>>>> pointp @dma_heap_file_task->fail.
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_file_work {
>>>>>>>>>>> +    void *vaddr;
>>>>>>>>>>> +    ssize_t start_size;
>>>>>>>>>>> +    ssize_t need_size;
>>>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>>>> +    struct list_head list;
>>>>>>>>>>> +    atomic_t *refp;
>>>>>>>>>>> +    bool *failp;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * struct dma_heap_file_task - represents a dma_heap file 
>>>>>>>>>>> read process
>>>>>>>>>>> + * @ref:        current file work counter, if zero, 
>>>>>>>>>>> allocate and read
>>>>>>>>>>> + *            done.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @roffset:        last read offset, current prepared 
>>>>>>>>>>> work' begin file
>>>>>>>>>>> + *            start offset.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @rsize:        current allocated page size use to read, 
>>>>>>>>>>> if reach rbatch,
>>>>>>>>>>> + *            trigger commit.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @rbatch:        current prepared work's batch, below 
>>>>>>>>>>> @dma_heap_file's
>>>>>>>>>>> + *            batch.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @heap_file:        current dma_heap_file
>>>>>>>>>>> + *
>>>>>>>>>>> + * @parray:        used for vmap, size is @dma_heap_file's 
>>>>>>>>>>> batch's number
>>>>>>>>>>> + *            pages.(this is maximum). Due to single thread 
>>>>>>>>>>> file read,
>>>>>>>>>>> + *            one page array reuse each work prepare is OK.
>>>>>>>>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>>>>>>>>> + *
>>>>>>>>>>> + * @pindex:        current allocated page filled in 
>>>>>>>>>>> @parray's index.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @fail:        any work failed when file read?
>>>>>>>>>>> + *
>>>>>>>>>>> + * dma_heap_file_task is the production of file read, will 
>>>>>>>>>>> prepare each work
>>>>>>>>>>> + * during allocate dma_buf pages, if match current batch, 
>>>>>>>>>>> then trigger commit
>>>>>>>>>>> + * and prepare next work. After all batch queued, user 
>>>>>>>>>>> going on prepare dma_buf
>>>>>>>>>>> + * and so on, but before return dma_buf fd, need to wait 
>>>>>>>>>>> file read end and
>>>>>>>>>>> + * check read result.
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_file_task {
>>>>>>>>>>> +    atomic_t ref;
>>>>>>>>>>> +    size_t roffset;
>>>>>>>>>>> +    size_t rsize;
>>>>>>>>>>> +    size_t rbatch;
>>>>>>>>>>> +    struct dma_heap_file *heap_file;
>>>>>>>>>>> +    struct page **parray;
>>>>>>>>>>> +    unsigned int pindex;
>>>>>>>>>>> +    bool fail;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * struct dma_heap_file_control - global control of 
>>>>>>>>>>> dma_heap file read.
>>>>>>>>>>> + * @works:        @dma_heap_file_work's list head.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @lock:        only lock for @works.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @threadwq:        wait queue for @work_thread, if commit 
>>>>>>>>>>> work, @work_thread
>>>>>>>>>>> + *            wakeup and read this work's file contains.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @workwq:        used for main thread wait for file read 
>>>>>>>>>>> end, if allocation
>>>>>>>>>>> + *            end before file read. @dma_heap_file_task ref 
>>>>>>>>>>> effect this.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @work_thread:    file read kthread. the 
>>>>>>>>>>> dma_heap_file_task work's consumer.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @heap_fwork_cachep: @dma_heap_file_work's cachep, it's 
>>>>>>>>>>> alloc/free frequently.
>>>>>>>>>>> + *
>>>>>>>>>>> + * @nr_work:        global number of how many work committed.
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_file_control {
>>>>>>>>>>> +    struct list_head works;
>>>>>>>>>>> +    spinlock_t lock;
>>>>>>>>>>> +    wait_queue_head_t threadwq;
>>>>>>>>>>> +    wait_queue_head_t workwq;
>>>>>>>>>>> +    struct task_struct *work_thread;
>>>>>>>>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>>>>>>>>> +    atomic_t nr_work;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>>>>>>>>   static LIST_HEAD(heap_list);
>>>>>>>>>>>   static DEFINE_MUTEX(heap_list_lock);
>>>>>>>>>>>   static dev_t dma_heap_devt;
>>>>>>>>>>>   static struct class *dma_heap_class;
>>>>>>>>>>>   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>>>>>>>>>   +/**
>>>>>>>>>>> + * map_pages_to_vaddr - map each scatter page into 
>>>>>>>>>>> contiguous virtual address.
>>>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>>>> + *
>>>>>>>>>>> + * Cached pages need to trigger file read, this function 
>>>>>>>>>>> map each scatter page
>>>>>>>>>>> + * into contiguous virtual address, so that file read can 
>>>>>>>>>>> easy use.
>>>>>>>>>>> + * Now that we get vaddr page, cached pages can return to 
>>>>>>>>>>> original user, so we
>>>>>>>>>>> + * will not effect dma-buf export even if file read not end.
>>>>>>>>>>> + */
>>>>>>>>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, 
>>>>>>>>>>> VM_MAP,
>>>>>>>>>>> +            PAGE_KERNEL);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask,
>>>>>>>>>>> +                struct page *page)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct page **array = heap_ftask->parray;
>>>>>>>>>>> +    int index = heap_ftask->pindex;
>>>>>>>>>>> +    int num = compound_nr(page), i;
>>>>>>>>>>> +    unsigned long sz = page_size(page);
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_ftask->rsize += sz;
>>>>>>>>>>> +    for (i = 0; i < num; ++i)
>>>>>>>>>>> +        array[index++] = &page[i];
>>>>>>>>>>> +    heap_ftask->pindex = index;
>>>>>>>>>>> +
>>>>>>>>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static struct dma_heap_file_work *
>>>>>>>>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_file_work *heap_fwork;
>>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>>>>>>>>> +        return NULL;
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_fwork = 
>>>>>>>>>>> kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
>>>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>>>> +        return NULL;
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>>>>>>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>>>> +        return NULL;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_fwork->heap_file = heap_file;
>>>>>>>>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>>>>>>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>>>>>>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>>>>>>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>>>>>>>>> +    atomic_inc(&heap_ftask->ref);
>>>>>>>>>>> +    return heap_fwork;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void destroy_file_work(struct dma_heap_file_work 
>>>>>>>>>>> *heap_fwork)
>>>>>>>>>>> +{
>>>>>>>>>>> +    vunmap(heap_fwork->vaddr);
>>>>>>>>>>> +    atomic_dec(heap_fwork->refp);
>>>>>>>>>>> +    wake_up(&heap_fctl->workwq);
>>>>>>>>>>> +
>>>>>>>>>>> + kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>>>>>>>>> init_file_work(heap_ftask);
>>>>>>>>>>> +    struct page *last = NULL;
>>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>>>>>>>>> +    size_t start = heap_ftask->roffset;
>>>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>>>> +    size_t fsz = heap_file->fsz;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (unlikely(!heap_fwork))
>>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>>> +
>>>>>>>>>>> +    /**
>>>>>>>>>>> +     * If file size is not page aligned, direct io can't 
>>>>>>>>>>> process the tail.
>>>>>>>>>>> +     * So, if reach to tail, remain the last page use 
>>>>>>>>>>> buffer read.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    if (heap_file->direct && start + heap_ftask->rsize > 
>>>>>>>>>>> fsz) {
>>>>>>>>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>>>>>>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    spin_lock(&heap_fctl->lock);
>>>>>>>>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>>>>>>>>> +    spin_unlock(&heap_fctl->lock);
>>>>>>>>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>>>>>>>>> +
>>>>>>>>>>> +    wake_up(&heap_fctl->threadwq);
>>>>>>>>>>> +
>>>>>>>>>>> +    if (last) {
>>>>>>>>>>> +        char *buf, *pathp;
>>>>>>>>>>> +        ssize_t err;
>>>>>>>>>>> +        void *buffer;
>>>>>>>>>>> +
>>>>>>>>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>>>>>>>>> +        if (unlikely(!buf))
>>>>>>>>>>> +            return -ENOMEM;
>>>>>>>>>>> +
>>>>>>>>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>>>>>>>>> +
>>>>>>>>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>>>>>>>>> +        if (IS_ERR(pathp)) {
>>>>>>>>>>> +            kfree(buf);
>>>>>>>>>>> +            return PTR_ERR(pathp);
>>>>>>>>>>> +        }
>>>>>>>>>>> +
>>>>>>>>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>>>>>>>>> +        err = kernel_read_file_from_path(pathp, start, 
>>>>>>>>>>> &buffer,
>>>>>>>>>>> +                         fsz - start, &fsz,
>>>>>>>>>>> +                         READING_POLICY);
>>>>>>>>>>> +        kunmap_local(buffer);
>>>>>>>>>>> +        kfree(buf);
>>>>>>>>>>> +        if (err < 0) {
>>>>>>>>>>> +            pr_err("failed to use buffer kernel_read_file 
>>>>>>>>>>> %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>>>>>>>>> +                   pathp, err, start, fsz, fsz);
>>>>>>>>>>> +
>>>>>>>>>>> +            return err;
>>>>>>>>>>> +        }
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>>>>>>>>> +    heap_ftask->rsize = 0;
>>>>>>>>>>> +    heap_ftask->pindex = 0;
>>>>>>>>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>>>>>>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>>>>>>>>> +                   heap_ftask->rbatch);
>>>>>>>>>>> +    return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask)
>>>>>>>>>>> +{
>>>>>>>>>>> + wait_event_freezable(heap_fctl->workwq,
>>>>>>>>>>> + atomic_read(&heap_ftask->ref) == 0);
>>>>>>>>>>> +    return heap_ftask->fail;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask)
>>>>>>>>>>> +{
>>>>>>>>>>> +    bool fail;
>>>>>>>>>>> +
>>>>>>>>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>>>>>>>>> +    fail = heap_ftask->fail;
>>>>>>>>>>> +    kvfree(heap_ftask->parray);
>>>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>>>> +    return fail;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_file_task *heap_ftask =
>>>>>>>>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>>>>>>>>> +    if (unlikely(!heap_ftask))
>>>>>>>>>>> +        return NULL;
>>>>>>>>>>> +
>>>>>>>>>>> +    /**
>>>>>>>>>>> +     * Batch is the maximum size which we prepare work will 
>>>>>>>>>>> meet.
>>>>>>>>>>> +     * So, direct alloc this number's page array is OK.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    heap_ftask->parray = 
>>>>>>>>>>> kvmalloc_array(heap_file->max_batch >> PAGE_SHIFT,
>>>>>>>>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>>>>>>>>> +    if (unlikely(!heap_ftask->parray))
>>>>>>>>>>> +        goto put;
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_ftask->heap_file = heap_file;
>>>>>>>>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>>>>>>>>> +    return heap_ftask;
>>>>>>>>>>> +put:
>>>>>>>>>>> +    kfree(heap_ftask);
>>>>>>>>>>> +    return NULL;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void __work_this_io(struct dma_heap_file_work 
>>>>>>>>>>> *heap_fwork)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>>>>>>>>> +    struct file *file = heap_file->file;
>>>>>>>>>>> +    ssize_t start = heap_fwork->start_size;
>>>>>>>>>>> +    ssize_t size = heap_fwork->need_size;
>>>>>>>>>>> +    void *buffer = heap_fwork->vaddr;
>>>>>>>>>>> +    const struct cred *old_cred;
>>>>>>>>>>> +    ssize_t err;
>>>>>>>>>>> +
>>>>>>>>>>> +    // use real task's cred to read this file.
>>>>>>>>>>> +    old_cred = override_creds(heap_file->cred);
>>>>>>>>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>>>>>>>>> &heap_file->fsz,
>>>>>>>>>>> +                   READING_POLICY);
>>>>>>>>>>> +    if (err < 0) {
>>>>>>>>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>>>>>>>>> f_sz=%ld\n",
>>>>>>>>>>> +               err, start, (start + size), heap_file->fsz);
>>>>>>>>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>>>>>>>>> +    }
>>>>>>>>>>> +    // recovery to my cred.
>>>>>>>>>>> +    revert_creds(old_cred);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static int dma_heap_file_control_thread(void *data)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_file_control *heap_fctl =
>>>>>>>>>>> +        (struct dma_heap_file_control *)data;
>>>>>>>>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>>>>>>>>> +    int nr_work;
>>>>>>>>>>> +
>>>>>>>>>>> +    LIST_HEAD(pages);
>>>>>>>>>>> +    LIST_HEAD(workers);
>>>>>>>>>>> +
>>>>>>>>>>> +    while (true) {
>>>>>>>>>>> + wait_event_freezable(heap_fctl->threadwq,
>>>>>>>>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>>>>>>>>> +recheck:
>>>>>>>>>>> +        spin_lock(&heap_fctl->lock);
>>>>>>>>>>> + list_splice_init(&heap_fctl->works, &workers);
>>>>>>>>>>> +        spin_unlock(&heap_fctl->lock);
>>>>>>>>>>> +
>>>>>>>>>>> +        if (unlikely(kthread_should_stop())) {
>>>>>>>>>>> +            list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>>>>>> list) {
>>>>>>>>>>> + list_del(&worker->list);
>>>>>>>>>>> +                destroy_file_work(worker);
>>>>>>>>>>> +            }
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        }
>>>>>>>>>>> +
>>>>>>>>>>> +        nr_work = 0;
>>>>>>>>>>> +        list_for_each_entry_safe(worker, tmp, &workers, 
>>>>>>>>>>> list) {
>>>>>>>>>>> +            ++nr_work;
>>>>>>>>>>> +            list_del(&worker->list);
>>>>>>>>>>> +            __work_this_io(worker);
>>>>>>>>>>> +
>>>>>>>>>>> +            destroy_file_work(worker);
>>>>>>>>>>> +        }
>>>>>>>>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>>>>>>>>> +
>>>>>>>>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>>>>>>>>> +            goto recheck;
>>>>>>>>>>> +    }
>>>>>>>>>>> +    return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return heap_file->fsz;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static int prepare_dma_heap_file(struct dma_heap_file 
>>>>>>>>>>> *heap_file, int file_fd,
>>>>>>>>>>> +                 size_t batch)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct file *file;
>>>>>>>>>>> +    size_t fsz;
>>>>>>>>>>> +    int ret;
>>>>>>>>>>> +
>>>>>>>>>>> +    file = fget(file_fd);
>>>>>>>>>>> +    if (!file)
>>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>>> +
>>>>>>>>>>> +    fsz = i_size_read(file_inode(file));
>>>>>>>>>>> +    if (fsz < batch) {
>>>>>>>>>>> +        ret = -EINVAL;
>>>>>>>>>>> +        goto err;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    /**
>>>>>>>>>>> +     * Selinux block our read, but actually we are reading 
>>>>>>>>>>> the stand-in
>>>>>>>>>>> +     * for this file.
>>>>>>>>>>> +     * So save current's cred and when going to read, 
>>>>>>>>>>> override mine, and
>>>>>>>>>>> +     * end of read, revert.
>>>>>>>>>>> +     */
>>>>>>>>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>>>>>>>>> +    if (unlikely(!heap_file->cred)) {
>>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>>> +        goto err;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_file->file = file;
>>>>>>>>>>> +    heap_file->max_batch = batch;
>>>>>>>>>>> +    heap_file->fsz = fsz;
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>>>>>>>>> +
>>>>>>>>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>>>>>>>>> +    if (!heap_file->direct && fsz >= 
>>>>>>>>>>> DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>>>>>>>>> +        pr_warn("alloc read file better to use O_DIRECT to 
>>>>>>>>>>> read larget file\n");
>>>>>>>>>>> +
>>>>>>>>>>> +    return 0;
>>>>>>>>>>> +
>>>>>>>>>>> +err:
>>>>>>>>>>> +    fput(file);
>>>>>>>>>>> +    return ret;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void destroy_dma_heap_file(struct dma_heap_file 
>>>>>>>>>>> *heap_file)
>>>>>>>>>>> +{
>>>>>>>>>>> +    fput(heap_file->file);
>>>>>>>>>>> +    put_cred(heap_file->cred);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap 
>>>>>>>>>>> *heap, int file_fd,
>>>>>>>>>>> +                       size_t batch, unsigned int fd_flags,
>>>>>>>>>>> +                       unsigned int heap_flags)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_buf *dmabuf;
>>>>>>>>>>> +    int fd;
>>>>>>>>>>> +    struct dma_heap_file heap_file;
>>>>>>>>>>> +
>>>>>>>>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>>>>>>>>> +    if (fd)
>>>>>>>>>>> +        goto error_file;
>>>>>>>>>>> +
>>>>>>>>>>> +    dmabuf = heap->ops->allocate_read_file(heap, 
>>>>>>>>>>> &heap_file, fd_flags,
>>>>>>>>>>> +                           heap_flags);
>>>>>>>>>>> +    if (IS_ERR(dmabuf)) {
>>>>>>>>>>> +        fd = PTR_ERR(dmabuf);
>>>>>>>>>>> +        goto error;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>>>>>>>>> +    if (fd < 0) {
>>>>>>>>>>> +        dma_buf_put(dmabuf);
>>>>>>>>>>> +        /* just return, as put will call release and that 
>>>>>>>>>>> will free */
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +error:
>>>>>>>>>>> +    destroy_dma_heap_file(&heap_file);
>>>>>>>>>>> +error_file:
>>>>>>>>>>> +    return fd;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>>   static int dma_heap_buffer_alloc(struct dma_heap *heap, 
>>>>>>>>>>> size_t len,
>>>>>>>>>>>                    u32 fd_flags,
>>>>>>>>>>>                    u64 heap_flags)
>>>>>>>>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode 
>>>>>>>>>>> *inode, struct file *file)
>>>>>>>>>>>       return 0;
>>>>>>>>>>>   }
>>>>>>>>>>>   +static long dma_heap_ioctl_allocate_read_file(struct file 
>>>>>>>>>>> *file, void *data)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct dma_heap_allocation_file_data 
>>>>>>>>>>> *heap_allocation_file = data;
>>>>>>>>>>> +    struct dma_heap *heap = file->private_data;
>>>>>>>>>>> +    int fd;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (heap_allocation_file->fd || 
>>>>>>>>>>> !heap_allocation_file->file_fd)
>>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (heap_allocation_file->fd_flags & 
>>>>>>>>>>> ~DMA_HEAP_VALID_FD_FLAGS)
>>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (heap_allocation_file->heap_flags & 
>>>>>>>>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>>> +
>>>>>>>>>>> +    if (!heap->ops->allocate_read_file)
>>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>>> +
>>>>>>>>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>>>>>>>>> +        heap, heap_allocation_file->file_fd,
>>>>>>>>>>> +        heap_allocation_file->batch ?
>>>>>>>>>>> + PAGE_ALIGN(heap_allocation_file->batch) :
>>>>>>>>>>> +            DEFAULT_ADI_BATCH,
>>>>>>>>>>> +        heap_allocation_file->fd_flags,
>>>>>>>>>>> +        heap_allocation_file->heap_flags);
>>>>>>>>>>> +    if (fd < 0)
>>>>>>>>>>> +        return fd;
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_allocation_file->fd = fd;
>>>>>>>>>>> +    return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>>   static long dma_heap_ioctl_allocate(struct file *file, 
>>>>>>>>>>> void *data)
>>>>>>>>>>>   {
>>>>>>>>>>>       struct dma_heap_allocation_data *heap_allocation = data;
>>>>>>>>>>> @@ -121,6 +605,7 @@ static long 
>>>>>>>>>>> dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>>>>>>>>     static unsigned int dma_heap_ioctl_cmds[] = {
>>>>>>>>>>>       DMA_HEAP_IOCTL_ALLOC,
>>>>>>>>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>>>>>>>>   };
>>>>>>>>>>>     static long dma_heap_ioctl(struct file *file, unsigned 
>>>>>>>>>>> int ucmd,
>>>>>>>>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file 
>>>>>>>>>>> *file, unsigned int ucmd,
>>>>>>>>>>>       case DMA_HEAP_IOCTL_ALLOC:
>>>>>>>>>>>           ret = dma_heap_ioctl_allocate(file, kdata);
>>>>>>>>>>>           break;
>>>>>>>>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>>>>>>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>>>>>>>>> +        break;
>>>>>>>>>>>       default:
>>>>>>>>>>>           ret = -ENOTTY;
>>>>>>>>>>>           goto err;
>>>>>>>>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>>>>>>>>         dma_heap_class = class_create(DEVNAME);
>>>>>>>>>>>       if (IS_ERR(dma_heap_class)) {
>>>>>>>>>>> - unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>>>>> -        return PTR_ERR(dma_heap_class);
>>>>>>>>>>> +        ret = PTR_ERR(dma_heap_class);
>>>>>>>>>>> +        goto fail_class;
>>>>>>>>>>>       }
>>>>>>>>>>>       dma_heap_class->devnode = dma_heap_devnode;
>>>>>>>>>>>   +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>>>>>>>>> +    if (unlikely(!heap_fctl)) {
>>>>>>>>>>> +        ret =  -ENOMEM;
>>>>>>>>>>> +        goto fail_alloc;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>>>>>>>>> + init_waitqueue_head(&heap_fctl->threadwq);
>>>>>>>>>>> + init_waitqueue_head(&heap_fctl->workwq);
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_fctl->work_thread = 
>>>>>>>>>>> kthread_run(dma_heap_file_control_thread,
>>>>>>>>>>> +                         heap_fctl, "heap_fwork_t");
>>>>>>>>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>>> +        goto fail_thread;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    heap_fctl->heap_fwork_cachep = 
>>>>>>>>>>> KMEM_CACHE(dma_heap_file_work, 0);
>>>>>>>>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>>>>>>>>> +        ret = -ENOMEM;
>>>>>>>>>>> +        goto fail_cache;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>>       return 0;
>>>>>>>>>>> +
>>>>>>>>>>> +fail_cache:
>>>>>>>>>>> +    kthread_stop(heap_fctl->work_thread);
>>>>>>>>>>> +fail_thread:
>>>>>>>>>>> +    kfree(heap_fctl);
>>>>>>>>>>> +fail_alloc:
>>>>>>>>>>> +    class_destroy(dma_heap_class);
>>>>>>>>>>> +fail_class:
>>>>>>>>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>>>>>>>>> +    return ret;
>>>>>>>>>>>   }
>>>>>>>>>>>   subsys_initcall(dma_heap_init);
>>>>>>>>>>> diff --git a/include/linux/dma-heap.h 
>>>>>>>>>>> b/include/linux/dma-heap.h
>>>>>>>>>>> index 064bad725061..9c25383f816c 100644
>>>>>>>>>>> --- a/include/linux/dma-heap.h
>>>>>>>>>>> +++ b/include/linux/dma-heap.h
>>>>>>>>>>> @@ -12,12 +12,17 @@
>>>>>>>>>>>   #include <linux/cdev.h>
>>>>>>>>>>>   #include <linux/types.h>
>>>>>>>>>>>   +#define DEFAULT_ADI_BATCH (128 << 20)
>>>>>>>>>>> +
>>>>>>>>>>>   struct dma_heap;
>>>>>>>>>>> +struct dma_heap_file_task;
>>>>>>>>>>> +struct dma_heap_file;
>>>>>>>>>>>     /**
>>>>>>>>>>>    * struct dma_heap_ops - ops to operate on a given heap
>>>>>>>>>>>    * @allocate:        allocate dmabuf and return struct 
>>>>>>>>>>> dma_buf ptr
>>>>>>>>>>> - *
>>>>>>>>>>> + * @allocate_read_file: allocate dmabuf and read file, then 
>>>>>>>>>>> return struct
>>>>>>>>>>> + * dma_buf ptr.
>>>>>>>>>>>    * allocate returns dmabuf on success, ERR_PTR(-errno) on 
>>>>>>>>>>> error.
>>>>>>>>>>>    */
>>>>>>>>>>>   struct dma_heap_ops {
>>>>>>>>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>>>>>>>>                       unsigned long len,
>>>>>>>>>>>                       u32 fd_flags,
>>>>>>>>>>>                       u64 heap_flags);
>>>>>>>>>>> +
>>>>>>>>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap 
>>>>>>>>>>> *heap,
>>>>>>>>>>> +                          struct dma_heap_file *heap_file,
>>>>>>>>>>> +                          u32 fd_flags,
>>>>>>>>>>> +                          u64 heap_flags);
>>>>>>>>>>>   };
>>>>>>>>>>>     /**
>>>>>>>>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct 
>>>>>>>>>>> dma_heap *heap);
>>>>>>>>>>>    */
>>>>>>>>>>>   struct dma_heap *dma_heap_add(const struct 
>>>>>>>>>>> dma_heap_export_info *exp_info);
>>>>>>>>>>>   +/**
>>>>>>>>>>> + * dma_heap_destroy_file_read - waits for a file read to 
>>>>>>>>>>> complete then destroy it
>>>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>>>> + */
>>>>>>>>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask);
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * dma_heap_wait_for_file_read - waits for a file read to 
>>>>>>>>>>> complete
>>>>>>>>>>> + * Returns: true if the file read failed, false otherwise
>>>>>>>>>>> + */
>>>>>>>>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask);
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * dma_heap_alloc_file_read - Declare a task to read file 
>>>>>>>>>>> when allocate pages.
>>>>>>>>>>> + * @heap_file:        target file to read
>>>>>>>>>>> + *
>>>>>>>>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_file_task *
>>>>>>>>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * dma_heap_prepare_file_read - cache each allocated page 
>>>>>>>>>>> until we meet this batch.
>>>>>>>>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>>>>>>>>> + * @page:        current allocated page. don't care which 
>>>>>>>>>>> order.
>>>>>>>>>>> + *
>>>>>>>>>>> + * Returns true if reach to batch, false so go on prepare.
>>>>>>>>>>> + */
>>>>>>>>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask,
>>>>>>>>>>> +                struct page *page);
>>>>>>>>>>> +
>>>>>>>>>>> +/**
>>>>>>>>>>> + * dma_heap_commit_file_read -  prepare collect enough 
>>>>>>>>>>> memory, going to trigger IO
>>>>>>>>>>> + * @heap_ftask:            info that current IO needs
>>>>>>>>>>> + *
>>>>>>>>>>> + * This commit will also check if reach to tail read.
>>>>>>>>>>> + * For direct I/O submissions, it is necessary to pay 
>>>>>>>>>>> attention to file reads
>>>>>>>>>>> + * that are not page-aligned. For the unaligned portion of 
>>>>>>>>>>> the read, buffer IO
>>>>>>>>>>> + * needs to be triggered.
>>>>>>>>>>> + * Returns:
>>>>>>>>>>> + *   0 if all right, -errno if something wrong
>>>>>>>>>>> + */
>>>>>>>>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task 
>>>>>>>>>>> *heap_ftask);
>>>>>>>>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>>>>>>>>> +
>>>>>>>>>>>   #endif /* _DMA_HEAPS_H */
>>>>>>>>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>>>>>>>>> b/include/uapi/linux/dma-heap.h
>>>>>>>>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>>>>>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>>>>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>>>>>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>>>>>>>>       __u64 heap_flags;
>>>>>>>>>>>   };
>>>>>>>>>>>   +/**
>>>>>>>>>>> + * struct dma_heap_allocation_file_data - metadata passed 
>>>>>>>>>>> from userspace for
>>>>>>>>>>> + * allocations and read file
>>>>>>>>>>> + * @fd:            will be populated with a fd which 
>>>>>>>>>>> provides the
>>>>>>>>>>> + *            handle to the allocated dma-buf
>>>>>>>>>>> + * @file_fd:        file descriptor to read from(suggested 
>>>>>>>>>>> to use O_DIRECT open file)
>>>>>>>>>>> + * @batch:        how many memory alloced then file 
>>>>>>>>>>> read(bytes), default 128MB
>>>>>>>>>>> + *            will auto aligned to PAGE_SIZE
>>>>>>>>>>> + * @fd_flags:        file descriptor flags used when 
>>>>>>>>>>> allocating
>>>>>>>>>>> + * @heap_flags:        flags passed to heap
>>>>>>>>>>> + *
>>>>>>>>>>> + * Provided by userspace as an argument to the ioctl
>>>>>>>>>>> + */
>>>>>>>>>>> +struct dma_heap_allocation_file_data {
>>>>>>>>>>> +    __u32 fd;
>>>>>>>>>>> +    __u32 file_fd;
>>>>>>>>>>> +    __u32 batch;
>>>>>>>>>>> +    __u32 fd_flags;
>>>>>>>>>>> +    __u64 heap_flags;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>>   #define DMA_HEAP_IOC_MAGIC        'H'
>>>>>>>>>>>     /**
>>>>>>>>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>>>>>>>>   #define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>>>>>>>>                         struct dma_heap_allocation_data)
>>>>>>>>>>>   +/**
>>>>>>>>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory 
>>>>>>>>>>> from pool and both
>>>>>>>>>>> + *                    read file when allocate memory.
>>>>>>>>>>> + *
>>>>>>>>>>> + * Takes a dma_heap_allocation_file_data struct and returns 
>>>>>>>>>>> it with the fd field
>>>>>>>>>>> + * populated with the dmabuf handle of the allocation. When 
>>>>>>>>>>> return, the dma-buf
>>>>>>>>>>> + * content is read from file.
>>>>>>>>>>> + */
>>>>>>>>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>>>>>>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>>>>>>>>> dma_heap_allocation_file_data)
>>>>>>>>>>> +
>>>>>>>>>>>   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>>>>>>>>>>
>>>>>>>>
>>>>>
>>>
>
kernel test robot July 13, 2024, 10:33 a.m. UTC | #12
Hi Huan,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 523b23f0bee3014a7a752c9bb9f5c54f0eddae88]

url:    https://github.com/intel-lab-lkp/linux/commits/Huan-Yang/dma-buf-heaps-DMA_HEAP_IOCTL_ALLOC_READ_FILE-framework/20240711-155902
base:   523b23f0bee3014a7a752c9bb9f5c54f0eddae88
patch link:    https://lore.kernel.org/r/20240711074221.459589-2-link%40vivo.com
patch subject: [PATCH 1/2] dma-buf: heaps: DMA_HEAP_IOCTL_ALLOC_READ_FILE framework
config: i386-buildonly-randconfig-002-20240713 (https://download.01.org/0day-ci/archive/20240713/202407131825.A44mFGu1-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240713/202407131825.A44mFGu1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407131825.A44mFGu1-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/dma-buf/dma-heap.c:293:18: warning: format specifies type 'long' but the argument has type 'ssize_t' (aka 'int') [-Wformat]
     292 |                         pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                                               ~~~
         |                                                                               %zd
     293 |                                pathp, err, start, fsz, fsz);
         |                                       ^~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
>> drivers/dma-buf/dma-heap.c:293:23: warning: format specifies type 'long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
     292 |                         pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                                                     ~~~
         |                                                                                     %zu
     293 |                                pathp, err, start, fsz, fsz);
         |                                            ^~~~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:293:30: warning: format specifies type 'long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
     292 |                         pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                                                          ~~~
         |                                                                                          %zu
     293 |                                pathp, err, start, fsz, fsz);
         |                                                   ^~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:293:35: warning: format specifies type 'long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
     292 |                         pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                                                                     ~~~
         |                                                                                                     %zu
     293 |                                pathp, err, start, fsz, fsz);
         |                                                        ^~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:367:10: warning: format specifies type 'long' but the argument has type 'ssize_t' (aka 'int') [-Wformat]
     366 |                 pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                   ~~~
         |                                                   %zd
     367 |                        err, start, (start + size), heap_file->fsz);
         |                        ^~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:367:15: warning: format specifies type 'long' but the argument has type 'ssize_t' (aka 'int') [-Wformat]
     366 |                 pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                         ~~~
         |                                                         %zd
     367 |                        err, start, (start + size), heap_file->fsz);
         |                             ^~~~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:367:22: warning: format specifies type 'long' but the argument has type 'ssize_t' (aka 'int') [-Wformat]
     366 |                 pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                              ~~~
         |                                                              %zd
     367 |                        err, start, (start + size), heap_file->fsz);
         |                                    ^~~~~~~~~~~~~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                     ~~~    ^~~~~~~~~~~
   include/linux/printk.h:462:19: note: expanded from macro 'printk_index_wrap'
     462 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ~~~~    ^~~~~~~~~~~
   drivers/dma-buf/dma-heap.c:367:38: warning: format specifies type 'long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
     366 |                 pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
         |                                                                         ~~~
         |                                                                         %zu
     367 |                        err, start, (start + size), heap_file->fsz);
         |                                                    ^~~~~~~~~~~~~~
   include/linux/printk.h:533:33: note: expanded from macro 'pr_err'
     533 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                ~~~     ^~~~~~~~~~~
   include/linux/printk.h:490:60: note: expanded from macro 'printk'
     490 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)


vim +293 drivers/dma-buf/dma-heap.c

   239	
   240	int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
   241	{
   242		struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
   243		struct page *last = NULL;
   244		struct dma_heap_file *heap_file = heap_ftask->heap_file;
   245		size_t start = heap_ftask->roffset;
   246		struct file *file = heap_file->file;
   247		size_t fsz = heap_file->fsz;
   248	
   249		if (unlikely(!heap_fwork))
   250			return -ENOMEM;
   251	
   252		/**
   253		 * If file size is not page aligned, direct io can't process the tail.
   254		 * So, if reach to tail, remain the last page use buffer read.
   255		 */
   256		if (heap_file->direct && start + heap_ftask->rsize > fsz) {
   257			heap_fwork->need_size -= PAGE_SIZE;
   258			last = heap_ftask->parray[heap_ftask->pindex - 1];
   259		}
   260	
   261		spin_lock(&heap_fctl->lock);
   262		list_add_tail(&heap_fwork->list, &heap_fctl->works);
   263		spin_unlock(&heap_fctl->lock);
   264		atomic_inc(&heap_fctl->nr_work);
   265	
   266		wake_up(&heap_fctl->threadwq);
   267	
   268		if (last) {
   269			char *buf, *pathp;
   270			ssize_t err;
   271			void *buffer;
   272	
   273			buf = kmalloc(PATH_MAX, GFP_KERNEL);
   274			if (unlikely(!buf))
   275				return -ENOMEM;
   276	
   277			start = PAGE_ALIGN_DOWN(fsz);
   278	
   279			pathp = file_path(file, buf, PATH_MAX);
   280			if (IS_ERR(pathp)) {
   281				kfree(buf);
   282				return PTR_ERR(pathp);
   283			}
   284	
   285			buffer = kmap_local_page(last); // use page's kaddr.
   286			err = kernel_read_file_from_path(pathp, start, &buffer,
   287							 fsz - start, &fsz,
   288							 READING_POLICY);
   289			kunmap_local(buffer);
   290			kfree(buf);
   291			if (err < 0) {
   292				pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
 > 293				       pathp, err, start, fsz, fsz);
   294	
   295				return err;
   296			}
   297		}
   298	
   299		heap_ftask->roffset += heap_ftask->rsize;
   300		heap_ftask->rsize = 0;
   301		heap_ftask->pindex = 0;
   302		heap_ftask->rbatch = min_t(size_t,
   303					   PAGE_ALIGN(fsz) - heap_ftask->roffset,
   304					   heap_ftask->rbatch);
   305		return 0;
   306	}
   307
Daniel Vetter July 15, 2024, 9:11 a.m. UTC | #13
On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
> Am 11.07.24 um 09:42 schrieb Huan Yang:
> > Some user may need load file into dma-buf, current
> > way is:
> >    1. allocate a dma-buf, get dma-buf fd
> >    2. mmap dma-buf fd into vaddr
> >    3. read(file_fd, vaddr, fsz)
> > This is too heavy if fsz reached to GB.
> 
> You need to describe a bit more why that is too heavy. I can only assume you
> need to save memory bandwidth and avoid the extra copy with the CPU.
> 
> > This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
> > User need to offer a file_fd which you want to load into dma-buf, then,
> > it promise if you got a dma-buf fd, it will contains the file content.
> 
> Interesting idea, that has at least more potential than trying to enable
> direct I/O on mmap()ed DMA-bufs.
> 
> The approach with the new IOCTL might not work because it is a very
> specialized use case.
> 
> But IIRC there was a copy_file_range callback in the file_operations
> structure you could use for that. I'm just not sure when and how that's used
> with the copy_file_range() system call.

I'm not sure any of those help, because internally they're all still based
on struct page (or maybe in the future on folios). And that's the thing
dma-buf can't give you, at least without peeking behind the curtain.

I think an entirely different option would be malloc+udmabuf. That
essentially handles the impedance mismatch between direct I/O and dma-buf
on the dma-buf side. The downside is that it'll make the permanently
pinned memory accounting and tracking issues even more apparent, but I
guess eventually we do need to sort that one out.

And since all the patches here are only for the pages system heap I'm
guessing udmabuf should work out for the use-case here? Worth a shot at
least.
-Sima

> 
> Regards,
> Christian.
> 
> > 
> > Notice, file_fd depends on user how to open this file. So, both buffer
> > I/O and Direct I/O is supported.
> > 
> > Signed-off-by: Huan Yang <link@vivo.com>
> > ---
> >   drivers/dma-buf/dma-heap.c    | 525 +++++++++++++++++++++++++++++++++-
> >   include/linux/dma-heap.h      |  57 +++-
> >   include/uapi/linux/dma-heap.h |  32 +++
> >   3 files changed, 611 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
> > index 2298ca5e112e..abe17281adb8 100644
> > --- a/drivers/dma-buf/dma-heap.c
> > +++ b/drivers/dma-buf/dma-heap.c
> > @@ -15,9 +15,11 @@
> >   #include <linux/list.h>
> >   #include <linux/slab.h>
> >   #include <linux/nospec.h>
> > +#include <linux/highmem.h>
> >   #include <linux/uaccess.h>
> >   #include <linux/syscalls.h>
> >   #include <linux/dma-heap.h>
> > +#include <linux/vmalloc.h>
> >   #include <uapi/linux/dma-heap.h>
> >   #define DEVNAME "dma_heap"
> > @@ -43,12 +45,462 @@ struct dma_heap {
> >   	struct cdev heap_cdev;
> >   };
> > +/**
> > + * struct dma_heap_file - wrap the file, read task for dma_heap allocate use.
> > + * @file:		file to read from.
> > + *
> > + * @cred:		kthread use, user cred copy to use for the read.
> > + *
> > + * @max_batch:		maximum batch size to read, if collect match batch,
> > + *			trigger read, default 128MB, must below file size.
> > + *
> > + * @fsz:		file size.
> > + *
> > + * @direct:		use direct IO?
> > + */
> > +struct dma_heap_file {
> > +	struct file *file;
> > +	struct cred *cred;
> > +	size_t max_batch;
> > +	size_t fsz;
> > +	bool direct;
> > +};
> > +
> > +/**
> > + * struct dma_heap_file_work - represents a dma_heap file read real work.
> > + * @vaddr:		contigous virtual address alloc by vmap, file read need.
> > + *
> > + * @start_size:		file read start offset, same to @dma_heap_file_task->roffset.
> > + *
> > + * @need_size:		file read need size, same to @dma_heap_file_task->rsize.
> > + *
> > + * @heap_file:		file wrapper.
> > + *
> > + * @list:		child node of @dma_heap_file_control->works.
> > + *
> > + * @refp:		same @dma_heap_file_task->ref, if end of read, put ref.
> > + *
> > + * @failp:		if any work io failed, set it true, pointp @dma_heap_file_task->fail.
> > + */
> > +struct dma_heap_file_work {
> > +	void *vaddr;
> > +	ssize_t start_size;
> > +	ssize_t need_size;
> > +	struct dma_heap_file *heap_file;
> > +	struct list_head list;
> > +	atomic_t *refp;
> > +	bool *failp;
> > +};
> > +
> > +/**
> > + * struct dma_heap_file_task - represents a dma_heap file read process
> > + * @ref:		current file work counter, if zero, allocate and read
> > + *			done.
> > + *
> > + * @roffset:		last read offset, current prepared work' begin file
> > + *			start offset.
> > + *
> > + * @rsize:		current allocated page size use to read, if reach rbatch,
> > + *			trigger commit.
> > + *
> > + * @rbatch:		current prepared work's batch, below @dma_heap_file's
> > + *			batch.
> > + *
> > + * @heap_file:		current dma_heap_file
> > + *
> > + * @parray:		used for vmap, size is @dma_heap_file's batch's number
> > + *			pages.(this is maximum). Due to single thread file read,
> > + *			one page array reuse each work prepare is OK.
> > + *			Each index in parray is PAGE_SIZE.(vmap need)
> > + *
> > + * @pindex:		current allocated page filled in @parray's index.
> > + *
> > + * @fail:		any work failed when file read?
> > + *
> > + * dma_heap_file_task is the production of file read, will prepare each work
> > + * during allocate dma_buf pages, if match current batch, then trigger commit
> > + * and prepare next work. After all batch queued, user going on prepare dma_buf
> > + * and so on, but before return dma_buf fd, need to wait file read end and
> > + * check read result.
> > + */
> > +struct dma_heap_file_task {
> > +	atomic_t ref;
> > +	size_t roffset;
> > +	size_t rsize;
> > +	size_t rbatch;
> > +	struct dma_heap_file *heap_file;
> > +	struct page **parray;
> > +	unsigned int pindex;
> > +	bool fail;
> > +};
> > +
> > +/**
> > + * struct dma_heap_file_control - global control of dma_heap file read.
> > + * @works:		@dma_heap_file_work's list head.
> > + *
> > + * @lock:		only lock for @works.
> > + *
> > + * @threadwq:		wait queue for @work_thread, if commit work, @work_thread
> > + *			wakeup and read this work's file contains.
> > + *
> > + * @workwq:		used for main thread wait for file read end, if allocation
> > + *			end before file read. @dma_heap_file_task ref effect this.
> > + *
> > + * @work_thread:	file read kthread. the dma_heap_file_task work's consumer.
> > + *
> > + * @heap_fwork_cachep:	@dma_heap_file_work's cachep, it's alloc/free frequently.
> > + *
> > + * @nr_work:		global number of how many work committed.
> > + */
> > +struct dma_heap_file_control {
> > +	struct list_head works;
> > +	spinlock_t lock;
> > +	wait_queue_head_t threadwq;
> > +	wait_queue_head_t workwq;
> > +	struct task_struct *work_thread;
> > +	struct kmem_cache *heap_fwork_cachep;
> > +	atomic_t nr_work;
> > +};
> > +
> > +static struct dma_heap_file_control *heap_fctl;
> >   static LIST_HEAD(heap_list);
> >   static DEFINE_MUTEX(heap_list_lock);
> >   static dev_t dma_heap_devt;
> >   static struct class *dma_heap_class;
> >   static DEFINE_XARRAY_ALLOC(dma_heap_minors);
> > +/**
> > + * map_pages_to_vaddr - map each scatter page into contiguous virtual address.
> > + * @heap_ftask:		prepared and need to commit's work.
> > + *
> > + * Cached pages need to trigger file read, this function map each scatter page
> > + * into contiguous virtual address, so that file read can easy use.
> > + * Now that we get vaddr page, cached pages can return to original user, so we
> > + * will not effect dma-buf export even if file read not end.
> > + */
> > +static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
> > +{
> > +	return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
> > +		    PAGE_KERNEL);
> > +}
> > +
> > +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
> > +				struct page *page)
> > +{
> > +	struct page **array = heap_ftask->parray;
> > +	int index = heap_ftask->pindex;
> > +	int num = compound_nr(page), i;
> > +	unsigned long sz = page_size(page);
> > +
> > +	heap_ftask->rsize += sz;
> > +	for (i = 0; i < num; ++i)
> > +		array[index++] = &page[i];
> > +	heap_ftask->pindex = index;
> > +
> > +	return heap_ftask->rsize >= heap_ftask->rbatch;
> > +}
> > +
> > +static struct dma_heap_file_work *
> > +init_file_work(struct dma_heap_file_task *heap_ftask)
> > +{
> > +	struct dma_heap_file_work *heap_fwork;
> > +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
> > +
> > +	if (READ_ONCE(heap_ftask->fail))
> > +		return NULL;
> > +
> > +	heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
> > +	if (unlikely(!heap_fwork))
> > +		return NULL;
> > +
> > +	heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
> > +	if (unlikely(!heap_fwork->vaddr)) {
> > +		kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
> > +		return NULL;
> > +	}
> > +
> > +	heap_fwork->heap_file = heap_file;
> > +	heap_fwork->start_size = heap_ftask->roffset;
> > +	heap_fwork->need_size = heap_ftask->rsize;
> > +	heap_fwork->refp = &heap_ftask->ref;
> > +	heap_fwork->failp = &heap_ftask->fail;
> > +	atomic_inc(&heap_ftask->ref);
> > +	return heap_fwork;
> > +}
> > +
> > +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
> > +{
> > +	vunmap(heap_fwork->vaddr);
> > +	atomic_dec(heap_fwork->refp);
> > +	wake_up(&heap_fctl->workwq);
> > +
> > +	kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
> > +}
> > +
> > +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
> > +{
> > +	struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
> > +	struct page *last = NULL;
> > +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
> > +	size_t start = heap_ftask->roffset;
> > +	struct file *file = heap_file->file;
> > +	size_t fsz = heap_file->fsz;
> > +
> > +	if (unlikely(!heap_fwork))
> > +		return -ENOMEM;
> > +
> > +	/**
> > +	 * If file size is not page aligned, direct io can't process the tail.
> > +	 * So, if reach to tail, remain the last page use buffer read.
> > +	 */
> > +	if (heap_file->direct && start + heap_ftask->rsize > fsz) {
> > +		heap_fwork->need_size -= PAGE_SIZE;
> > +		last = heap_ftask->parray[heap_ftask->pindex - 1];
> > +	}
> > +
> > +	spin_lock(&heap_fctl->lock);
> > +	list_add_tail(&heap_fwork->list, &heap_fctl->works);
> > +	spin_unlock(&heap_fctl->lock);
> > +	atomic_inc(&heap_fctl->nr_work);
> > +
> > +	wake_up(&heap_fctl->threadwq);
> > +
> > +	if (last) {
> > +		char *buf, *pathp;
> > +		ssize_t err;
> > +		void *buffer;
> > +
> > +		buf = kmalloc(PATH_MAX, GFP_KERNEL);
> > +		if (unlikely(!buf))
> > +			return -ENOMEM;
> > +
> > +		start = PAGE_ALIGN_DOWN(fsz);
> > +
> > +		pathp = file_path(file, buf, PATH_MAX);
> > +		if (IS_ERR(pathp)) {
> > +			kfree(buf);
> > +			return PTR_ERR(pathp);
> > +		}
> > +
> > +		buffer = kmap_local_page(last); // use page's kaddr.
> > +		err = kernel_read_file_from_path(pathp, start, &buffer,
> > +						 fsz - start, &fsz,
> > +						 READING_POLICY);
> > +		kunmap_local(buffer);
> > +		kfree(buf);
> > +		if (err < 0) {
> > +			pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
> > +			       pathp, err, start, fsz, fsz);
> > +
> > +			return err;
> > +		}
> > +	}
> > +
> > +	heap_ftask->roffset += heap_ftask->rsize;
> > +	heap_ftask->rsize = 0;
> > +	heap_ftask->pindex = 0;
> > +	heap_ftask->rbatch = min_t(size_t,
> > +				   PAGE_ALIGN(fsz) - heap_ftask->roffset,
> > +				   heap_ftask->rbatch);
> > +	return 0;
> > +}
> > +
> > +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
> > +{
> > +	wait_event_freezable(heap_fctl->workwq,
> > +			     atomic_read(&heap_ftask->ref) == 0);
> > +	return heap_ftask->fail;
> > +}
> > +
> > +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
> > +{
> > +	bool fail;
> > +
> > +	dma_heap_wait_for_file_read(heap_ftask);
> > +	fail = heap_ftask->fail;
> > +	kvfree(heap_ftask->parray);
> > +	kfree(heap_ftask);
> > +	return fail;
> > +}
> > +
> > +struct dma_heap_file_task *
> > +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
> > +{
> > +	struct dma_heap_file_task *heap_ftask =
> > +		kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
> > +	if (unlikely(!heap_ftask))
> > +		return NULL;
> > +
> > +	/**
> > +	 * Batch is the maximum size which we prepare work will meet.
> > +	 * So, direct alloc this number's page array is OK.
> > +	 */
> > +	heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> PAGE_SHIFT,
> > +					    sizeof(struct page *), GFP_KERNEL);
> > +	if (unlikely(!heap_ftask->parray))
> > +		goto put;
> > +
> > +	heap_ftask->heap_file = heap_file;
> > +	heap_ftask->rbatch = heap_file->max_batch;
> > +	return heap_ftask;
> > +put:
> > +	kfree(heap_ftask);
> > +	return NULL;
> > +}
> > +
> > +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
> > +{
> > +	struct dma_heap_file *heap_file = heap_fwork->heap_file;
> > +	struct file *file = heap_file->file;
> > +	ssize_t start = heap_fwork->start_size;
> > +	ssize_t size = heap_fwork->need_size;
> > +	void *buffer = heap_fwork->vaddr;
> > +	const struct cred *old_cred;
> > +	ssize_t err;
> > +
> > +	// use real task's cred to read this file.
> > +	old_cred = override_creds(heap_file->cred);
> > +	err = kernel_read_file(file, start, &buffer, size, &heap_file->fsz,
> > +			       READING_POLICY);
> > +	if (err < 0) {
> > +		pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
> > +		       err, start, (start + size), heap_file->fsz);
> > +		WRITE_ONCE(*heap_fwork->failp, true);
> > +	}
> > +	// recovery to my cred.
> > +	revert_creds(old_cred);
> > +}
> > +
> > +static int dma_heap_file_control_thread(void *data)
> > +{
> > +	struct dma_heap_file_control *heap_fctl =
> > +		(struct dma_heap_file_control *)data;
> > +	struct dma_heap_file_work *worker, *tmp;
> > +	int nr_work;
> > +
> > +	LIST_HEAD(pages);
> > +	LIST_HEAD(workers);
> > +
> > +	while (true) {
> > +		wait_event_freezable(heap_fctl->threadwq,
> > +				     atomic_read(&heap_fctl->nr_work) > 0);
> > +recheck:
> > +		spin_lock(&heap_fctl->lock);
> > +		list_splice_init(&heap_fctl->works, &workers);
> > +		spin_unlock(&heap_fctl->lock);
> > +
> > +		if (unlikely(kthread_should_stop())) {
> > +			list_for_each_entry_safe(worker, tmp, &workers, list) {
> > +				list_del(&worker->list);
> > +				destroy_file_work(worker);
> > +			}
> > +			break;
> > +		}
> > +
> > +		nr_work = 0;
> > +		list_for_each_entry_safe(worker, tmp, &workers, list) {
> > +			++nr_work;
> > +			list_del(&worker->list);
> > +			__work_this_io(worker);
> > +
> > +			destroy_file_work(worker);
> > +		}
> > +		atomic_sub(nr_work, &heap_fctl->nr_work);
> > +
> > +		if (atomic_read(&heap_fctl->nr_work) > 0)
> > +			goto recheck;
> > +	}
> > +	return 0;
> > +}
> > +
> > +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
> > +{
> > +	return heap_file->fsz;
> > +}
> > +
> > +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, int file_fd,
> > +				 size_t batch)
> > +{
> > +	struct file *file;
> > +	size_t fsz;
> > +	int ret;
> > +
> > +	file = fget(file_fd);
> > +	if (!file)
> > +		return -EINVAL;
> > +
> > +	fsz = i_size_read(file_inode(file));
> > +	if (fsz < batch) {
> > +		ret = -EINVAL;
> > +		goto err;
> > +	}
> > +
> > +	/**
> > +	 * Selinux block our read, but actually we are reading the stand-in
> > +	 * for this file.
> > +	 * So save current's cred and when going to read, override mine, and
> > +	 * end of read, revert.
> > +	 */
> > +	heap_file->cred = prepare_kernel_cred(current);
> > +	if (unlikely(!heap_file->cred)) {
> > +		ret = -ENOMEM;
> > +		goto err;
> > +	}
> > +
> > +	heap_file->file = file;
> > +	heap_file->max_batch = batch;
> > +	heap_file->fsz = fsz;
> > +
> > +	heap_file->direct = file->f_flags & O_DIRECT;
> > +
> > +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
> > +	if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
> > +		pr_warn("alloc read file better to use O_DIRECT to read larget file\n");
> > +
> > +	return 0;
> > +
> > +err:
> > +	fput(file);
> > +	return ret;
> > +}
> > +
> > +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
> > +{
> > +	fput(heap_file->file);
> > +	put_cred(heap_file->cred);
> > +}
> > +
> > +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, int file_fd,
> > +					   size_t batch, unsigned int fd_flags,
> > +					   unsigned int heap_flags)
> > +{
> > +	struct dma_buf *dmabuf;
> > +	int fd;
> > +	struct dma_heap_file heap_file;
> > +
> > +	fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
> > +	if (fd)
> > +		goto error_file;
> > +
> > +	dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
> > +					       heap_flags);
> > +	if (IS_ERR(dmabuf)) {
> > +		fd = PTR_ERR(dmabuf);
> > +		goto error;
> > +	}
> > +
> > +	fd = dma_buf_fd(dmabuf, fd_flags);
> > +	if (fd < 0) {
> > +		dma_buf_put(dmabuf);
> > +		/* just return, as put will call release and that will free */
> > +	}
> > +
> > +error:
> > +	destroy_dma_heap_file(&heap_file);
> > +error_file:
> > +	return fd;
> > +}
> > +
> >   static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
> >   				 u32 fd_flags,
> >   				 u64 heap_flags)
> > @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, struct file *file)
> >   	return 0;
> >   }
> > +static long dma_heap_ioctl_allocate_read_file(struct file *file, void *data)
> > +{
> > +	struct dma_heap_allocation_file_data *heap_allocation_file = data;
> > +	struct dma_heap *heap = file->private_data;
> > +	int fd;
> > +
> > +	if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
> > +		return -EINVAL;
> > +
> > +	if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
> > +		return -EINVAL;
> > +
> > +	if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
> > +		return -EINVAL;
> > +
> > +	if (!heap->ops->allocate_read_file)
> > +		return -EINVAL;
> > +
> > +	fd = dma_heap_buffer_alloc_read_file(
> > +		heap, heap_allocation_file->file_fd,
> > +		heap_allocation_file->batch ?
> > +			PAGE_ALIGN(heap_allocation_file->batch) :
> > +			DEFAULT_ADI_BATCH,
> > +		heap_allocation_file->fd_flags,
> > +		heap_allocation_file->heap_flags);
> > +	if (fd < 0)
> > +		return fd;
> > +
> > +	heap_allocation_file->fd = fd;
> > +	return 0;
> > +}
> > +
> >   static long dma_heap_ioctl_allocate(struct file *file, void *data)
> >   {
> >   	struct dma_heap_allocation_data *heap_allocation = data;
> > @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data)
> >   static unsigned int dma_heap_ioctl_cmds[] = {
> >   	DMA_HEAP_IOCTL_ALLOC,
> > +	DMA_HEAP_IOCTL_ALLOC_AND_READ,
> >   };
> >   static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
> > @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
> >   	case DMA_HEAP_IOCTL_ALLOC:
> >   		ret = dma_heap_ioctl_allocate(file, kdata);
> >   		break;
> > +	case DMA_HEAP_IOCTL_ALLOC_AND_READ:
> > +		ret = dma_heap_ioctl_allocate_read_file(file, kdata);
> > +		break;
> >   	default:
> >   		ret = -ENOTTY;
> >   		goto err;
> > @@ -316,11 +804,44 @@ static int dma_heap_init(void)
> >   	dma_heap_class = class_create(DEVNAME);
> >   	if (IS_ERR(dma_heap_class)) {
> > -		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
> > -		return PTR_ERR(dma_heap_class);
> > +		ret = PTR_ERR(dma_heap_class);
> > +		goto fail_class;
> >   	}
> >   	dma_heap_class->devnode = dma_heap_devnode;
> > +	heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
> > +	if (unlikely(!heap_fctl)) {
> > +		ret =  -ENOMEM;
> > +		goto fail_alloc;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&heap_fctl->works);
> > +	init_waitqueue_head(&heap_fctl->threadwq);
> > +	init_waitqueue_head(&heap_fctl->workwq);
> > +
> > +	heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
> > +					     heap_fctl, "heap_fwork_t");
> > +	if (IS_ERR(heap_fctl->work_thread)) {
> > +		ret = -ENOMEM;
> > +		goto fail_thread;
> > +	}
> > +
> > +	heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
> > +	if (unlikely(!heap_fctl->heap_fwork_cachep)) {
> > +		ret = -ENOMEM;
> > +		goto fail_cache;
> > +	}
> > +
> >   	return 0;
> > +
> > +fail_cache:
> > +	kthread_stop(heap_fctl->work_thread);
> > +fail_thread:
> > +	kfree(heap_fctl);
> > +fail_alloc:
> > +	class_destroy(dma_heap_class);
> > +fail_class:
> > +	unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
> > +	return ret;
> >   }
> >   subsys_initcall(dma_heap_init);
> > diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
> > index 064bad725061..9c25383f816c 100644
> > --- a/include/linux/dma-heap.h
> > +++ b/include/linux/dma-heap.h
> > @@ -12,12 +12,17 @@
> >   #include <linux/cdev.h>
> >   #include <linux/types.h>
> > +#define DEFAULT_ADI_BATCH (128 << 20)
> > +
> >   struct dma_heap;
> > +struct dma_heap_file_task;
> > +struct dma_heap_file;
> >   /**
> >    * struct dma_heap_ops - ops to operate on a given heap
> >    * @allocate:		allocate dmabuf and return struct dma_buf ptr
> > - *
> > + * @allocate_read_file: allocate dmabuf and read file, then return struct
> > + * dma_buf ptr.
> >    * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
> >    */
> >   struct dma_heap_ops {
> > @@ -25,6 +30,11 @@ struct dma_heap_ops {
> >   				    unsigned long len,
> >   				    u32 fd_flags,
> >   				    u64 heap_flags);
> > +
> > +	struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
> > +					      struct dma_heap_file *heap_file,
> > +					      u32 fd_flags,
> > +					      u64 heap_flags);
> >   };
> >   /**
> > @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap *heap);
> >    */
> >   struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
> > +/**
> > + * dma_heap_destroy_file_read - waits for a file read to complete then destroy it
> > + * Returns: true if the file read failed, false otherwise
> > + */
> > +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask);
> > +
> > +/**
> > + * dma_heap_wait_for_file_read - waits for a file read to complete
> > + * Returns: true if the file read failed, false otherwise
> > + */
> > +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask);
> > +
> > +/**
> > + * dma_heap_alloc_file_read - Declare a task to read file when allocate pages.
> > + * @heap_file:		target file to read
> > + *
> > + * Return NULL if failed, otherwise return a struct pointer.
> > + */
> > +struct dma_heap_file_task *
> > +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
> > +
> > +/**
> > + * dma_heap_prepare_file_read - cache each allocated page until we meet this batch.
> > + * @heap_ftask:		prepared and need to commit's work.
> > + * @page:		current allocated page. don't care which order.
> > + *
> > + * Returns true if reach to batch, false so go on prepare.
> > + */
> > +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
> > +				struct page *page);
> > +
> > +/**
> > + * dma_heap_commit_file_read -  prepare collect enough memory, going to trigger IO
> > + * @heap_ftask:			info that current IO needs
> > + *
> > + * This commit will also check if reach to tail read.
> > + * For direct I/O submissions, it is necessary to pay attention to file reads
> > + * that are not page-aligned. For the unaligned portion of the read, buffer IO
> > + * needs to be triggered.
> > + * Returns:
> > + *   0 if all right, -errno if something wrong
> > + */
> > +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
> > +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
> > +
> >   #endif /* _DMA_HEAPS_H */
> > diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
> > index a4cf716a49fa..8c20e8b74eed 100644
> > --- a/include/uapi/linux/dma-heap.h
> > +++ b/include/uapi/linux/dma-heap.h
> > @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
> >   	__u64 heap_flags;
> >   };
> > +/**
> > + * struct dma_heap_allocation_file_data - metadata passed from userspace for
> > + *                                      allocations and read file
> > + * @fd:			will be populated with a fd which provides the
> > + *			handle to the allocated dma-buf
> > + * @file_fd:		file descriptor to read from(suggested to use O_DIRECT open file)
> > + * @batch:		how many memory alloced then file read(bytes), default 128MB
> > + *			will auto aligned to PAGE_SIZE
> > + * @fd_flags:		file descriptor flags used when allocating
> > + * @heap_flags:		flags passed to heap
> > + *
> > + * Provided by userspace as an argument to the ioctl
> > + */
> > +struct dma_heap_allocation_file_data {
> > +	__u32 fd;
> > +	__u32 file_fd;
> > +	__u32 batch;
> > +	__u32 fd_flags;
> > +	__u64 heap_flags;
> > +};
> > +
> >   #define DMA_HEAP_IOC_MAGIC		'H'
> >   /**
> > @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
> >   #define DMA_HEAP_IOCTL_ALLOC	_IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
> >   				      struct dma_heap_allocation_data)
> > +/**
> > + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool and both
> > + *					read file when allocate memory.
> > + *
> > + * Takes a dma_heap_allocation_file_data struct and returns it with the fd field
> > + * populated with the dmabuf handle of the allocation. When return, the dma-buf
> > + * content is read from file.
> > + */
> > +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
> > +	_IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct dma_heap_allocation_file_data)
> > +
> >   #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>
Christian König July 15, 2024, 12:32 p.m. UTC | #14
Am 15.07.24 um 11:11 schrieb Daniel Vetter:
> On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>> Some user may need load file into dma-buf, current
>>> way is:
>>>     1. allocate a dma-buf, get dma-buf fd
>>>     2. mmap dma-buf fd into vaddr
>>>     3. read(file_fd, vaddr, fsz)
>>> This is too heavy if fsz reached to GB.
>> You need to describe a bit more why that is too heavy. I can only assume you
>> need to save memory bandwidth and avoid the extra copy with the CPU.
>>
>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>> User need to offer a file_fd which you want to load into dma-buf, then,
>>> it promise if you got a dma-buf fd, it will contains the file content.
>> Interesting idea, that has at least more potential than trying to enable
>> direct I/O on mmap()ed DMA-bufs.
>>
>> The approach with the new IOCTL might not work because it is a very
>> specialized use case.
>>
>> But IIRC there was a copy_file_range callback in the file_operations
>> structure you could use for that. I'm just not sure when and how that's used
>> with the copy_file_range() system call.
> I'm not sure any of those help, because internally they're all still based
> on struct page (or maybe in the future on folios). And that's the thing
> dma-buf can't give you, at least without peeking behind the curtain.
>
> I think an entirely different option would be malloc+udmabuf. That
> essentially handles the impedance mismatch between direct I/O and dma-buf
> on the dma-buf side. The downside is that it'll make the permanently
> pinned memory accounting and tracking issues even more apparent, but I
> guess eventually we do need to sort that one out.

Oh, very good idea!

Just one minor correction: it's not malloc+udmabuf, but rather 
memfd_create()+udmabuf.

And you need to complete your direct I/O before creating the udmabuf 
since that reference will prevent direct I/O from working.

Regards,
Christian.



>
> And since all the patches here are only for the pages system heap I'm
> guessing udmabuf should work out for the use-case here? Worth a shot at
> least.
> -Sima
>
>> Regards,
>> Christian.
>>
>>> Notice, file_fd depends on user how to open this file. So, both buffer
>>> I/O and Direct I/O is supported.
>>>
>>> Signed-off-by: Huan Yang <link@vivo.com>
>>> ---
>>>    drivers/dma-buf/dma-heap.c    | 525 +++++++++++++++++++++++++++++++++-
>>>    include/linux/dma-heap.h      |  57 +++-
>>>    include/uapi/linux/dma-heap.h |  32 +++
>>>    3 files changed, 611 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>> index 2298ca5e112e..abe17281adb8 100644
>>> --- a/drivers/dma-buf/dma-heap.c
>>> +++ b/drivers/dma-buf/dma-heap.c
>>> @@ -15,9 +15,11 @@
>>>    #include <linux/list.h>
>>>    #include <linux/slab.h>
>>>    #include <linux/nospec.h>
>>> +#include <linux/highmem.h>
>>>    #include <linux/uaccess.h>
>>>    #include <linux/syscalls.h>
>>>    #include <linux/dma-heap.h>
>>> +#include <linux/vmalloc.h>
>>>    #include <uapi/linux/dma-heap.h>
>>>    #define DEVNAME "dma_heap"
>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>    	struct cdev heap_cdev;
>>>    };
>>> +/**
>>> + * struct dma_heap_file - wrap the file, read task for dma_heap allocate use.
>>> + * @file:		file to read from.
>>> + *
>>> + * @cred:		kthread use, user cred copy to use for the read.
>>> + *
>>> + * @max_batch:		maximum batch size to read, if collect match batch,
>>> + *			trigger read, default 128MB, must below file size.
>>> + *
>>> + * @fsz:		file size.
>>> + *
>>> + * @direct:		use direct IO?
>>> + */
>>> +struct dma_heap_file {
>>> +	struct file *file;
>>> +	struct cred *cred;
>>> +	size_t max_batch;
>>> +	size_t fsz;
>>> +	bool direct;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_work - represents a dma_heap file read real work.
>>> + * @vaddr:		contigous virtual address alloc by vmap, file read need.
>>> + *
>>> + * @start_size:		file read start offset, same to @dma_heap_file_task->roffset.
>>> + *
>>> + * @need_size:		file read need size, same to @dma_heap_file_task->rsize.
>>> + *
>>> + * @heap_file:		file wrapper.
>>> + *
>>> + * @list:		child node of @dma_heap_file_control->works.
>>> + *
>>> + * @refp:		same @dma_heap_file_task->ref, if end of read, put ref.
>>> + *
>>> + * @failp:		if any work io failed, set it true, pointp @dma_heap_file_task->fail.
>>> + */
>>> +struct dma_heap_file_work {
>>> +	void *vaddr;
>>> +	ssize_t start_size;
>>> +	ssize_t need_size;
>>> +	struct dma_heap_file *heap_file;
>>> +	struct list_head list;
>>> +	atomic_t *refp;
>>> +	bool *failp;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_task - represents a dma_heap file read process
>>> + * @ref:		current file work counter, if zero, allocate and read
>>> + *			done.
>>> + *
>>> + * @roffset:		last read offset, current prepared work' begin file
>>> + *			start offset.
>>> + *
>>> + * @rsize:		current allocated page size use to read, if reach rbatch,
>>> + *			trigger commit.
>>> + *
>>> + * @rbatch:		current prepared work's batch, below @dma_heap_file's
>>> + *			batch.
>>> + *
>>> + * @heap_file:		current dma_heap_file
>>> + *
>>> + * @parray:		used for vmap, size is @dma_heap_file's batch's number
>>> + *			pages.(this is maximum). Due to single thread file read,
>>> + *			one page array reuse each work prepare is OK.
>>> + *			Each index in parray is PAGE_SIZE.(vmap need)
>>> + *
>>> + * @pindex:		current allocated page filled in @parray's index.
>>> + *
>>> + * @fail:		any work failed when file read?
>>> + *
>>> + * dma_heap_file_task is the production of file read, will prepare each work
>>> + * during allocate dma_buf pages, if match current batch, then trigger commit
>>> + * and prepare next work. After all batch queued, user going on prepare dma_buf
>>> + * and so on, but before return dma_buf fd, need to wait file read end and
>>> + * check read result.
>>> + */
>>> +struct dma_heap_file_task {
>>> +	atomic_t ref;
>>> +	size_t roffset;
>>> +	size_t rsize;
>>> +	size_t rbatch;
>>> +	struct dma_heap_file *heap_file;
>>> +	struct page **parray;
>>> +	unsigned int pindex;
>>> +	bool fail;
>>> +};
>>> +
>>> +/**
>>> + * struct dma_heap_file_control - global control of dma_heap file read.
>>> + * @works:		@dma_heap_file_work's list head.
>>> + *
>>> + * @lock:		only lock for @works.
>>> + *
>>> + * @threadwq:		wait queue for @work_thread, if commit work, @work_thread
>>> + *			wakeup and read this work's file contains.
>>> + *
>>> + * @workwq:		used for main thread wait for file read end, if allocation
>>> + *			end before file read. @dma_heap_file_task ref effect this.
>>> + *
>>> + * @work_thread:	file read kthread. the dma_heap_file_task work's consumer.
>>> + *
>>> + * @heap_fwork_cachep:	@dma_heap_file_work's cachep, it's alloc/free frequently.
>>> + *
>>> + * @nr_work:		global number of how many work committed.
>>> + */
>>> +struct dma_heap_file_control {
>>> +	struct list_head works;
>>> +	spinlock_t lock;
>>> +	wait_queue_head_t threadwq;
>>> +	wait_queue_head_t workwq;
>>> +	struct task_struct *work_thread;
>>> +	struct kmem_cache *heap_fwork_cachep;
>>> +	atomic_t nr_work;
>>> +};
>>> +
>>> +static struct dma_heap_file_control *heap_fctl;
>>>    static LIST_HEAD(heap_list);
>>>    static DEFINE_MUTEX(heap_list_lock);
>>>    static dev_t dma_heap_devt;
>>>    static struct class *dma_heap_class;
>>>    static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>> +/**
>>> + * map_pages_to_vaddr - map each scatter page into contiguous virtual address.
>>> + * @heap_ftask:		prepared and need to commit's work.
>>> + *
>>> + * Cached pages need to trigger file read, this function map each scatter page
>>> + * into contiguous virtual address, so that file read can easy use.
>>> + * Now that we get vaddr page, cached pages can return to original user, so we
>>> + * will not effect dma-buf export even if file read not end.
>>> + */
>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +	return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>> +		    PAGE_KERNEL);
>>> +}
>>> +
>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>>> +				struct page *page)
>>> +{
>>> +	struct page **array = heap_ftask->parray;
>>> +	int index = heap_ftask->pindex;
>>> +	int num = compound_nr(page), i;
>>> +	unsigned long sz = page_size(page);
>>> +
>>> +	heap_ftask->rsize += sz;
>>> +	for (i = 0; i < num; ++i)
>>> +		array[index++] = &page[i];
>>> +	heap_ftask->pindex = index;
>>> +
>>> +	return heap_ftask->rsize >= heap_ftask->rbatch;
>>> +}
>>> +
>>> +static struct dma_heap_file_work *
>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +	struct dma_heap_file_work *heap_fwork;
>>> +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>> +
>>> +	if (READ_ONCE(heap_ftask->fail))
>>> +		return NULL;
>>> +
>>> +	heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
>>> +	if (unlikely(!heap_fwork))
>>> +		return NULL;
>>> +
>>> +	heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>> +	if (unlikely(!heap_fwork->vaddr)) {
>>> +		kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>> +		return NULL;
>>> +	}
>>> +
>>> +	heap_fwork->heap_file = heap_file;
>>> +	heap_fwork->start_size = heap_ftask->roffset;
>>> +	heap_fwork->need_size = heap_ftask->rsize;
>>> +	heap_fwork->refp = &heap_ftask->ref;
>>> +	heap_fwork->failp = &heap_ftask->fail;
>>> +	atomic_inc(&heap_ftask->ref);
>>> +	return heap_fwork;
>>> +}
>>> +
>>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>>> +{
>>> +	vunmap(heap_fwork->vaddr);
>>> +	atomic_dec(heap_fwork->refp);
>>> +	wake_up(&heap_fctl->workwq);
>>> +
>>> +	kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>> +}
>>> +
>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +	struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
>>> +	struct page *last = NULL;
>>> +	struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>> +	size_t start = heap_ftask->roffset;
>>> +	struct file *file = heap_file->file;
>>> +	size_t fsz = heap_file->fsz;
>>> +
>>> +	if (unlikely(!heap_fwork))
>>> +		return -ENOMEM;
>>> +
>>> +	/**
>>> +	 * If file size is not page aligned, direct io can't process the tail.
>>> +	 * So, if reach to tail, remain the last page use buffer read.
>>> +	 */
>>> +	if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>> +		heap_fwork->need_size -= PAGE_SIZE;
>>> +		last = heap_ftask->parray[heap_ftask->pindex - 1];
>>> +	}
>>> +
>>> +	spin_lock(&heap_fctl->lock);
>>> +	list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>> +	spin_unlock(&heap_fctl->lock);
>>> +	atomic_inc(&heap_fctl->nr_work);
>>> +
>>> +	wake_up(&heap_fctl->threadwq);
>>> +
>>> +	if (last) {
>>> +		char *buf, *pathp;
>>> +		ssize_t err;
>>> +		void *buffer;
>>> +
>>> +		buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>> +		if (unlikely(!buf))
>>> +			return -ENOMEM;
>>> +
>>> +		start = PAGE_ALIGN_DOWN(fsz);
>>> +
>>> +		pathp = file_path(file, buf, PATH_MAX);
>>> +		if (IS_ERR(pathp)) {
>>> +			kfree(buf);
>>> +			return PTR_ERR(pathp);
>>> +		}
>>> +
>>> +		buffer = kmap_local_page(last); // use page's kaddr.
>>> +		err = kernel_read_file_from_path(pathp, start, &buffer,
>>> +						 fsz - start, &fsz,
>>> +						 READING_POLICY);
>>> +		kunmap_local(buffer);
>>> +		kfree(buf);
>>> +		if (err < 0) {
>>> +			pr_err("failed to use buffer kernel_read_file %s, err=%ld, [%ld, %ld], f_sz=%ld\n",
>>> +			       pathp, err, start, fsz, fsz);
>>> +
>>> +			return err;
>>> +		}
>>> +	}
>>> +
>>> +	heap_ftask->roffset += heap_ftask->rsize;
>>> +	heap_ftask->rsize = 0;
>>> +	heap_ftask->pindex = 0;
>>> +	heap_ftask->rbatch = min_t(size_t,
>>> +				   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>> +				   heap_ftask->rbatch);
>>> +	return 0;
>>> +}
>>> +
>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +	wait_event_freezable(heap_fctl->workwq,
>>> +			     atomic_read(&heap_ftask->ref) == 0);
>>> +	return heap_ftask->fail;
>>> +}
>>> +
>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
>>> +{
>>> +	bool fail;
>>> +
>>> +	dma_heap_wait_for_file_read(heap_ftask);
>>> +	fail = heap_ftask->fail;
>>> +	kvfree(heap_ftask->parray);
>>> +	kfree(heap_ftask);
>>> +	return fail;
>>> +}
>>> +
>>> +struct dma_heap_file_task *
>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>> +{
>>> +	struct dma_heap_file_task *heap_ftask =
>>> +		kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>> +	if (unlikely(!heap_ftask))
>>> +		return NULL;
>>> +
>>> +	/**
>>> +	 * Batch is the maximum size which we prepare work will meet.
>>> +	 * So, direct alloc this number's page array is OK.
>>> +	 */
>>> +	heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> PAGE_SHIFT,
>>> +					    sizeof(struct page *), GFP_KERNEL);
>>> +	if (unlikely(!heap_ftask->parray))
>>> +		goto put;
>>> +
>>> +	heap_ftask->heap_file = heap_file;
>>> +	heap_ftask->rbatch = heap_file->max_batch;
>>> +	return heap_ftask;
>>> +put:
>>> +	kfree(heap_ftask);
>>> +	return NULL;
>>> +}
>>> +
>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>> +{
>>> +	struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>> +	struct file *file = heap_file->file;
>>> +	ssize_t start = heap_fwork->start_size;
>>> +	ssize_t size = heap_fwork->need_size;
>>> +	void *buffer = heap_fwork->vaddr;
>>> +	const struct cred *old_cred;
>>> +	ssize_t err;
>>> +
>>> +	// use real task's cred to read this file.
>>> +	old_cred = override_creds(heap_file->cred);
>>> +	err = kernel_read_file(file, start, &buffer, size, &heap_file->fsz,
>>> +			       READING_POLICY);
>>> +	if (err < 0) {
>>> +		pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
>>> +		       err, start, (start + size), heap_file->fsz);
>>> +		WRITE_ONCE(*heap_fwork->failp, true);
>>> +	}
>>> +	// recovery to my cred.
>>> +	revert_creds(old_cred);
>>> +}
>>> +
>>> +static int dma_heap_file_control_thread(void *data)
>>> +{
>>> +	struct dma_heap_file_control *heap_fctl =
>>> +		(struct dma_heap_file_control *)data;
>>> +	struct dma_heap_file_work *worker, *tmp;
>>> +	int nr_work;
>>> +
>>> +	LIST_HEAD(pages);
>>> +	LIST_HEAD(workers);
>>> +
>>> +	while (true) {
>>> +		wait_event_freezable(heap_fctl->threadwq,
>>> +				     atomic_read(&heap_fctl->nr_work) > 0);
>>> +recheck:
>>> +		spin_lock(&heap_fctl->lock);
>>> +		list_splice_init(&heap_fctl->works, &workers);
>>> +		spin_unlock(&heap_fctl->lock);
>>> +
>>> +		if (unlikely(kthread_should_stop())) {
>>> +			list_for_each_entry_safe(worker, tmp, &workers, list) {
>>> +				list_del(&worker->list);
>>> +				destroy_file_work(worker);
>>> +			}
>>> +			break;
>>> +		}
>>> +
>>> +		nr_work = 0;
>>> +		list_for_each_entry_safe(worker, tmp, &workers, list) {
>>> +			++nr_work;
>>> +			list_del(&worker->list);
>>> +			__work_this_io(worker);
>>> +
>>> +			destroy_file_work(worker);
>>> +		}
>>> +		atomic_sub(nr_work, &heap_fctl->nr_work);
>>> +
>>> +		if (atomic_read(&heap_fctl->nr_work) > 0)
>>> +			goto recheck;
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>> +{
>>> +	return heap_file->fsz;
>>> +}
>>> +
>>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, int file_fd,
>>> +				 size_t batch)
>>> +{
>>> +	struct file *file;
>>> +	size_t fsz;
>>> +	int ret;
>>> +
>>> +	file = fget(file_fd);
>>> +	if (!file)
>>> +		return -EINVAL;
>>> +
>>> +	fsz = i_size_read(file_inode(file));
>>> +	if (fsz < batch) {
>>> +		ret = -EINVAL;
>>> +		goto err;
>>> +	}
>>> +
>>> +	/**
>>> +	 * Selinux block our read, but actually we are reading the stand-in
>>> +	 * for this file.
>>> +	 * So save current's cred and when going to read, override mine, and
>>> +	 * end of read, revert.
>>> +	 */
>>> +	heap_file->cred = prepare_kernel_cred(current);
>>> +	if (unlikely(!heap_file->cred)) {
>>> +		ret = -ENOMEM;
>>> +		goto err;
>>> +	}
>>> +
>>> +	heap_file->file = file;
>>> +	heap_file->max_batch = batch;
>>> +	heap_file->fsz = fsz;
>>> +
>>> +	heap_file->direct = file->f_flags & O_DIRECT;
>>> +
>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>> +	if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>> +		pr_warn("alloc read file better to use O_DIRECT to read larget file\n");
>>> +
>>> +	return 0;
>>> +
>>> +err:
>>> +	fput(file);
>>> +	return ret;
>>> +}
>>> +
>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>> +{
>>> +	fput(heap_file->file);
>>> +	put_cred(heap_file->cred);
>>> +}
>>> +
>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, int file_fd,
>>> +					   size_t batch, unsigned int fd_flags,
>>> +					   unsigned int heap_flags)
>>> +{
>>> +	struct dma_buf *dmabuf;
>>> +	int fd;
>>> +	struct dma_heap_file heap_file;
>>> +
>>> +	fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>> +	if (fd)
>>> +		goto error_file;
>>> +
>>> +	dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
>>> +					       heap_flags);
>>> +	if (IS_ERR(dmabuf)) {
>>> +		fd = PTR_ERR(dmabuf);
>>> +		goto error;
>>> +	}
>>> +
>>> +	fd = dma_buf_fd(dmabuf, fd_flags);
>>> +	if (fd < 0) {
>>> +		dma_buf_put(dmabuf);
>>> +		/* just return, as put will call release and that will free */
>>> +	}
>>> +
>>> +error:
>>> +	destroy_dma_heap_file(&heap_file);
>>> +error_file:
>>> +	return fd;
>>> +}
>>> +
>>>    static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>    				 u32 fd_flags,
>>>    				 u64 heap_flags)
>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, struct file *file)
>>>    	return 0;
>>>    }
>>> +static long dma_heap_ioctl_allocate_read_file(struct file *file, void *data)
>>> +{
>>> +	struct dma_heap_allocation_file_data *heap_allocation_file = data;
>>> +	struct dma_heap *heap = file->private_data;
>>> +	int fd;
>>> +
>>> +	if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>> +		return -EINVAL;
>>> +
>>> +	if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>> +		return -EINVAL;
>>> +
>>> +	if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
>>> +		return -EINVAL;
>>> +
>>> +	if (!heap->ops->allocate_read_file)
>>> +		return -EINVAL;
>>> +
>>> +	fd = dma_heap_buffer_alloc_read_file(
>>> +		heap, heap_allocation_file->file_fd,
>>> +		heap_allocation_file->batch ?
>>> +			PAGE_ALIGN(heap_allocation_file->batch) :
>>> +			DEFAULT_ADI_BATCH,
>>> +		heap_allocation_file->fd_flags,
>>> +		heap_allocation_file->heap_flags);
>>> +	if (fd < 0)
>>> +		return fd;
>>> +
>>> +	heap_allocation_file->fd = fd;
>>> +	return 0;
>>> +}
>>> +
>>>    static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>    {
>>>    	struct dma_heap_allocation_data *heap_allocation = data;
>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>    static unsigned int dma_heap_ioctl_cmds[] = {
>>>    	DMA_HEAP_IOCTL_ALLOC,
>>> +	DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>    };
>>>    static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>>    	case DMA_HEAP_IOCTL_ALLOC:
>>>    		ret = dma_heap_ioctl_allocate(file, kdata);
>>>    		break;
>>> +	case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>> +		ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>> +		break;
>>>    	default:
>>>    		ret = -ENOTTY;
>>>    		goto err;
>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>    	dma_heap_class = class_create(DEVNAME);
>>>    	if (IS_ERR(dma_heap_class)) {
>>> -		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>> -		return PTR_ERR(dma_heap_class);
>>> +		ret = PTR_ERR(dma_heap_class);
>>> +		goto fail_class;
>>>    	}
>>>    	dma_heap_class->devnode = dma_heap_devnode;
>>> +	heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>> +	if (unlikely(!heap_fctl)) {
>>> +		ret =  -ENOMEM;
>>> +		goto fail_alloc;
>>> +	}
>>> +
>>> +	INIT_LIST_HEAD(&heap_fctl->works);
>>> +	init_waitqueue_head(&heap_fctl->threadwq);
>>> +	init_waitqueue_head(&heap_fctl->workwq);
>>> +
>>> +	heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
>>> +					     heap_fctl, "heap_fwork_t");
>>> +	if (IS_ERR(heap_fctl->work_thread)) {
>>> +		ret = -ENOMEM;
>>> +		goto fail_thread;
>>> +	}
>>> +
>>> +	heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
>>> +	if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>> +		ret = -ENOMEM;
>>> +		goto fail_cache;
>>> +	}
>>> +
>>>    	return 0;
>>> +
>>> +fail_cache:
>>> +	kthread_stop(heap_fctl->work_thread);
>>> +fail_thread:
>>> +	kfree(heap_fctl);
>>> +fail_alloc:
>>> +	class_destroy(dma_heap_class);
>>> +fail_class:
>>> +	unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>> +	return ret;
>>>    }
>>>    subsys_initcall(dma_heap_init);
>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>> index 064bad725061..9c25383f816c 100644
>>> --- a/include/linux/dma-heap.h
>>> +++ b/include/linux/dma-heap.h
>>> @@ -12,12 +12,17 @@
>>>    #include <linux/cdev.h>
>>>    #include <linux/types.h>
>>> +#define DEFAULT_ADI_BATCH (128 << 20)
>>> +
>>>    struct dma_heap;
>>> +struct dma_heap_file_task;
>>> +struct dma_heap_file;
>>>    /**
>>>     * struct dma_heap_ops - ops to operate on a given heap
>>>     * @allocate:		allocate dmabuf and return struct dma_buf ptr
>>> - *
>>> + * @allocate_read_file: allocate dmabuf and read file, then return struct
>>> + * dma_buf ptr.
>>>     * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>     */
>>>    struct dma_heap_ops {
>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>    				    unsigned long len,
>>>    				    u32 fd_flags,
>>>    				    u64 heap_flags);
>>> +
>>> +	struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>> +					      struct dma_heap_file *heap_file,
>>> +					      u32 fd_flags,
>>> +					      u64 heap_flags);
>>>    };
>>>    /**
>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap *heap);
>>>     */
>>>    struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
>>> +/**
>>> + * dma_heap_destroy_file_read - waits for a file read to complete then destroy it
>>> + * Returns: true if the file read failed, false otherwise
>>> + */
>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask);
>>> +
>>> +/**
>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>> + * Returns: true if the file read failed, false otherwise
>>> + */
>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask);
>>> +
>>> +/**
>>> + * dma_heap_alloc_file_read - Declare a task to read file when allocate pages.
>>> + * @heap_file:		target file to read
>>> + *
>>> + * Return NULL if failed, otherwise return a struct pointer.
>>> + */
>>> +struct dma_heap_file_task *
>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>> +
>>> +/**
>>> + * dma_heap_prepare_file_read - cache each allocated page until we meet this batch.
>>> + * @heap_ftask:		prepared and need to commit's work.
>>> + * @page:		current allocated page. don't care which order.
>>> + *
>>> + * Returns true if reach to batch, false so go on prepare.
>>> + */
>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
>>> +				struct page *page);
>>> +
>>> +/**
>>> + * dma_heap_commit_file_read -  prepare collect enough memory, going to trigger IO
>>> + * @heap_ftask:			info that current IO needs
>>> + *
>>> + * This commit will also check if reach to tail read.
>>> + * For direct I/O submissions, it is necessary to pay attention to file reads
>>> + * that are not page-aligned. For the unaligned portion of the read, buffer IO
>>> + * needs to be triggered.
>>> + * Returns:
>>> + *   0 if all right, -errno if something wrong
>>> + */
>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>> +
>>>    #endif /* _DMA_HEAPS_H */
>>> diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
>>> index a4cf716a49fa..8c20e8b74eed 100644
>>> --- a/include/uapi/linux/dma-heap.h
>>> +++ b/include/uapi/linux/dma-heap.h
>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>    	__u64 heap_flags;
>>>    };
>>> +/**
>>> + * struct dma_heap_allocation_file_data - metadata passed from userspace for
>>> + *                                      allocations and read file
>>> + * @fd:			will be populated with a fd which provides the
>>> + *			handle to the allocated dma-buf
>>> + * @file_fd:		file descriptor to read from(suggested to use O_DIRECT open file)
>>> + * @batch:		how many memory alloced then file read(bytes), default 128MB
>>> + *			will auto aligned to PAGE_SIZE
>>> + * @fd_flags:		file descriptor flags used when allocating
>>> + * @heap_flags:		flags passed to heap
>>> + *
>>> + * Provided by userspace as an argument to the ioctl
>>> + */
>>> +struct dma_heap_allocation_file_data {
>>> +	__u32 fd;
>>> +	__u32 file_fd;
>>> +	__u32 batch;
>>> +	__u32 fd_flags;
>>> +	__u64 heap_flags;
>>> +};
>>> +
>>>    #define DMA_HEAP_IOC_MAGIC		'H'
>>>    /**
>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>    #define DMA_HEAP_IOCTL_ALLOC	_IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>    				      struct dma_heap_allocation_data)
>>> +/**
>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool and both
>>> + *					read file when allocate memory.
>>> + *
>>> + * Takes a dma_heap_allocation_file_data struct and returns it with the fd field
>>> + * populated with the dmabuf handle of the allocation. When return, the dma-buf
>>> + * content is read from file.
>>> + */
>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>> +	_IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct dma_heap_allocation_file_data)
>>> +
>>>    #endif /* _UAPI_LINUX_DMABUF_POOL_H */
Huan Yang July 16, 2024, 2:48 a.m. UTC | #15
I just looked into udmabuf; please correct me if I'm wrong.

在 2024/7/15 20:32, Christian König 写道:
> Am 15.07.24 um 11:11 schrieb Daniel Vetter:
>> On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>> Some user may need load file into dma-buf, current
>>>> way is:
>>>>     1. allocate a dma-buf, get dma-buf fd
>>>>     2. mmap dma-buf fd into vaddr
>>>>     3. read(file_fd, vaddr, fsz)
>>>> This is too heavy if fsz reached to GB.
>>> You need to describe a bit more why that is to heavy. I can only 
>>> assume you
>>> need to save memory bandwidth and avoid the extra copy with the CPU.
>>>
>>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>> User need to offer a file_fd which you want to load into dma-buf, 
>>>> then,
>>>> it promise if you got a dma-buf fd, it will contains the file content.
>>> Interesting idea, that has at least more potential than trying to 
>>> enable
>>> direct I/O on mmap()ed DMA-bufs.
>>>
>>> The approach with the new IOCTL might not work because it is a very
>>> specialized use case.
>>>
>>> But IIRC there was a copy_file_range callback in the file_operations
>>> structure you could use for that. I'm just not sure when and how 
>>> that's used
>>> with the copy_file_range() system call.
>> I'm not sure any of those help, because internally they're all still 
>> based
>> on struct page (or maybe in the future on folios). And that's the thing
>> dma-buf can't give you, at least without peaking behind the curtain.
>>
>> I think an entirely different option would be malloc+udmabuf. That
>> essentially handles the impendence-mismatch between direct I/O and 
>> dma-buf
>> on the dma-buf side. The downside is that it'll make the permanently
>> pinned memory accounting and tracking issues even more apparent, but I
>> guess eventually we do need to sort that one out.
>
> Oh, very good idea!
> Just one minor correction: it's not malloc+udmabuf, but rather 
> create_memfd()+udmabuf.
>
> And you need to complete your direct I/O before creating the udmabuf 
> since that reference will prevent direct I/O from working.

udmabuf pins all of its pages, so once the fd has been returned, direct 
I/O can no longer be used on them (same as with dma-buf). The read must 
therefore complete before the pages are pinned.

But the current implementation uses `memfd_pin_folios` to speed up 
allocation and pinning, so this approach may need to be adapted to it.


I currently doubt that the udmabuf solution is suitable for our 
gigabyte-level read operations.

1. The current mmap operation uses faulting, so frequent page faults 
will be triggered during reads, resulting in a lot of context switching 
overhead.

2. The current udmabuf size limit is 64MB; even though it can be 
changed, it may not be well suited to very large buffers.

3. The migration and adaptation of the driver is also a challenge, and 
currently, we are unable to control it.

Perhaps implementing `copy_file_range` would be more suitable for us.

>
> Regards,
> Christian.
>
>
>
>>
>> And since all the patches here are only for the pages system heap I'm
>> guess udmabuf should work out for the use-case here? Worth a shot at
>> least.
>> -Sima
>>
>>> Regards,
>>> Christian.
>>>
>>>> Notice, file_fd depends on user how to open this file. So, both buffer
>>>> I/O and Direct I/O is supported.
>>>>
>>>> Signed-off-by: Huan Yang <link@vivo.com>
>>>> ---
>>>>    drivers/dma-buf/dma-heap.c    | 525 
>>>> +++++++++++++++++++++++++++++++++-
>>>>    include/linux/dma-heap.h      |  57 +++-
>>>>    include/uapi/linux/dma-heap.h |  32 +++
>>>>    3 files changed, 611 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>>> index 2298ca5e112e..abe17281adb8 100644
>>>> --- a/drivers/dma-buf/dma-heap.c
>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>> @@ -15,9 +15,11 @@
>>>>    #include <linux/list.h>
>>>>    #include <linux/slab.h>
>>>>    #include <linux/nospec.h>
>>>> +#include <linux/highmem.h>
>>>>    #include <linux/uaccess.h>
>>>>    #include <linux/syscalls.h>
>>>>    #include <linux/dma-heap.h>
>>>> +#include <linux/vmalloc.h>
>>>>    #include <uapi/linux/dma-heap.h>
>>>>    #define DEVNAME "dma_heap"
>>>> @@ -43,12 +45,462 @@ struct dma_heap {
>>>>        struct cdev heap_cdev;
>>>>    };
>>>> +/**
>>>> + * struct dma_heap_file - wrap the file, read task for dma_heap 
>>>> allocate use.
>>>> + * @file:        file to read from.
>>>> + *
>>>> + * @cred:        kthread use, user cred copy to use for the read.
>>>> + *
>>>> + * @max_batch:        maximum batch size to read, if collect match 
>>>> batch,
>>>> + *            trigger read, default 128MB, must below file size.
>>>> + *
>>>> + * @fsz:        file size.
>>>> + *
>>>> + * @direct:        use direct IO?
>>>> + */
>>>> +struct dma_heap_file {
>>>> +    struct file *file;
>>>> +    struct cred *cred;
>>>> +    size_t max_batch;
>>>> +    size_t fsz;
>>>> +    bool direct;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_work - represents a dma_heap file read 
>>>> real work.
>>>> + * @vaddr:        contigous virtual address alloc by vmap, file 
>>>> read need.
>>>> + *
>>>> + * @start_size:        file read start offset, same to 
>>>> @dma_heap_file_task->roffset.
>>>> + *
>>>> + * @need_size:        file read need size, same to 
>>>> @dma_heap_file_task->rsize.
>>>> + *
>>>> + * @heap_file:        file wrapper.
>>>> + *
>>>> + * @list:        child node of @dma_heap_file_control->works.
>>>> + *
>>>> + * @refp:        same @dma_heap_file_task->ref, if end of read, 
>>>> put ref.
>>>> + *
>>>> + * @failp:        if any work io failed, set it true, pointp 
>>>> @dma_heap_file_task->fail.
>>>> + */
>>>> +struct dma_heap_file_work {
>>>> +    void *vaddr;
>>>> +    ssize_t start_size;
>>>> +    ssize_t need_size;
>>>> +    struct dma_heap_file *heap_file;
>>>> +    struct list_head list;
>>>> +    atomic_t *refp;
>>>> +    bool *failp;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_task - represents a dma_heap file read 
>>>> process
>>>> + * @ref:        current file work counter, if zero, allocate and read
>>>> + *            done.
>>>> + *
>>>> + * @roffset:        last read offset, current prepared work' begin 
>>>> file
>>>> + *            start offset.
>>>> + *
>>>> + * @rsize:        current allocated page size use to read, if 
>>>> reach rbatch,
>>>> + *            trigger commit.
>>>> + *
>>>> + * @rbatch:        current prepared work's batch, below 
>>>> @dma_heap_file's
>>>> + *            batch.
>>>> + *
>>>> + * @heap_file:        current dma_heap_file
>>>> + *
>>>> + * @parray:        used for vmap, size is @dma_heap_file's batch's 
>>>> number
>>>> + *            pages.(this is maximum). Due to single thread file 
>>>> read,
>>>> + *            one page array reuse each work prepare is OK.
>>>> + *            Each index in parray is PAGE_SIZE.(vmap need)
>>>> + *
>>>> + * @pindex:        current allocated page filled in @parray's index.
>>>> + *
>>>> + * @fail:        any work failed when file read?
>>>> + *
>>>> + * dma_heap_file_task is the production of file read, will prepare 
>>>> each work
>>>> + * during allocate dma_buf pages, if match current batch, then 
>>>> trigger commit
>>>> + * and prepare next work. After all batch queued, user going on 
>>>> prepare dma_buf
>>>> + * and so on, but before return dma_buf fd, need to wait file read 
>>>> end and
>>>> + * check read result.
>>>> + */
>>>> +struct dma_heap_file_task {
>>>> +    atomic_t ref;
>>>> +    size_t roffset;
>>>> +    size_t rsize;
>>>> +    size_t rbatch;
>>>> +    struct dma_heap_file *heap_file;
>>>> +    struct page **parray;
>>>> +    unsigned int pindex;
>>>> +    bool fail;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct dma_heap_file_control - global control of dma_heap file 
>>>> read.
>>>> + * @works:        @dma_heap_file_work's list head.
>>>> + *
>>>> + * @lock:        only lock for @works.
>>>> + *
>>>> + * @threadwq:        wait queue for @work_thread, if commit work, 
>>>> @work_thread
>>>> + *            wakeup and read this work's file contains.
>>>> + *
>>>> + * @workwq:        used for main thread wait for file read end, if 
>>>> allocation
>>>> + *            end before file read. @dma_heap_file_task ref effect 
>>>> this.
>>>> + *
>>>> + * @work_thread:    file read kthread. the dma_heap_file_task 
>>>> work's consumer.
>>>> + *
>>>> + * @heap_fwork_cachep:    @dma_heap_file_work's cachep, it's 
>>>> alloc/free frequently.
>>>> + *
>>>> + * @nr_work:        global number of how many work committed.
>>>> + */
>>>> +struct dma_heap_file_control {
>>>> +    struct list_head works;
>>>> +    spinlock_t lock;
>>>> +    wait_queue_head_t threadwq;
>>>> +    wait_queue_head_t workwq;
>>>> +    struct task_struct *work_thread;
>>>> +    struct kmem_cache *heap_fwork_cachep;
>>>> +    atomic_t nr_work;
>>>> +};
>>>> +
>>>> +static struct dma_heap_file_control *heap_fctl;
>>>>    static LIST_HEAD(heap_list);
>>>>    static DEFINE_MUTEX(heap_list_lock);
>>>>    static dev_t dma_heap_devt;
>>>>    static struct class *dma_heap_class;
>>>>    static DEFINE_XARRAY_ALLOC(dma_heap_minors);
>>>> +/**
>>>> + * map_pages_to_vaddr - map each scatter page into contiguous 
>>>> virtual address.
>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>> + *
>>>> + * Cached pages need to trigger file read, this function map each 
>>>> scatter page
>>>> + * into contiguous virtual address, so that file read can easy use.
>>>> + * Now that we get vaddr page, cached pages can return to original 
>>>> user, so we
>>>> + * will not effect dma-buf export even if file read not end.
>>>> + */
>>>> +static void *map_pages_to_vaddr(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
>>>> +            PAGE_KERNEL);
>>>> +}
>>>> +
>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>> *heap_ftask,
>>>> +                struct page *page)
>>>> +{
>>>> +    struct page **array = heap_ftask->parray;
>>>> +    int index = heap_ftask->pindex;
>>>> +    int num = compound_nr(page), i;
>>>> +    unsigned long sz = page_size(page);
>>>> +
>>>> +    heap_ftask->rsize += sz;
>>>> +    for (i = 0; i < num; ++i)
>>>> +        array[index++] = &page[i];
>>>> +    heap_ftask->pindex = index;
>>>> +
>>>> +    return heap_ftask->rsize >= heap_ftask->rbatch;
>>>> +}
>>>> +
>>>> +static struct dma_heap_file_work *
>>>> +init_file_work(struct dma_heap_file_task *heap_ftask)
>>>> +{
>>>> +    struct dma_heap_file_work *heap_fwork;
>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>> +
>>>> +    if (READ_ONCE(heap_ftask->fail))
>>>> +        return NULL;
>>>> +
>>>> +    heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, 
>>>> GFP_KERNEL);
>>>> +    if (unlikely(!heap_fwork))
>>>> +        return NULL;
>>>> +
>>>> +    heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
>>>> +    if (unlikely(!heap_fwork->vaddr)) {
>>>> +        kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    heap_fwork->heap_file = heap_file;
>>>> +    heap_fwork->start_size = heap_ftask->roffset;
>>>> +    heap_fwork->need_size = heap_ftask->rsize;
>>>> +    heap_fwork->refp = &heap_ftask->ref;
>>>> +    heap_fwork->failp = &heap_ftask->fail;
>>>> +    atomic_inc(&heap_ftask->ref);
>>>> +    return heap_fwork;
>>>> +}
>>>> +
>>>> +static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
>>>> +{
>>>> +    vunmap(heap_fwork->vaddr);
>>>> +    atomic_dec(heap_fwork->refp);
>>>> +    wake_up(&heap_fctl->workwq);
>>>> +
>>>> +    kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
>>>> +}
>>>> +
>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
>>>> +{
>>>> +    struct dma_heap_file_work *heap_fwork = 
>>>> init_file_work(heap_ftask);
>>>> +    struct page *last = NULL;
>>>> +    struct dma_heap_file *heap_file = heap_ftask->heap_file;
>>>> +    size_t start = heap_ftask->roffset;
>>>> +    struct file *file = heap_file->file;
>>>> +    size_t fsz = heap_file->fsz;
>>>> +
>>>> +    if (unlikely(!heap_fwork))
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /**
>>>> +     * If file size is not page aligned, direct io can't process 
>>>> the tail.
>>>> +     * So, if reach to tail, remain the last page use buffer read.
>>>> +     */
>>>> +    if (heap_file->direct && start + heap_ftask->rsize > fsz) {
>>>> +        heap_fwork->need_size -= PAGE_SIZE;
>>>> +        last = heap_ftask->parray[heap_ftask->pindex - 1];
>>>> +    }
>>>> +
>>>> +    spin_lock(&heap_fctl->lock);
>>>> +    list_add_tail(&heap_fwork->list, &heap_fctl->works);
>>>> +    spin_unlock(&heap_fctl->lock);
>>>> +    atomic_inc(&heap_fctl->nr_work);
>>>> +
>>>> +    wake_up(&heap_fctl->threadwq);
>>>> +
>>>> +    if (last) {
>>>> +        char *buf, *pathp;
>>>> +        ssize_t err;
>>>> +        void *buffer;
>>>> +
>>>> +        buf = kmalloc(PATH_MAX, GFP_KERNEL);
>>>> +        if (unlikely(!buf))
>>>> +            return -ENOMEM;
>>>> +
>>>> +        start = PAGE_ALIGN_DOWN(fsz);
>>>> +
>>>> +        pathp = file_path(file, buf, PATH_MAX);
>>>> +        if (IS_ERR(pathp)) {
>>>> +            kfree(buf);
>>>> +            return PTR_ERR(pathp);
>>>> +        }
>>>> +
>>>> +        buffer = kmap_local_page(last); // use page's kaddr.
>>>> +        err = kernel_read_file_from_path(pathp, start, &buffer,
>>>> +                         fsz - start, &fsz,
>>>> +                         READING_POLICY);
>>>> +        kunmap_local(buffer);
>>>> +        kfree(buf);
>>>> +        if (err < 0) {
>>>> +            pr_err("failed to use buffer kernel_read_file %s, 
>>>> err=%ld, [%ld, %ld], f_sz=%ld\n",
>>>> +                   pathp, err, start, fsz, fsz);
>>>> +
>>>> +            return err;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    heap_ftask->roffset += heap_ftask->rsize;
>>>> +    heap_ftask->rsize = 0;
>>>> +    heap_ftask->pindex = 0;
>>>> +    heap_ftask->rbatch = min_t(size_t,
>>>> +                   PAGE_ALIGN(fsz) - heap_ftask->roffset,
>>>> +                   heap_ftask->rbatch);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    wait_event_freezable(heap_fctl->workwq,
>>>> +                 atomic_read(&heap_ftask->ref) == 0);
>>>> +    return heap_ftask->fail;
>>>> +}
>>>> +
>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>> *heap_ftask)
>>>> +{
>>>> +    bool fail;
>>>> +
>>>> +    dma_heap_wait_for_file_read(heap_ftask);
>>>> +    fail = heap_ftask->fail;
>>>> +    kvfree(heap_ftask->parray);
>>>> +    kfree(heap_ftask);
>>>> +    return fail;
>>>> +}
>>>> +
>>>> +struct dma_heap_file_task *
>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    struct dma_heap_file_task *heap_ftask =
>>>> +        kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
>>>> +    if (unlikely(!heap_ftask))
>>>> +        return NULL;
>>>> +
>>>> +    /**
>>>> +     * Batch is the maximum size which we prepare work will meet.
>>>> +     * So, direct alloc this number's page array is OK.
>>>> +     */
>>>> +    heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> 
>>>> PAGE_SHIFT,
>>>> +                        sizeof(struct page *), GFP_KERNEL);
>>>> +    if (unlikely(!heap_ftask->parray))
>>>> +        goto put;
>>>> +
>>>> +    heap_ftask->heap_file = heap_file;
>>>> +    heap_ftask->rbatch = heap_file->max_batch;
>>>> +    return heap_ftask;
>>>> +put:
>>>> +    kfree(heap_ftask);
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static void __work_this_io(struct dma_heap_file_work *heap_fwork)
>>>> +{
>>>> +    struct dma_heap_file *heap_file = heap_fwork->heap_file;
>>>> +    struct file *file = heap_file->file;
>>>> +    ssize_t start = heap_fwork->start_size;
>>>> +    ssize_t size = heap_fwork->need_size;
>>>> +    void *buffer = heap_fwork->vaddr;
>>>> +    const struct cred *old_cred;
>>>> +    ssize_t err;
>>>> +
>>>> +    // use real task's cred to read this file.
>>>> +    old_cred = override_creds(heap_file->cred);
>>>> +    err = kernel_read_file(file, start, &buffer, size, 
>>>> &heap_file->fsz,
>>>> +                   READING_POLICY);
>>>> +    if (err < 0) {
>>>> +        pr_err("use kernel_read_file, err=%ld, [%ld, %ld], 
>>>> f_sz=%ld\n",
>>>> +               err, start, (start + size), heap_file->fsz);
>>>> +        WRITE_ONCE(*heap_fwork->failp, true);
>>>> +    }
>>>> +    // recovery to my cred.
>>>> +    revert_creds(old_cred);
>>>> +}
>>>> +
>>>> +static int dma_heap_file_control_thread(void *data)
>>>> +{
>>>> +    struct dma_heap_file_control *heap_fctl =
>>>> +        (struct dma_heap_file_control *)data;
>>>> +    struct dma_heap_file_work *worker, *tmp;
>>>> +    int nr_work;
>>>> +
>>>> +    LIST_HEAD(pages);
>>>> +    LIST_HEAD(workers);
>>>> +
>>>> +    while (true) {
>>>> +        wait_event_freezable(heap_fctl->threadwq,
>>>> + atomic_read(&heap_fctl->nr_work) > 0);
>>>> +recheck:
>>>> +        spin_lock(&heap_fctl->lock);
>>>> +        list_splice_init(&heap_fctl->works, &workers);
>>>> +        spin_unlock(&heap_fctl->lock);
>>>> +
>>>> +        if (unlikely(kthread_should_stop())) {
>>>> +            list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>> +                list_del(&worker->list);
>>>> +                destroy_file_work(worker);
>>>> +            }
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        nr_work = 0;
>>>> +        list_for_each_entry_safe(worker, tmp, &workers, list) {
>>>> +            ++nr_work;
>>>> +            list_del(&worker->list);
>>>> +            __work_this_io(worker);
>>>> +
>>>> +            destroy_file_work(worker);
>>>> +        }
>>>> +        atomic_sub(nr_work, &heap_fctl->nr_work);
>>>> +
>>>> +        if (atomic_read(&heap_fctl->nr_work) > 0)
>>>> +            goto recheck;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    return heap_file->fsz;
>>>> +}
>>>> +
>>>> +static int prepare_dma_heap_file(struct dma_heap_file *heap_file, 
>>>> int file_fd,
>>>> +                 size_t batch)
>>>> +{
>>>> +    struct file *file;
>>>> +    size_t fsz;
>>>> +    int ret;
>>>> +
>>>> +    file = fget(file_fd);
>>>> +    if (!file)
>>>> +        return -EINVAL;
>>>> +
>>>> +    fsz = i_size_read(file_inode(file));
>>>> +    if (fsz < batch) {
>>>> +        ret = -EINVAL;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /**
>>>> +     * Selinux block our read, but actually we are reading the 
>>>> stand-in
>>>> +     * for this file.
>>>> +     * So save current's cred and when going to read, override 
>>>> mine, and
>>>> +     * end of read, revert.
>>>> +     */
>>>> +    heap_file->cred = prepare_kernel_cred(current);
>>>> +    if (unlikely(!heap_file->cred)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    heap_file->file = file;
>>>> +    heap_file->max_batch = batch;
>>>> +    heap_file->fsz = fsz;
>>>> +
>>>> +    heap_file->direct = file->f_flags & O_DIRECT;
>>>> +
>>>> +#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
>>>> +    if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
>>>> +        pr_warn("alloc read file better to use O_DIRECT to read 
>>>> larget file\n");
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err:
>>>> +    fput(file);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
>>>> +{
>>>> +    fput(heap_file->file);
>>>> +    put_cred(heap_file->cred);
>>>> +}
>>>> +
>>>> +static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, 
>>>> int file_fd,
>>>> +                       size_t batch, unsigned int fd_flags,
>>>> +                       unsigned int heap_flags)
>>>> +{
>>>> +    struct dma_buf *dmabuf;
>>>> +    int fd;
>>>> +    struct dma_heap_file heap_file;
>>>> +
>>>> +    fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
>>>> +    if (fd)
>>>> +        goto error_file;
>>>> +
>>>> +    dmabuf = heap->ops->allocate_read_file(heap, &heap_file, 
>>>> fd_flags,
>>>> +                           heap_flags);
>>>> +    if (IS_ERR(dmabuf)) {
>>>> +        fd = PTR_ERR(dmabuf);
>>>> +        goto error;
>>>> +    }
>>>> +
>>>> +    fd = dma_buf_fd(dmabuf, fd_flags);
>>>> +    if (fd < 0) {
>>>> +        dma_buf_put(dmabuf);
>>>> +        /* just return, as put will call release and that will 
>>>> free */
>>>> +    }
>>>> +
>>>> +error:
>>>> +    destroy_dma_heap_file(&heap_file);
>>>> +error_file:
>>>> +    return fd;
>>>> +}
>>>> +
>>>>    static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>>                     u32 fd_flags,
>>>>                     u64 heap_flags)
>>>> @@ -93,6 +545,38 @@ static int dma_heap_open(struct inode *inode, 
>>>> struct file *file)
>>>>        return 0;
>>>>    }
>>>> +static long dma_heap_ioctl_allocate_read_file(struct file *file, 
>>>> void *data)
>>>> +{
>>>> +    struct dma_heap_allocation_file_data *heap_allocation_file = 
>>>> data;
>>>> +    struct dma_heap *heap = file->private_data;
>>>> +    int fd;
>>>> +
>>>> +    if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (heap_allocation_file->heap_flags & 
>>>> ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (!heap->ops->allocate_read_file)
>>>> +        return -EINVAL;
>>>> +
>>>> +    fd = dma_heap_buffer_alloc_read_file(
>>>> +        heap, heap_allocation_file->file_fd,
>>>> +        heap_allocation_file->batch ?
>>>> +            PAGE_ALIGN(heap_allocation_file->batch) :
>>>> +            DEFAULT_ADI_BATCH,
>>>> +        heap_allocation_file->fd_flags,
>>>> +        heap_allocation_file->heap_flags);
>>>> +    if (fd < 0)
>>>> +        return fd;
>>>> +
>>>> +    heap_allocation_file->fd = fd;
>>>> +    return 0;
>>>> +}
>>>> +
>>>>    static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>    {
>>>>        struct dma_heap_allocation_data *heap_allocation = data;
>>>> @@ -121,6 +605,7 @@ static long dma_heap_ioctl_allocate(struct file 
>>>> *file, void *data)
>>>>    static unsigned int dma_heap_ioctl_cmds[] = {
>>>>        DMA_HEAP_IOCTL_ALLOC,
>>>> +    DMA_HEAP_IOCTL_ALLOC_AND_READ,
>>>>    };
>>>>    static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
>>>> @@ -170,6 +655,9 @@ static long dma_heap_ioctl(struct file *file, 
>>>> unsigned int ucmd,
>>>>        case DMA_HEAP_IOCTL_ALLOC:
>>>>            ret = dma_heap_ioctl_allocate(file, kdata);
>>>>            break;
>>>> +    case DMA_HEAP_IOCTL_ALLOC_AND_READ:
>>>> +        ret = dma_heap_ioctl_allocate_read_file(file, kdata);
>>>> +        break;
>>>>        default:
>>>>            ret = -ENOTTY;
>>>>            goto err;
>>>> @@ -316,11 +804,44 @@ static int dma_heap_init(void)
>>>>        dma_heap_class = class_create(DEVNAME);
>>>>        if (IS_ERR(dma_heap_class)) {
>>>> -        unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>> -        return PTR_ERR(dma_heap_class);
>>>> +        ret = PTR_ERR(dma_heap_class);
>>>> +        goto fail_class;
>>>>        }
>>>>        dma_heap_class->devnode = dma_heap_devnode;
>>>> +    heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
>>>> +    if (unlikely(!heap_fctl)) {
>>>> +        ret =  -ENOMEM;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +
>>>> +    INIT_LIST_HEAD(&heap_fctl->works);
>>>> +    init_waitqueue_head(&heap_fctl->threadwq);
>>>> +    init_waitqueue_head(&heap_fctl->workwq);
>>>> +
>>>> +    heap_fctl->work_thread = 
>>>> kthread_run(dma_heap_file_control_thread,
>>>> +                         heap_fctl, "heap_fwork_t");
>>>> +    if (IS_ERR(heap_fctl->work_thread)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto fail_thread;
>>>> +    }
>>>> +
>>>> +    heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
>>>> +    if (unlikely(!heap_fctl->heap_fwork_cachep)) {
>>>> +        ret = -ENOMEM;
>>>> +        goto fail_cache;
>>>> +    }
>>>> +
>>>>        return 0;
>>>> +
>>>> +fail_cache:
>>>> +    kthread_stop(heap_fctl->work_thread);
>>>> +fail_thread:
>>>> +    kfree(heap_fctl);
>>>> +fail_alloc:
>>>> +    class_destroy(dma_heap_class);
>>>> +fail_class:
>>>> +    unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
>>>> +    return ret;
>>>>    }
>>>>    subsys_initcall(dma_heap_init);
>>>> diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
>>>> index 064bad725061..9c25383f816c 100644
>>>> --- a/include/linux/dma-heap.h
>>>> +++ b/include/linux/dma-heap.h
>>>> @@ -12,12 +12,17 @@
>>>>    #include <linux/cdev.h>
>>>>    #include <linux/types.h>
>>>> +#define DEFAULT_ADI_BATCH (128 << 20)
>>>> +
>>>>    struct dma_heap;
>>>> +struct dma_heap_file_task;
>>>> +struct dma_heap_file;
>>>>    /**
>>>>     * struct dma_heap_ops - ops to operate on a given heap
>>>>     * @allocate:        allocate dmabuf and return struct dma_buf ptr
>>>> - *
>>>> + * @allocate_read_file: allocate dmabuf and read file, then return 
>>>> struct
>>>> + * dma_buf ptr.
>>>>     * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
>>>>     */
>>>>    struct dma_heap_ops {
>>>> @@ -25,6 +30,11 @@ struct dma_heap_ops {
>>>>                        unsigned long len,
>>>>                        u32 fd_flags,
>>>>                        u64 heap_flags);
>>>> +
>>>> +    struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
>>>> +                          struct dma_heap_file *heap_file,
>>>> +                          u32 fd_flags,
>>>> +                          u64 heap_flags);
>>>>    };
>>>>    /**
>>>> @@ -65,4 +75,49 @@ const char *dma_heap_get_name(struct dma_heap 
>>>> *heap);
>>>>     */
>>>>    struct dma_heap *dma_heap_add(const struct dma_heap_export_info 
>>>> *exp_info);
>>>> +/**
>>>> + * dma_heap_destroy_file_read - waits for a file read to complete 
>>>> then destroy it
>>>> + * Returns: true if the file read failed, false otherwise
>>>> + */
>>>> +bool dma_heap_destroy_file_read(struct dma_heap_file_task 
>>>> *heap_ftask);
>>>> +
>>>> +/**
>>>> + * dma_heap_wait_for_file_read - waits for a file read to complete
>>>> + * Returns: true if the file read failed, false otherwise
>>>> + */
>>>> +bool dma_heap_wait_for_file_read(struct dma_heap_file_task 
>>>> *heap_ftask);
>>>> +
>>>> +/**
>>>> + * dma_heap_alloc_file_read - Declare a task to read file when 
>>>> allocate pages.
>>>> + * @heap_file:        target file to read
>>>> + *
>>>> + * Return NULL if failed, otherwise return a struct pointer.
>>>> + */
>>>> +struct dma_heap_file_task *
>>>> +dma_heap_declare_file_read(struct dma_heap_file *heap_file);
>>>> +
>>>> +/**
>>>> + * dma_heap_prepare_file_read - cache each allocated page until we 
>>>> meet this batch.
>>>> + * @heap_ftask:        prepared and need to commit's work.
>>>> + * @page:        current allocated page. don't care which order.
>>>> + *
>>>> + * Returns true if reach to batch, false so go on prepare.
>>>> + */
>>>> +bool dma_heap_prepare_file_read(struct dma_heap_file_task 
>>>> *heap_ftask,
>>>> +                struct page *page);
>>>> +
>>>> +/**
>>>> + * dma_heap_commit_file_read -  prepare collect enough memory, 
>>>> going to trigger IO
>>>> + * @heap_ftask:            info that current IO needs
>>>> + *
>>>> + * This commit will also check if reach to tail read.
>>>> + * For direct I/O submissions, it is necessary to pay attention to 
>>>> file reads
>>>> + * that are not page-aligned. For the unaligned portion of the 
>>>> read, buffer IO
>>>> + * needs to be triggered.
>>>> + * Returns:
>>>> + *   0 if all right, -errno if something wrong
>>>> + */
>>>> +int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
>>>> +size_t dma_heap_file_size(struct dma_heap_file *heap_file);
>>>> +
>>>>    #endif /* _DMA_HEAPS_H */
>>>> diff --git a/include/uapi/linux/dma-heap.h 
>>>> b/include/uapi/linux/dma-heap.h
>>>> index a4cf716a49fa..8c20e8b74eed 100644
>>>> --- a/include/uapi/linux/dma-heap.h
>>>> +++ b/include/uapi/linux/dma-heap.h
>>>> @@ -39,6 +39,27 @@ struct dma_heap_allocation_data {
>>>>        __u64 heap_flags;
>>>>    };
>>>> +/**
>>>> + * struct dma_heap_allocation_file_data - metadata passed from 
>>>> userspace for
>>>> + *                                      allocations and read file
>>>> + * @fd:            will be populated with a fd which provides the
>>>> + *            handle to the allocated dma-buf
>>>> + * @file_fd:        file descriptor to read from(suggested to use 
>>>> O_DIRECT open file)
>>>> + * @batch:        how many memory alloced then file read(bytes), 
>>>> default 128MB
>>>> + *            will auto aligned to PAGE_SIZE
>>>> + * @fd_flags:        file descriptor flags used when allocating
>>>> + * @heap_flags:        flags passed to heap
>>>> + *
>>>> + * Provided by userspace as an argument to the ioctl
>>>> + */
>>>> +struct dma_heap_allocation_file_data {
>>>> +    __u32 fd;
>>>> +    __u32 file_fd;
>>>> +    __u32 batch;
>>>> +    __u32 fd_flags;
>>>> +    __u64 heap_flags;
>>>> +};
>>>> +
>>>>    #define DMA_HEAP_IOC_MAGIC        'H'
>>>>    /**
>>>> @@ -50,4 +71,15 @@ struct dma_heap_allocation_data {
>>>>    #define DMA_HEAP_IOCTL_ALLOC    _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
>>>>                          struct dma_heap_allocation_data)
>>>> +/**
>>>> + * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool 
>>>> and both
>>>> + *                    read file when allocate memory.
>>>> + *
>>>> + * Takes a dma_heap_allocation_file_data struct and returns it 
>>>> with the fd field
>>>> + * populated with the dmabuf handle of the allocation. When 
>>>> return, the dma-buf
>>>> + * content is read from file.
>>>> + */
>>>> +#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
>>>> +    _IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct 
>>>> dma_heap_allocation_file_data)
>>>> +
>>>>    #endif /* _UAPI_LINUX_DMABUF_POOL_H */
>
Daniel Vetter July 16, 2024, 9:31 a.m. UTC | #16
On Tue, Jul 16, 2024 at 10:48:40AM +0800, Huan Yang wrote:
> I just research the udmabuf, Please correct me if I'm wrong.
> 
> 在 2024/7/15 20:32, Christian König 写道:
> > Am 15.07.24 um 11:11 schrieb Daniel Vetter:
> > > On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
> > > > Am 11.07.24 um 09:42 schrieb Huan Yang:
> > > > > Some user may need load file into dma-buf, current
> > > > > way is:
> > > > >     1. allocate a dma-buf, get dma-buf fd
> > > > >     2. mmap dma-buf fd into vaddr
> > > > >     3. read(file_fd, vaddr, fsz)
> > > > > This is too heavy if fsz reached to GB.
> > > > You need to describe a bit more why that is to heavy. I can only
> > > > assume you
> > > > need to save memory bandwidth and avoid the extra copy with the CPU.
> > > > 
> > > > > This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
> > > > > User need to offer a file_fd which you want to load into
> > > > > dma-buf, then,
> > > > > it promise if you got a dma-buf fd, it will contains the file content.
> > > > Interesting idea, that has at least more potential than trying
> > > > to enable
> > > > direct I/O on mmap()ed DMA-bufs.
> > > > 
> > > > The approach with the new IOCTL might not work because it is a very
> > > > specialized use case.
> > > > 
> > > > But IIRC there was a copy_file_range callback in the file_operations
> > > > structure you could use for that. I'm just not sure when and how
> > > > that's used
> > > > with the copy_file_range() system call.
> > > I'm not sure any of those help, because internally they're all still
> > > based
> > > on struct page (or maybe in the future on folios). And that's the thing
> > > dma-buf can't give you, at least without peaking behind the curtain.
> > > 
> > > I think an entirely different option would be malloc+udmabuf. That
> > > essentially handles the impendence-mismatch between direct I/O and
> > > dma-buf
> > > on the dma-buf side. The downside is that it'll make the permanently
> > > pinned memory accounting and tracking issues even more apparent, but I
> > > guess eventually we do need to sort that one out.
> > 
> > Oh, very good idea!
> > Just one minor correction: it's not malloc+udmabuf, but rather
> > create_memfd()+udmabuf.

Hm right, it's create_memfd() + mmap(memfd) + udmabuf

> > And you need to complete your direct I/O before creating the udmabuf
> > since that reference will prevent direct I/O from working.
> 
> udmabuf will pin all pages, so, if returned fd, can't trigger direct I/O
> (same as dmabuf). So, must complete read before pin it.

Why does pinning prevent direct I/O? I haven't tested, but I'd expect the
rdma folks would be really annoyed if that's the case ...

> But current way is use `memfd_pin_folios` to boost alloc and pin, so maybe
> need suit it.
> 
> 
> I currently doubt that the udmabuf solution is suitable for our
> gigabyte-level read operations.
> 
> 1. The current mmap operation uses faulting, so frequent page faults will be
> triggered during reads, resulting in a lot of context switching overhead.
> 
> 2. current udmabuf size limit is 64MB, even can change, maybe not good to
> use in large size?

Yeah that's just a fig leaf so we don't have to bother about the accounting
issue.

> 3. The migration and adaptation of the driver is also a challenge, and
> currently, we are unable to control it.

Why does a udmabuf fd not work instead of any other dmabuf fd? That
shouldn't matter for the consuming driver ...

> Perhaps implementing `copy_file_range` would be more suitable for us.

See my other mail, fundamentally these all rely on struct page being
present, and dma-buf doesn't give you that. Which means you need to go
below the dma-buf abstraction. And udmabuf is pretty much the thing for
that, because it wraps normal struct page memory into a dmabuf.

And copy_file_range on the underlying memfd might already work, I haven't
checked though.

Cheers, Sima
Huan Yang July 16, 2024, 10:14 a.m. UTC | #17
在 2024/7/16 17:31, Daniel Vetter 写道:
> [你通常不会收到来自 daniel.vetter@ffwll.ch 的电子邮件。请访问 https://aka.ms/LearnAboutSenderIdentification,以了解这一点为什么很重要]
>
> On Tue, Jul 16, 2024 at 10:48:40AM +0800, Huan Yang wrote:
>> I just research the udmabuf, Please correct me if I'm wrong.
>>
>> 在 2024/7/15 20:32, Christian König 写道:
>>> Am 15.07.24 um 11:11 schrieb Daniel Vetter:
>>>> On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>> Some user may need load file into dma-buf, current
>>>>>> way is:
>>>>>>      1. allocate a dma-buf, get dma-buf fd
>>>>>>      2. mmap dma-buf fd into vaddr
>>>>>>      3. read(file_fd, vaddr, fsz)
>>>>>> This is too heavy if fsz reached to GB.
>>>>> You need to describe a bit more why that is to heavy. I can only
>>>>> assume you
>>>>> need to save memory bandwidth and avoid the extra copy with the CPU.
>>>>>
>>>>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>> User need to offer a file_fd which you want to load into
>>>>>> dma-buf, then,
>>>>>> it promise if you got a dma-buf fd, it will contains the file content.
>>>>> Interesting idea, that has at least more potential than trying
>>>>> to enable
>>>>> direct I/O on mmap()ed DMA-bufs.
>>>>>
>>>>> The approach with the new IOCTL might not work because it is a very
>>>>> specialized use case.
>>>>>
>>>>> But IIRC there was a copy_file_range callback in the file_operations
>>>>> structure you could use for that. I'm just not sure when and how
>>>>> that's used
>>>>> with the copy_file_range() system call.
>>>> I'm not sure any of those help, because internally they're all still
>>>> based
>>>> on struct page (or maybe in the future on folios). And that's the thing
>>>> dma-buf can't give you, at least without peaking behind the curtain.
>>>>
>>>> I think an entirely different option would be malloc+udmabuf. That
>>>> essentially handles the impendence-mismatch between direct I/O and
>>>> dma-buf
>>>> on the dma-buf side. The downside is that it'll make the permanently
>>>> pinned memory accounting and tracking issues even more apparent, but I
>>>> guess eventually we do need to sort that one out.
>>> Oh, very good idea!
>>> Just one minor correction: it's not malloc+udmabuf, but rather
>>> create_memfd()+udmabuf.
> Hm right, it's create_memfd() + mmap(memfd) + udmabuf
>
>>> And you need to complete your direct I/O before creating the udmabuf
>>> since that reference will prevent direct I/O from working.
>> udmabuf will pin all pages, so, if returned fd, can't trigger direct I/O
>> (same as dmabuf). So, must complete read before pin it.
> Why does pinning prevent direct I/O? I haven't tested, but I'd expect the
> rdma folks would be really annoyed if that's the case ...
>
>> But current way is use `memfd_pin_folios` to boost alloc and pin, so maybe
>> need suit it.
>>
>>
>> I currently doubt that the udmabuf solution is suitable for our
>> gigabyte-level read operations.
>>
>> 1. The current mmap operation uses faulting, so frequent page faults will be
>> triggered during reads, resulting in a lot of context switching overhead.
>>
>> 2. current udmabuf size limit is 64MB, even can change, maybe not good to
>> use in large size?
> Yeah that's just a figleaf so we don't have to bother about the accounting
> issue.
>
>> 3. The migration and adaptation of the driver is also a challenge, and
>> currently, we are unable to control it.
> Why does a udmabuf fd not work instead of any other dmabuf fd? That
> shouldn't matter for the consuming driver ...

Hmm, the drivers in our products are provided by other OEMs. I see that many
of them implement their own dma_buf_ops. These may not be generic, and may
require reimplementation on their side.

>
>> Perhaps implementing `copy_file_range` would be more suitable for us.
> See my other mail, fundamentally these all rely on struct page being
> present, and dma-buf doesn't give you that. Which means you need to go
> below the dma-buf abstraction. And udmabuf is pretty much the thing for
> that, because it wraps normal struct page memory into a dmabuf.
Yes, udmabuf gives us this. I am very interested in whether the pages
provided by udmabuf can be used for direct I/O.

So, I'll test it and report back soon.
>
> And copy_file_range on the underlying memfd might already work, I haven't
> checked though.

I have doubts.

I recently tested this and found that I needed to modify many places in
vfs_copy_file_range in order to run copy_file_range with a DMA_BUF fd.
(I have managed to get it working, but I don't think the implementation
is good enough, so I can't provide the source code.)

Maybe it works with memfd, maybe not; let's test it. :)

Anyway, it's a good idea too. For now I need to focus on whether it
can be achieved, and on the performance comparison.

>
> Cheers, Sima
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch/
Huan Yang July 17, 2024, 7:33 a.m. UTC | #18
在 2024/7/16 20:07, Christian König 写道:
> Am 16.07.24 um 11:31 schrieb Daniel Vetter:
>> On Tue, Jul 16, 2024 at 10:48:40AM +0800, Huan Yang wrote:
>>> I just research the udmabuf, Please correct me if I'm wrong.
>>>
>>> 在 2024/7/15 20:32, Christian König 写道:
>>>> Am 15.07.24 um 11:11 schrieb Daniel Vetter:
>>>>> On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
>>>>>> Am 11.07.24 um 09:42 schrieb Huan Yang:
>>>>>>> Some user may need load file into dma-buf, current
>>>>>>> way is:
>>>>>>>      1. allocate a dma-buf, get dma-buf fd
>>>>>>>      2. mmap dma-buf fd into vaddr
>>>>>>>      3. read(file_fd, vaddr, fsz)
>>>>>>> This is too heavy if fsz reached to GB.
>>>>>> You need to describe a bit more why that is to heavy. I can only
>>>>>> assume you
>>>>>> need to save memory bandwidth and avoid the extra copy with the CPU.
>>>>>>
>>>>>>> This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
>>>>>>> User need to offer a file_fd which you want to load into
>>>>>>> dma-buf, then,
>>>>>>> it promise if you got a dma-buf fd, it will contains the file content.
>>>>>> Interesting idea, that has at least more potential than trying
>>>>>> to enable
>>>>>> direct I/O on mmap()ed DMA-bufs.
>>>>>>
>>>>>> The approach with the new IOCTL might not work because it is a very
>>>>>> specialized use case.
>>>>>>
>>>>>> But IIRC there was a copy_file_range callback in the file_operations
>>>>>> structure you could use for that. I'm just not sure when and how
>>>>>> that's used
>>>>>> with the copy_file_range() system call.
>>>>> I'm not sure any of those help, because internally they're all still
>>>>> based
>>>>> on struct page (or maybe in the future on folios). And that's the thing
>>>>> dma-buf can't give you, at least without peaking behind the curtain.
>>>>>
>>>>> I think an entirely different option would be malloc+udmabuf. That
>>>>> essentially handles the impendence-mismatch between direct I/O and
>>>>> dma-buf
>>>>> on the dma-buf side. The downside is that it'll make the permanently
>>>>> pinned memory accounting and tracking issues even more apparent, but I
>>>>> guess eventually we do need to sort that one out.
>>>> Oh, very good idea!
>>>> Just one minor correction: it's not malloc+udmabuf, but rather
>>>> create_memfd()+udmabuf.
>> Hm right, it's create_memfd() + mmap(memfd) + udmabuf
>>
>>>> And you need to complete your direct I/O before creating the udmabuf
>>>> since that reference will prevent direct I/O from working.
>>> udmabuf will pin all pages, so, if returned fd, can't trigger direct I/O
>>> (same as dmabuf). So, must complete read before pin it.
>> Why does pinning prevent direct I/O? I haven't tested, but I'd expect the
>> rdma folks would be really annoyed if that's the case ...

I used to believe that a pinned page could not be pinned again, so performing
direct I/O on it would fail. But I misunderstood; pinning doesn't have
any impact.

A dma-buf mmap()ed vaddr can't be used for direct I/O because the underlying
kernel pages (PFN-mapped) can't be pinned, so the two cases are not the same.


>
> Pinning (or rather taking another page reference) prevents writes from 
> using direct I/O because writes try to find all references and make 
> them read only so that nobody modifies the content while the write is 
> done.
>
> As far as I know the same approach is used for NUMA migration and 
> replacing small pages with big ones in THP. But for the read case here 
> it should still work.

Hmm, in my udmabuf direct I/O test, I found that this does not affect it. The
test code is at the end of this email. Maybe pinning only keeps the page from
being reclaimed, rather than preventing the write?

In my test, reading a 3GB file into a udmabuf with direct I/O takes 2.2s on
average (I used ftrace to trace f2fs_direct_IO to make sure direct I/O was
actually triggered), which is the same as my normal cached file read cost.

My patchset averages 1.2s. The difference between the two is obvious.

>
>>> But current way is use `memfd_pin_folios` to boost alloc and pin, so maybe
>>> need suit it.
>>>
>>>
>>> I currently doubt that the udmabuf solution is suitable for our
>>> gigabyte-level read operations.
>>>
>>> 1. The current mmap operation uses faulting, so frequent page faults will be
>>> triggered during reads, resulting in a lot of context switching overhead.
>>>
>>> 2. current udmabuf size limit is 64MB, even can change, maybe not good to
>>> use in large size?
>> Yeah that's just a figleaf so we don't have to bother about the accounting
>> issue.
>>
>>> 3. The migration and adaptation of the driver is also a challenge, and
>>> currently, we are unable to control it.
>> Why does a udmabuf fd not work instead of any other dmabuf fd? That
>> shouldn't matter for the consuming driver ...
>>
>>> Perhaps implementing `copy_file_range` would be more suitable for us.
>> See my other mail, fundamentally these all rely on struct page being
>> present, and dma-buf doesn't give you that. Which means you need to go
>> below the dma-buf abstraction. And udmabuf is pretty much the thing for
>> that, because it wraps normal struct page memory into a dmabuf.
>>
>> And copy_file_range on the underlying memfd might already work, I haven't
>> checked though.
>
> Yeah completely agree.
>
> Regards,
> Christian.
>
>> Cheers, Sima
>
Test code follows. To test with more than 2GB, this patch is needed:

https://lore.kernel.org/all/20240717065444.369876-1-link@vivo.com/

```c

// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#define __EXPORTED_HEADERS__

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <dirent.h>
#include <malloc.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <time.h>
#include <linux/memfd.h>
#include <linux/udmabuf.h>

#define TEST_PREFIX    "drivers/dma-buf/udmabuf"

// static int memfd_create(const char *name, unsigned int flags)
// {
//     return syscall(__NR_memfd_create, name, flags);
// }

int main(void)
{
     struct udmabuf_create create;
     int devfd, memfd, buf, ret;
     unsigned long size;
         int filefd;
         struct timespec ts_start, ts_end;
     long long start, end;

         clock_gettime(CLOCK_MONOTONIC, &ts_start);

     devfd = open("/dev/udmabuf", O_RDWR);
     if (devfd < 0) {
         printf("%s: [skip,no-udmabuf: Unable to access DMA buffer 
device file]\n",
                TEST_PREFIX);
         exit(77);
     }

     memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
     if (memfd < 0) {
         printf("%s: [skip,no-memfd]\n", TEST_PREFIX);
         exit(77);
     }

     ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);
     if (ret < 0) {
         printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
         exit(77);
     }

         filefd = open("/data/model.txt", O_RDONLY | O_DIRECT);
         if (filefd < 0) {
                 printf("%s: [failed to open model.txt]\n", TEST_PREFIX);
                 exit(77);
         }

         struct stat ftat;
         fstat(filefd, &ftat);
         size = (ftat.st_size + getpagesize()) & ~(getpagesize());

     ret = ftruncate(memfd, size);
     if (ret == -1) {
         printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
         exit(1);
     }

     memset(&create, 0, sizeof(create));

     /* should work */
     create.memfd  = memfd;
     create.offset = 0;
     create.size   = size;
     buf = ioctl(devfd, UDMABUF_CREATE, &create);
     if (buf < 0) {
         printf("%s: [FAIL,test-4]\n", TEST_PREFIX);
         exit(1);
     }

     // fprintf(stderr, "%s: ok\n", TEST_PREFIX);

         void *vaddr = mmap(NULL, size, PROT_WRITE | PROT_READ,
                          MAP_SHARED, memfd, 0);
         if (!vaddr) {
                 printf("%s: [FAIL, mmap]\n", TEST_PREFIX);
                 exit(77);
         }

         unsigned long rsize = size;
         unsigned long bytes = 0;
         while (bytes != size) {
                 ssize_t rb = read(filefd, vaddr, rsize);
                 if (rb < 0) {
                         printf("%s: [FAIL, read]\n", TEST_PREFIX);
                         exit(77);
                 }
                 rsize -= rb;
                 bytes += rb;

         }
         munmap(vaddr, size);
         clock_gettime(CLOCK_MONOTONIC, &ts_end);

#define NSEC_PER_SEC 1000000000LL
         start = ts_start.tv_sec * NSEC_PER_SEC + ts_start.tv_nsec;
         end = ts_end.tv_sec * NSEC_PER_SEC + ts_end.tv_nsec;

         printf("total cost %lld ns\n", end - start);

         printf("going to check content\n");
         void *fvaddr = mmap(NULL, size, PROT_READ, MAP_SHARED, filefd, 0);
         if (!fvaddr) {
                 printf("%s: [FAIL, mmap file]\n", TEST_PREFIX);
                 exit(77);
         }
         vaddr = mmap(NULL, size, PROT_READ, MAP_SHARED, buf, 0);
         if (!vaddr) {
                 printf("%s: [FAIL, mmap dmabuf]\n", TEST_PREFIX);
                 exit(77);
         }

         if (memcmp(fvaddr, vaddr, size) != 0) {
                 printf("%s: [FAIL, content is not same]\n", TEST_PREFIX);
                 exit(77);
         }

         printf("%s: [SUCCESS, content is same]\n", TEST_PREFIX);
         munmap(vaddr, size);
         munmap(fvaddr, size);
         close(filefd);
     close(buf);
     close(memfd);
     close(devfd);
     return 0;
}

```
Daniel Vetter July 17, 2024, 3:15 p.m. UTC | #19
On Tue, Jul 16, 2024 at 06:14:48PM +0800, Huan Yang wrote:
> 
> 在 2024/7/16 17:31, Daniel Vetter 写道:
> > [你通常不会收到来自 daniel.vetter@ffwll.ch 的电子邮件。请访问 https://aka.ms/LearnAboutSenderIdentification,以了解这一点为什么很重要]
> > 
> > On Tue, Jul 16, 2024 at 10:48:40AM +0800, Huan Yang wrote:
> > > I just research the udmabuf, Please correct me if I'm wrong.
> > > 
> > > 在 2024/7/15 20:32, Christian König 写道:
> > > > Am 15.07.24 um 11:11 schrieb Daniel Vetter:
> > > > > On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
> > > > > > Am 11.07.24 um 09:42 schrieb Huan Yang:
> > > > > > > Some user may need load file into dma-buf, current
> > > > > > > way is:
> > > > > > >      1. allocate a dma-buf, get dma-buf fd
> > > > > > >      2. mmap dma-buf fd into vaddr
> > > > > > >      3. read(file_fd, vaddr, fsz)
> > > > > > > This is too heavy if fsz reached to GB.
> > > > > > You need to describe a bit more why that is to heavy. I can only
> > > > > > assume you
> > > > > > need to save memory bandwidth and avoid the extra copy with the CPU.
> > > > > > 
> > > > > > > This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
> > > > > > > User need to offer a file_fd which you want to load into
> > > > > > > dma-buf, then,
> > > > > > > it promise if you got a dma-buf fd, it will contains the file content.
> > > > > > Interesting idea, that has at least more potential than trying
> > > > > > to enable
> > > > > > direct I/O on mmap()ed DMA-bufs.
> > > > > > 
> > > > > > The approach with the new IOCTL might not work because it is a very
> > > > > > specialized use case.
> > > > > > 
> > > > > > But IIRC there was a copy_file_range callback in the file_operations
> > > > > > structure you could use for that. I'm just not sure when and how
> > > > > > that's used
> > > > > > with the copy_file_range() system call.
> > > > > I'm not sure any of those help, because internally they're all still
> > > > > based
> > > > > on struct page (or maybe in the future on folios). And that's the thing
> > > > > dma-buf can't give you, at least without peaking behind the curtain.
> > > > > 
> > > > > I think an entirely different option would be malloc+udmabuf. That
> > > > > essentially handles the impendence-mismatch between direct I/O and
> > > > > dma-buf
> > > > > on the dma-buf side. The downside is that it'll make the permanently
> > > > > pinned memory accounting and tracking issues even more apparent, but I
> > > > > guess eventually we do need to sort that one out.
> > > > Oh, very good idea!
> > > > Just one minor correction: it's not malloc+udmabuf, but rather
> > > > create_memfd()+udmabuf.
> > Hm right, it's create_memfd() + mmap(memfd) + udmabuf
> > 
> > > > And you need to complete your direct I/O before creating the udmabuf
> > > > since that reference will prevent direct I/O from working.
> > > udmabuf will pin all pages, so, if returned fd, can't trigger direct I/O
> > > (same as dmabuf). So, must complete read before pin it.
> > Why does pinning prevent direct I/O? I haven't tested, but I'd expect the
> > rdma folks would be really annoyed if that's the case ...
> > 
> > > But current way is use `memfd_pin_folios` to boost alloc and pin, so maybe
> > > need suit it.
> > > 
> > > 
> > > I currently doubt that the udmabuf solution is suitable for our
> > > gigabyte-level read operations.
> > > 
> > > 1. The current mmap operation uses faulting, so frequent page faults will be
> > > triggered during reads, resulting in a lot of context switching overhead.
> > > 
> > > 2. current udmabuf size limit is 64MB, even can change, maybe not good to
> > > use in large size?
> > Yeah that's just a figleaf so we don't have to bother about the accounting
> > issue.
> > 
> > > 3. The migration and adaptation of the driver is also a challenge, and
> > > currently, we are unable to control it.
> > Why does a udmabuf fd not work instead of any other dmabuf fd? That
> > shouldn't matter for the consuming driver ...
> 
> Hmm, our production's driver provider by other oem. I see many of they
> implement
> 
> their own dma_buf_ops.  These may not be generic and may require them to
> reimplement.

Yeah, for exporting a buffer object allocated by that driver. But any
competent gles/vk stack also supports importing dma-buf, and that should
work with udmabuf exactly the same way as with a dma-buf allocated from
the system heap.

> > > Perhaps implementing `copy_file_range` would be more suitable for us.
> > See my other mail, fundamentally these all rely on struct page being
> > present, and dma-buf doesn't give you that. Which means you need to go
> > below the dma-buf abstraction. And udmabuf is pretty much the thing for
> > that, because it wraps normal struct page memory into a dmabuf.
> Yes, udmabuf give this, I am very interested in whether the page provided by
> udmabuf can trigger direct I/O.
> 
> So, I'll give a test and report soon.
> > 
> > And copy_file_range on the underlying memfd might already work, I haven't
> > checked though.
> 
> I have doubts.
> 
> I recently tested and found that I need to modify many places in
> vfs_copy_file_range in order to run the copy file range with DMA_BUF fd.(I
> have managed to get it working,

I'm talking about memfd, not dma-buf here. I think copy_file_range to
dma-buf is as architecturally unsound as allowing O_DIRECT on the dma-buf
mmap.

Cheers, Sima

> but I don't think the implementation is good enough, so I can't provide the
> source code.)
> 
> Maybe memfd can work or not, let's give it a test.:)
> 
> Anyway, it's a good idea too. I currently need to focus on whether it can be
> achieved, as well as the performance comparison.
> 
> > 
> > Cheers, Sima
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch/
Daniel Vetter July 17, 2024, 3:23 p.m. UTC | #20
On Tue, Jul 16, 2024 at 02:07:20PM +0200, Christian König wrote:
> Am 16.07.24 um 11:31 schrieb Daniel Vetter:
> > On Tue, Jul 16, 2024 at 10:48:40AM +0800, Huan Yang wrote:
> > > I just research the udmabuf, Please correct me if I'm wrong.
> > > 
> > > 在 2024/7/15 20:32, Christian König 写道:
> > > > Am 15.07.24 um 11:11 schrieb Daniel Vetter:
> > > > > On Thu, Jul 11, 2024 at 11:00:02AM +0200, Christian König wrote:
> > > > > > Am 11.07.24 um 09:42 schrieb Huan Yang:
> > > > > > > Some user may need load file into dma-buf, current
> > > > > > > way is:
> > > > > > >      1. allocate a dma-buf, get dma-buf fd
> > > > > > >      2. mmap dma-buf fd into vaddr
> > > > > > >      3. read(file_fd, vaddr, fsz)
> > > > > > > This is too heavy if fsz reached to GB.
> > > > > > You need to describe a bit more why that is to heavy. I can only
> > > > > > assume you
> > > > > > need to save memory bandwidth and avoid the extra copy with the CPU.
> > > > > > 
> > > > > > > This patch implement a feature called DMA_HEAP_IOCTL_ALLOC_READ_FILE.
> > > > > > > User need to offer a file_fd which you want to load into
> > > > > > > dma-buf, then,
> > > > > > > it promise if you got a dma-buf fd, it will contains the file content.
> > > > > > Interesting idea, that has at least more potential than trying
> > > > > > to enable
> > > > > > direct I/O on mmap()ed DMA-bufs.
> > > > > > 
> > > > > > The approach with the new IOCTL might not work because it is a very
> > > > > > specialized use case.
> > > > > > 
> > > > > > But IIRC there was a copy_file_range callback in the file_operations
> > > > > > structure you could use for that. I'm just not sure when and how
> > > > > > that's used
> > > > > > with the copy_file_range() system call.
> > > > > I'm not sure any of those help, because internally they're all still
> > > > > based
> > > > > on struct page (or maybe in the future on folios). And that's the thing
> > > > > dma-buf can't give you, at least without peaking behind the curtain.
> > > > > 
> > > > > I think an entirely different option would be malloc+udmabuf. That
> > > > > essentially handles the impendence-mismatch between direct I/O and
> > > > > dma-buf
> > > > > on the dma-buf side. The downside is that it'll make the permanently
> > > > > pinned memory accounting and tracking issues even more apparent, but I
> > > > > guess eventually we do need to sort that one out.
> > > > Oh, very good idea!
> > > > Just one minor correction: it's not malloc+udmabuf, but rather
> > > > create_memfd()+udmabuf.
> > Hm right, it's create_memfd() + mmap(memfd) + udmabuf
> > 
> > > > And you need to complete your direct I/O before creating the udmabuf
> > > > since that reference will prevent direct I/O from working.
> > > udmabuf will pin all pages, so, if returned fd, can't trigger direct I/O
> > > (same as dmabuf). So, must complete read before pin it.
> > Why does pinning prevent direct I/O? I haven't tested, but I'd expect the
> > rdma folks would be really annoyed if that's the case ...
> 
> Pinning (or rather taking another page reference) prevents writes from using
> direct I/O because writes try to find all references and make them read only
> so that nobody modifies the content while the write is done.

Where do you see that happen? That's counter to my understanding of what
pin_user_page() does, which is what direct I/O uses ...

> As far as I know the same approach is used for NUMA migration and replacing
> small pages with big ones in THP. But for the read case here it should still
> work.

Yeah elevated refcount breaks migration, but that's entirely different
from the direct I/O use-case. Count me somewhat confused.
-Sima
Christoph Hellwig July 17, 2024, 5:03 p.m. UTC | #21
On Wed, Jul 17, 2024 at 05:15:07PM +0200, Daniel Vetter wrote:
> I'm talking about memfd, not dma-buf here. I think copy_file_range to
> dma-buf is as architecturally unsound as allowing O_DIRECT on the dma-buf
> mmap.

copy_file_range only works inside the same file system anyway, so
it is completely irrelevant here.

What should work just fine is using sendfile (or splice if you like it
complicated) to write TO the dma buf.  That just iterates over the page
cache on the source file and calls ->write_iter from the page cache
pages.  Of course that requires that you actually implement
->write_iter, but given that dmabufs support mmaping there I can't
see why you should not be able to write to it.

Reading FROM the dma buf in that fashion should also work if you provide
a ->read_iter and wire up ->splice_read to copy_splice_read so that it
doesn't require any page cache.
Huan Yang July 18, 2024, 1:51 a.m. UTC | #22
在 2024/7/18 1:03, Christoph Hellwig 写道:
> [Some people who received this message don't often get email from hch@infradead.org. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On Wed, Jul 17, 2024 at 05:15:07PM +0200, Daniel Vetter wrote:
>> I'm talking about memfd, not dma-buf here. I think copy_file_range to
>> dma-buf is as architecturally unsound as allowing O_DIRECT on the dma-buf
>> mmap.
> copy_file_range only work inside the same file system anyway, so
> it is completely irrelevant here.

Yes, actually, if dma-buf wants to copy_file_range from a file, it needs 
to change several things in vfs_copy_file_range:

1. In generic_file_rw_checks, the dma-buf file is not a regular file, but 
inode_out is checked for that, so the check needs to be bypassed.

2. As you pointed out, file in and file out need to be in the same file 
system, so that needs to be bypassed too.

3. If the dma-buf is above 2G, generic_write_check_limits's O_LARGEFILE 
check needs to be bypassed; it only allows copy ranges below 2G.

I feel that the above limitations indicate that copy_file_range is not 
really suitable for copying between different file systems or 
unconventional file types (both shmemfs and others).

Perhaps enabling dma-buf to support copy_file_range is not a good idea? :)

>
> What should work just fine is using sendfile (or splice if you like it
> complicated) to write TO the dma buf.  That just iterates over the page
OK, I'll research it also.
> cache on the source file and calls ->write_iter from the page cache
> pages.  Of course that requires that you actually implement
> ->write_iter, but given that dmabufs support mmaping there I can't
> see why you should not be able to write to it.
>
> Reading FROM the dma buf in that fashion should also work if you provide
> a ->read_iter wire up ->splice_read to copy_splice_read so that it
> doesn't require any page cache.
>
Christoph Hellwig July 18, 2024, 3:08 a.m. UTC | #23
On Thu, Jul 18, 2024 at 09:51:39AM +0800, Huan Yang wrote:
> Yes, actually, if dma-buf want's to copy_file_range from a file, it need
> change something in vfs_copy_file_range:

No, it doesn't.  copy_file_range is specifically designed to copy inside
a single file system as already mentioned.  The generic offload for
copying between arbitrary FDs is splice and the sendfile convenience
wrapper around it
Huan Yang July 18, 2024, 3:12 a.m. UTC | #24
在 2024/7/18 11:08, Christoph Hellwig 写道:
> [Some people who received this message don't often get email from hch@infradead.org. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On Thu, Jul 18, 2024 at 09:51:39AM +0800, Huan Yang wrote:
>> Yes, actually, if dma-buf want's to copy_file_range from a file, it need
>> change something in vfs_copy_file_range:
> No, it doesn't.  copy_file_range is specifically designed to copy inside
> a single file system as already mentioned.  The generic offload for
OK, got it. Thanks for pointing this out.
> copying between arbitrary FDs is splice and the sendfile convenience
> wrapper around it
>
Huan Yang July 24, 2024, 7:12 a.m. UTC | #25
在 2024/7/18 1:03, Christoph Hellwig 写道:
> copy_file_range only work inside the same file system anyway, so
> it is completely irrelevant here.
>
> What should work just fine is using sendfile (or splice if you like it
> complicated) to write TO the dma buf.  That just iterates over the page
> cache on the source file and calls ->write_iter from the page cache
> pages.  Of course that requires that you actually implement
> ->write_iter, but given that dmabufs support mmaping there I can't
> see why you should not be able to write to it.

Today I tested reading a large file into a dma-buf with sendfile. Here are 
two problems I found when reading a file opened with O_DIRECT.

1. sendfile/splice transfers data between the read and write sides through 
a pipe.
     Even if the read side does not generate page cache, an 
equivalent amount of CPU copy is still required.
     This is particularly noticeable as performance degradation when 
reading large files.

2. Each pipe's max_bytes is 64K (in my phone and arch tests). This means 
that for each I/O, only 64K is read and then copied, resulting in poor I/O 
performance.

Based on observations from testing, it takes an average of 7s to perform 
an O_DIRECT read of a 3GB file. The trace shows a lot of runnable and 
running time, with some I/O in between.

A buffered read of the large file into a dma-buf by sendfile costs 2.3s, 
which is normal.

Maybe this is not a good way to let dma-buf support direct I/O?


>
> Reading FROM the dma buf in that fashion should also work if you provide
> a ->read_iter wire up ->splice_read to copy_splice_read so that it
We currently care more about reading a file into a dma-buf, not writing. :)
> doesn't require any page cache.
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index 2298ca5e112e..abe17281adb8 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -15,9 +15,11 @@ 
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/nospec.h>
+#include <linux/highmem.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/dma-heap.h>
+#include <linux/vmalloc.h>
 #include <uapi/linux/dma-heap.h>
 
 #define DEVNAME "dma_heap"
@@ -43,12 +45,462 @@  struct dma_heap {
 	struct cdev heap_cdev;
 };
 
+/**
+ * struct dma_heap_file - wraps the file to be read during a dma_heap allocation.
+ * @file:		file to read from.
+ *
+ * @cred:		copy of the caller's cred, used by the kthread for the read.
+ *
+ * @max_batch:		maximum batch size to read; once a full batch is collected
+ *			a read is triggered. Default 128MB, must not exceed file size.
+ *
+ * @fsz:		file size.
+ *
+ * @direct:		true if @file was opened with O_DIRECT.
+ */
+struct dma_heap_file {
+	struct file *file;
+	struct cred *cred;
+	size_t max_batch;
+	size_t fsz;
+	bool direct;
+};
+
+/**
+ * struct dma_heap_file_work - one queued unit of dma_heap file read work.
+ * @vaddr:		contiguous virtual address from vmap, the read destination.
+ *
+ * @start_size:		file offset to read from, set from @dma_heap_file_task->roffset.
+ *
+ * @need_size:		number of bytes to read, set from @dma_heap_file_task->rsize.
+ *
+ * @heap_file:		file wrapper.
+ *
+ * @list:		node in the @dma_heap_file_control->works list.
+ *
+ * @refp:		points to @dma_heap_file_task->ref, dropped when the work ends.
+ *
+ * @failp:		points to @dma_heap_file_task->fail, set true if the read fails.
+ */
+struct dma_heap_file_work {
+	void *vaddr;
+	ssize_t start_size;
+	ssize_t need_size;
+	struct dma_heap_file *heap_file;
+	struct list_head list;
+	atomic_t *refp;
+	bool *failp;
+};
+/**
+ * struct dma_heap_file_task - represents one dma_heap file read process.
+ * @ref:		number of file works still in flight; zero means every
+ *			submitted read has finished.
+ *
+ * @roffset:		file offset at which the work currently being prepared
+ *			starts, i.e. where the previous work ended.
+ *
+ * @rsize:		bytes of pages collected so far for the current work; once
+ *			it reaches @rbatch the work should be committed.
+ *
+ * @rbatch:		batch size of the work being prepared, at most
+ *			@dma_heap_file->max_batch.
+ *
+ * @heap_file:		the dma_heap_file being read.
+ *
+ * @parray:		page array handed to vmap, sized for @dma_heap_file's
+ *			max_batch in pages (the maximum). Works are prepared by a
+ *			single thread, so one array is reused for every work.
+ *			Each entry covers PAGE_SIZE, as vmap requires.
+ *
+ * @pindex:		index of the next free slot in @parray.
+ *
+ * @fail:		set if any work failed while reading the file.
+ *
+ * dma_heap_file_task is the producer side of the file read: works are prepared
+ * while the dma_buf pages are allocated, and each time the current batch is
+ * reached the work is committed and the next one is prepared. After all batches
+ * are queued the caller continues setting up the dma_buf, but before returning
+ * the dma_buf fd it must wait for the reads to end and check the result.
+ */
+struct dma_heap_file_task {
+	atomic_t ref;
+	size_t roffset;
+	size_t rsize;
+	size_t rbatch;
+	struct dma_heap_file *heap_file;
+	struct page **parray;
+	unsigned int pindex;
+	bool fail;
+};
+
+/**
+ * struct dma_heap_file_control - global control of dma_heap file read.
+ * @works:		list head of queued @dma_heap_file_work items.
+ *
+ * @lock:		protects @works only.
+ *
+ * @threadwq:		wait queue for @work_thread; committing a work wakes
+ *			@work_thread, which then reads that work's file content.
+ *
+ * @workwq:		where the allocating thread waits for the file read to end
+ *			if allocation ends first; woken as @dma_heap_file_task ref drops.
+ *
+ * @work_thread:	file read kthread, the consumer of dma_heap_file_task works.
+ *
+ * @heap_fwork_cachep:	kmem cache for @dma_heap_file_work, which is alloc/freed often.
+ *
+ * @nr_work:		global count of committed works not yet processed.
+ */
+struct dma_heap_file_control {
+	struct list_head works;
+	spinlock_t lock;
+	wait_queue_head_t threadwq;
+	wait_queue_head_t workwq;
+	struct task_struct *work_thread;
+	struct kmem_cache *heap_fwork_cachep;
+	atomic_t nr_work;
+};
+
+static struct dma_heap_file_control *heap_fctl;
 static LIST_HEAD(heap_list);
 static DEFINE_MUTEX(heap_list_lock);
 static dev_t dma_heap_devt;
 static struct class *dma_heap_class;
 static DEFINE_XARRAY_ALLOC(dma_heap_minors);
 
+/**
+ * map_pages_to_vaddr - map the scattered pages into contiguous virtual address.
+ * @heap_ftask:		task whose prepared work is about to be committed.
+ *
+ * The collected pages are the destination of the file read; this function maps
+ * them into one contiguous virtual range so that the read can use them easily.
+ * Once the vaddr mapping exists, the pages can go back to their original user,
+ * so dma-buf export is not affected even if the file read has not ended.
+ */
+static void *map_pages_to_vaddr(struct dma_heap_file_task *heap_ftask)
+{
+	return vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
+		    PAGE_KERNEL);
+}
+
+bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
+				struct page *page)
+{
+	struct page **array = heap_ftask->parray;
+	int index = heap_ftask->pindex;
+	int num = compound_nr(page), i;
+	unsigned long sz = page_size(page);
+
+	heap_ftask->rsize += sz;
+	for (i = 0; i < num; ++i)
+		array[index++] = &page[i];
+	heap_ftask->pindex = index;
+
+	return heap_ftask->rsize >= heap_ftask->rbatch;
+}
+
+static struct dma_heap_file_work *
+init_file_work(struct dma_heap_file_task *heap_ftask)
+{
+	struct dma_heap_file_work *heap_fwork;
+	struct dma_heap_file *heap_file = heap_ftask->heap_file;
+
+	if (READ_ONCE(heap_ftask->fail))
+		return NULL;
+
+	heap_fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
+	if (unlikely(!heap_fwork))
+		return NULL;
+
+	heap_fwork->vaddr = map_pages_to_vaddr(heap_ftask);
+	if (unlikely(!heap_fwork->vaddr)) {
+		kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
+		return NULL;
+	}
+
+	heap_fwork->heap_file = heap_file;
+	heap_fwork->start_size = heap_ftask->roffset;
+	heap_fwork->need_size = heap_ftask->rsize;
+	heap_fwork->refp = &heap_ftask->ref;
+	heap_fwork->failp = &heap_ftask->fail;
+	atomic_inc(&heap_ftask->ref);
+	return heap_fwork;
+}
+
+static void destroy_file_work(struct dma_heap_file_work *heap_fwork)
+{
+	vunmap(heap_fwork->vaddr);
+	atomic_dec(heap_fwork->refp);
+	wake_up(&heap_fctl->workwq);
+
+	kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
+}
+
+/*
+ * Queue the current batch for the reader kthread, buffer-read the tail page
+ * of an O_DIRECT file if needed, and reset @heap_ftask. Returns 0 or -errno.
+ */
+int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
+	struct page *last = NULL;
+	struct dma_heap_file *heap_file = heap_ftask->heap_file;
+	size_t start = heap_ftask->roffset;
+	struct file *file = heap_file->file;
+	size_t fsz = heap_file->fsz;
+
+	if (unlikely(!heap_fwork))
+		return -ENOMEM;
+
+	/*
+	 * If file size is not page aligned, direct I/O can't process the tail.
+	 * So at the tail, hold back the last page and fill it by buffered read.
+	 */
+	if (heap_file->direct && start + heap_ftask->rsize > fsz) {
+		heap_fwork->need_size -= PAGE_SIZE;
+		last = heap_ftask->parray[heap_ftask->pindex - 1];
+	}
+
+	spin_lock(&heap_fctl->lock);
+	list_add_tail(&heap_fwork->list, &heap_fctl->works);
+	spin_unlock(&heap_fctl->lock);
+	atomic_inc(&heap_fctl->nr_work);
+	wake_up(&heap_fctl->threadwq);
+
+	if (last) {
+		char *buf, *pathp;
+		ssize_t err;
+		void *buffer;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (unlikely(!buf))
+			return -ENOMEM;
+		start = PAGE_ALIGN_DOWN(fsz);
+		pathp = file_path(file, buf, PATH_MAX);
+		if (IS_ERR(pathp)) {
+			kfree(buf);
+			return PTR_ERR(pathp);
+		}
+		buffer = kmap_local_page(last); // use page's kaddr.
+		err = kernel_read_file_from_path(pathp, start, &buffer,
+						 fsz - start, &fsz,
+						 READING_POLICY);
+		kunmap_local(buffer);
+		if (err < 0) {
+			pr_err("failed to use buffer kernel_read_file %s, err=%zd, [%zu, %zu], f_sz=%zu\n",
+			       pathp, err, start, fsz, fsz);
+			kfree(buf); /* pathp points into buf: free only after logging */
+			return err;
+		}
+		kfree(buf);
+	}
+
+	heap_ftask->roffset += heap_ftask->rsize;
+	heap_ftask->rsize = 0;
+	heap_ftask->pindex = 0;
+	heap_ftask->rbatch = min_t(size_t,
+				   PAGE_ALIGN(fsz) - heap_ftask->roffset,
+				   heap_ftask->rbatch);
+	return 0;
+}
+
+bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	wait_event_freezable(heap_fctl->workwq,
+			     atomic_read(&heap_ftask->ref) == 0);
+	return heap_ftask->fail;
+}
+
+bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	bool fail;
+
+	dma_heap_wait_for_file_read(heap_ftask);
+	fail = heap_ftask->fail;
+	kvfree(heap_ftask->parray);
+	kfree(heap_ftask);
+	return fail;
+}
+
+struct dma_heap_file_task *
+dma_heap_declare_file_read(struct dma_heap_file *heap_file)
+{
+	struct dma_heap_file_task *heap_ftask =
+		kzalloc(sizeof(*heap_ftask), GFP_KERNEL);
+	if (unlikely(!heap_ftask))
+		return NULL;
+
+	/**
+	 * Batch is the maximum size which we prepare work will meet.
+	 * So, direct alloc this number's page array is OK.
+	 */
+	heap_ftask->parray = kvmalloc_array(heap_file->max_batch >> PAGE_SHIFT,
+					    sizeof(struct page *), GFP_KERNEL);
+	if (unlikely(!heap_ftask->parray))
+		goto put;
+
+	heap_ftask->heap_file = heap_file;
+	heap_ftask->rbatch = heap_file->max_batch;
+	return heap_ftask;
+put:
+	kfree(heap_ftask);
+	return NULL;
+}
+
+static void __work_this_io(struct dma_heap_file_work *heap_fwork)
+{
+	struct dma_heap_file *heap_file = heap_fwork->heap_file;
+	struct file *file = heap_file->file;
+	ssize_t start = heap_fwork->start_size;
+	ssize_t size = heap_fwork->need_size;
+	void *buffer = heap_fwork->vaddr;
+	const struct cred *old_cred;
+	ssize_t err;
+
+	// use real task's cred to read this file.
+	old_cred = override_creds(heap_file->cred);
+	err = kernel_read_file(file, start, &buffer, size, &heap_file->fsz,
+			       READING_POLICY);
+	if (err < 0) {
+		pr_err("use kernel_read_file, err=%ld, [%ld, %ld], f_sz=%ld\n",
+		       err, start, (start + size), heap_file->fsz);
+		WRITE_ONCE(*heap_fwork->failp, true);
+	}
+	// recovery to my cred.
+	revert_creds(old_cred);
+}
+
+static int dma_heap_file_control_thread(void *data)
+{
+	struct dma_heap_file_control *heap_fctl =
+		(struct dma_heap_file_control *)data;
+	struct dma_heap_file_work *worker, *tmp;
+	int nr_work;
+
+	LIST_HEAD(pages);
+	LIST_HEAD(workers);
+
+	while (true) {
+		wait_event_freezable(heap_fctl->threadwq,
+				     atomic_read(&heap_fctl->nr_work) > 0);
+recheck:
+		spin_lock(&heap_fctl->lock);
+		list_splice_init(&heap_fctl->works, &workers);
+		spin_unlock(&heap_fctl->lock);
+
+		if (unlikely(kthread_should_stop())) {
+			list_for_each_entry_safe(worker, tmp, &workers, list) {
+				list_del(&worker->list);
+				destroy_file_work(worker);
+			}
+			break;
+		}
+
+		nr_work = 0;
+		list_for_each_entry_safe(worker, tmp, &workers, list) {
+			++nr_work;
+			list_del(&worker->list);
+			__work_this_io(worker);
+
+			destroy_file_work(worker);
+		}
+		atomic_sub(nr_work, &heap_fctl->nr_work);
+
+		if (atomic_read(&heap_fctl->nr_work) > 0)
+			goto recheck;
+	}
+	return 0;
+}
+
+size_t dma_heap_file_size(struct dma_heap_file *heap_file)
+{
+	return heap_file->fsz;
+}
+
+static int prepare_dma_heap_file(struct dma_heap_file *heap_file, int file_fd,
+				 size_t batch)
+{
+	struct file *file;
+	size_t fsz;
+	int ret;
+
+	file = fget(file_fd);
+	if (!file)
+		return -EINVAL;
+
+	fsz = i_size_read(file_inode(file));
+	if (fsz < batch) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/**
+	 * Selinux block our read, but actually we are reading the stand-in
+	 * for this file.
+	 * So save current's cred and when going to read, override mine, and
+	 * end of read, revert.
+	 */
+	heap_file->cred = prepare_kernel_cred(current);
+	if (unlikely(!heap_file->cred)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	heap_file->file = file;
+	heap_file->max_batch = batch;
+	heap_file->fsz = fsz;
+
+	heap_file->direct = file->f_flags & O_DIRECT;
+
+#define DMA_HEAP_SUGGEST_DIRECT_IO_SIZE (1UL << 30)
+	if (!heap_file->direct && fsz >= DMA_HEAP_SUGGEST_DIRECT_IO_SIZE)
+		pr_warn("alloc read file better to use O_DIRECT to read larget file\n");
+
+	return 0;
+
+err:
+	fput(file);
+	return ret;
+}
+
+static void destroy_dma_heap_file(struct dma_heap_file *heap_file)
+{
+	fput(heap_file->file);
+	put_cred(heap_file->cred);
+}
+
+static int dma_heap_buffer_alloc_read_file(struct dma_heap *heap, int file_fd,
+					   size_t batch, unsigned int fd_flags,
+					   unsigned int heap_flags)
+{
+	struct dma_buf *dmabuf;
+	int fd;
+	struct dma_heap_file heap_file;
+
+	fd = prepare_dma_heap_file(&heap_file, file_fd, batch);
+	if (fd)
+		goto error_file;
+
+	dmabuf = heap->ops->allocate_read_file(heap, &heap_file, fd_flags,
+					       heap_flags);
+	if (IS_ERR(dmabuf)) {
+		fd = PTR_ERR(dmabuf);
+		goto error;
+	}
+
+	fd = dma_buf_fd(dmabuf, fd_flags);
+	if (fd < 0) {
+		dma_buf_put(dmabuf);
+		/* just return, as put will call release and that will free */
+	}
+
+error:
+	destroy_dma_heap_file(&heap_file);
+error_file:
+	return fd;
+}
+
 static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 				 u32 fd_flags,
 				 u64 heap_flags)
@@ -93,6 +545,38 @@  static int dma_heap_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/**
+ * dma_heap_ioctl_allocate_read_file - handler for DMA_HEAP_IOCTL_ALLOC_AND_READ
+ * @file:	the opened heap device; ->private_data is the struct dma_heap
+ * @data:	kernel copy of struct dma_heap_allocation_file_data
+ *
+ * Validates the request, picks the batch size (the page aligned user value,
+ * or DEFAULT_ADI_BATCH when the user passed 0) and stores the resulting
+ * dma-buf fd in the fd field of @data.
+ *
+ * Return: 0 on success, -EINVAL on an invalid request or when the heap has no
+ * allocate_read_file op, otherwise the error of the allocation.
+ */
+static long dma_heap_ioctl_allocate_read_file(struct file *file, void *data)
+{
+	struct dma_heap_allocation_file_data *heap_allocation_file = data;
+	struct dma_heap *heap = file->private_data;
+	size_t batch = DEFAULT_ADI_BATCH;
+	int fd;
+
+	/*
+	 * NOTE(review): this also rejects descriptor 0, which is a valid fd;
+	 * confirm that treating 0 as "not set" is intended.
+	 */
+	if (heap_allocation_file->fd || !heap_allocation_file->file_fd)
+		return -EINVAL;
+
+	if (heap_allocation_file->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS)
+		return -EINVAL;
+
+	if (heap_allocation_file->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
+		return -EINVAL;
+
+	if (!heap->ops->allocate_read_file)
+		return -EINVAL;
+
+	if (heap_allocation_file->batch) {
+		/*
+		 * Align in 64 bits: PAGE_ALIGN() applied to the __u32 field
+		 * itself wraps around to 0 for values within a page of the
+		 * 32-bit maximum, which would hand a zero batch to the heap.
+		 */
+		u64 aligned = PAGE_ALIGN((u64)heap_allocation_file->batch);
+
+		/* reject what does not fit into size_t (32-bit kernels) */
+		if (aligned != (size_t)aligned)
+			return -EINVAL;
+
+		batch = aligned;
+	}
+
+	fd = dma_heap_buffer_alloc_read_file(heap,
+					     heap_allocation_file->file_fd,
+					     batch,
+					     heap_allocation_file->fd_flags,
+					     heap_allocation_file->heap_flags);
+	if (fd < 0)
+		return fd;
+
+	heap_allocation_file->fd = fd;
+	return 0;
+}
+
 static long dma_heap_ioctl_allocate(struct file *file, void *data)
 {
 	struct dma_heap_allocation_data *heap_allocation = data;
@@ -121,6 +605,7 @@  static long dma_heap_ioctl_allocate(struct file *file, void *data)
 
+/* Table of the DMA_HEAP_IOCTL_* command numbers. */
 static unsigned int dma_heap_ioctl_cmds[] = {
 	DMA_HEAP_IOCTL_ALLOC,
+	DMA_HEAP_IOCTL_ALLOC_AND_READ,
 };
 
 static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
@@ -170,6 +655,9 @@  static long dma_heap_ioctl(struct file *file, unsigned int ucmd,
 	case DMA_HEAP_IOCTL_ALLOC:
 		ret = dma_heap_ioctl_allocate(file, kdata);
 		break;
+	case DMA_HEAP_IOCTL_ALLOC_AND_READ:
+		ret = dma_heap_ioctl_allocate_read_file(file, kdata);
+		break;
 	default:
 		ret = -ENOTTY;
 		goto err;
@@ -316,11 +804,44 @@  static int dma_heap_init(void)
 
 	dma_heap_class = class_create(DEVNAME);
 	if (IS_ERR(dma_heap_class)) {
-		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
-		return PTR_ERR(dma_heap_class);
+		ret = PTR_ERR(dma_heap_class);
+		goto fail_class;
 	}
 	dma_heap_class->devnode = dma_heap_devnode;
 
+	heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
+	if (unlikely(!heap_fctl)) {
+		ret =  -ENOMEM;
+		goto fail_alloc;
+	}
+
+	INIT_LIST_HEAD(&heap_fctl->works);
+	init_waitqueue_head(&heap_fctl->threadwq);
+	init_waitqueue_head(&heap_fctl->workwq);
+
+	heap_fctl->work_thread = kthread_run(dma_heap_file_control_thread,
+					     heap_fctl, "heap_fwork_t");
+	if (IS_ERR(heap_fctl->work_thread)) {
+		ret = -ENOMEM;
+		goto fail_thread;
+	}
+
+	heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
+	if (unlikely(!heap_fctl->heap_fwork_cachep)) {
+		ret = -ENOMEM;
+		goto fail_cache;
+	}
+
 	return 0;
+
+fail_cache:
+	kthread_stop(heap_fctl->work_thread);
+fail_thread:
+	kfree(heap_fctl);
+fail_alloc:
+	class_destroy(dma_heap_class);
+fail_class:
+	unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
+	return ret;
 }
 subsys_initcall(dma_heap_init);
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 064bad725061..9c25383f816c 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -12,12 +12,17 @@ 
 #include <linux/cdev.h>
 #include <linux/types.h>
 
+/* Default batch size in bytes (128 << 20, i.e. 128MB). */
+#define DEFAULT_ADI_BATCH (128 << 20)
+
 struct dma_heap;
+struct dma_heap_file_task;
+struct dma_heap_file;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
  * @allocate:		allocate dmabuf and return struct dma_buf ptr
- *
+ * @allocate_read_file: allocate dmabuf and read file, then return struct
+ * dma_buf ptr.
  * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
  */
 struct dma_heap_ops {
@@ -25,6 +30,11 @@  struct dma_heap_ops {
 				    unsigned long len,
 				    u32 fd_flags,
 				    u64 heap_flags);
+
+	struct dma_buf *(*allocate_read_file)(struct dma_heap *heap,
+					      struct dma_heap_file *heap_file,
+					      u32 fd_flags,
+					      u64 heap_flags);
 };
 
 /**
@@ -65,4 +75,49 @@  const char *dma_heap_get_name(struct dma_heap *heap);
  */
 struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
 
+/**
+ * dma_heap_destroy_file_read - waits for a file read to complete then destroy it
+ * Returns: true if the file read failed, false otherwise
+ */
+bool dma_heap_destroy_file_read(struct dma_heap_file_task *heap_ftask);
+
+/**
+ * dma_heap_wait_for_file_read - waits for a file read to complete
+ * Returns: true if the file read failed, false otherwise
+ */
+bool dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask);
+
+/**
+ * dma_heap_declare_file_read - Declare a task that reads a file while pages are allocated.
+ * @heap_file:		target file to read
+ *
+ * Return NULL if failed, otherwise return a struct pointer.
+ */
+struct dma_heap_file_task *
+dma_heap_declare_file_read(struct dma_heap_file *heap_file);
+
+/**
+ * dma_heap_prepare_file_read - cache each allocated page until we meet this batch.
+ * @heap_ftask:		prepared and need to commit's work.
+ * @page:		current allocated page. don't care which order.
+ *
+ * Returns true once the batch has been reached, false if preparing should go on.
+ */
+bool dma_heap_prepare_file_read(struct dma_heap_file_task *heap_ftask,
+				struct page *page);
+
+/**
+ * dma_heap_submit_file_read - enough memory has been collected, trigger the IO
+ * @heap_ftask:			info that current IO needs
+ *
+ * This commit will also check if reach to tail read.
+ * For direct I/O submissions, it is necessary to pay attention to file reads
+ * that are not page-aligned. For the unaligned portion of the read, buffer IO
+ * needs to be triggered.
+ * Returns:
+ *   0 if all right, -errno if something wrong
+ */
+int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask);
+size_t dma_heap_file_size(struct dma_heap_file *heap_file);
+
 #endif /* _DMA_HEAPS_H */
diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
index a4cf716a49fa..8c20e8b74eed 100644
--- a/include/uapi/linux/dma-heap.h
+++ b/include/uapi/linux/dma-heap.h
@@ -39,6 +39,27 @@  struct dma_heap_allocation_data {
 	__u64 heap_flags;
 };
 
+/**
+ * struct dma_heap_allocation_file_data - metadata passed from userspace for
+ *                                      allocating a dma-buf and reading a
+ *                                      file into it
+ * @fd:			will be populated with a fd which provides the
+ *			handle to the allocated dma-buf
+ * @file_fd:		file descriptor of the file to read from (opening the
+ *			file with O_DIRECT is suggested)
+ * @batch:		amount of memory, in bytes, that is allocated before a
+ *			file read is issued; it is automatically aligned up to
+ *			PAGE_SIZE, the default is 128MB
+ * @fd_flags:		file descriptor flags used when allocating
+ * @heap_flags:		flags passed to heap
+ *
+ * Provided by userspace as an argument to the ioctl
+ */
+struct dma_heap_allocation_file_data {
+	__u32 fd;
+	__u32 file_fd;
+	__u32 batch;
+	__u32 fd_flags;
+	__u64 heap_flags;
+};
+
 #define DMA_HEAP_IOC_MAGIC		'H'
 
 /**
@@ -50,4 +71,15 @@  struct dma_heap_allocation_data {
 #define DMA_HEAP_IOCTL_ALLOC	_IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
 				      struct dma_heap_allocation_data)
 
+/**
+ * DOC: DMA_HEAP_IOCTL_ALLOC_AND_READ - allocate memory from pool and read a
+ *					file into it while allocating.
+ *
+ * Takes a dma_heap_allocation_file_data struct and returns it with the fd field
+ * populated with the dmabuf handle of the allocation. On return, the dma-buf
+ * holds the content read from the file.
+ */
+#define DMA_HEAP_IOCTL_ALLOC_AND_READ \
+	_IOWR(DMA_HEAP_IOC_MAGIC, 0x1, struct dma_heap_allocation_file_data)
+
 #endif /* _UAPI_LINUX_DMABUF_POOL_H */