diff mbox series

[RFC] fuse: use page cache pages for writeback io when virtio_fs is in use

Message ID 20231228123528.705-1-lege.wang@jaguarmicro.com (mailing list archive)
State New
Headers show
Series [RFC] fuse: use page cache pages for writeback io when virtio_fs is in use | expand

Commit Message

Lege Wang Dec. 28, 2023, 12:35 p.m. UTC
Currently if writeback cache policy is enabled, fuse will allocate
temporary pages with GFP_NOFS|__GFP_HIGHMEM for writeback io, commit
3be5a52b30aa ("fuse: support writable mmap") introduces this behavior
primarily for the following two reasons:
  1. filesystem based on fuse may also need additional memory resources
     to complete a WRITE request, then there's a great danger of a
     deadlock if that allocation may wait for the writepage to finish.
  2. buggy or even malicious userspace filesystem may fail to complete
     WRITE requests, then unrelated parts of the system may hang, for
     example, sync operations may hang.

As commit 3be5a52b30aa said, this approach is wasteful in both memory
and CPU bandwidth, but it's necessary for traditional fuse userspace
filesystems. With the emergence of the virtio-fs file system, the
virtio-fs userspace backend are typically provided by cloud providers,
which may have bugs, but should not be seem as malicious, also vm and
virtio-fs userspace backend normally run on different os, so the
deadlock should not happen, though vm hypervisor is running on the os
same to virtio-fs userspace backend.

Nowadays, modern DPUs(Data Processing Unit) start to support virtio
filesystem device, below is virtio-fs spec:
  https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-49600011
Cloud providers or hardware manufacturers provide DPUs, they should
be seem as trusted hardwares, just like nvme ssd nic, etc. The
DPU also has a running os interenally, the basic architecture would look
like below:
      --------------------------
     | host machine             |
     |                          |
     |--------------------------|
     |       os/virtio-fs       |
     ---------------------------|
     |       hardware           |
      --------------------------
                |
                |(pci)
                v
      -------------------------------
     | DPU                           |
     |                               |
     |  virtio-fs userspace backend  |
     |-------------------------------|
     |          os                   |
     | ------------------------------|
     |         hardware              |
      -------------------------------

Obviously, for virtio-fs case, temporary pages are unnecessary, so this
patch changes the write-back cache policy a bit, if fuse req is passed on
virtio_fs channel, there's no needs to allocate temporary pages, use
page cache pages for write-back io directly.

Use fio to evaluate performance improvement:
[global]
ioengine=psync
fsync=1
direct=0
bs=1048576
rw=randwrite
randrepeat=0
iodepth=1
size=512M
runtime=30
time_based
filename=/home/lege/mntpoint/testfile2

[job1]
cpus_allowed=1

Before patch:
WRITE: bw=655MiB/s (686MB/s), 655MiB/s-655MiB/s (686MB/s-686MB/s), io=19.2GiB (20.6GB), run=30001-30001msec

After patch:
WRITE: bw=754MiB/s (790MB/s), 754MiB/s-754MiB/s (790MB/s-790MB/s), io=22.1GiB (23.7GB), run=30001-30001msec

About 15% throughput improvement.

Signed-off-by: Xiaoguang Wang <lege.wang@jaguarmicro.com>
---
 fs/fuse/file.c      | 109 +++++++++++++++++++++++++++++++-------------
 fs/fuse/fuse_i.h    |   6 +++
 fs/fuse/inode.c     |   5 +-
 fs/fuse/virtio_fs.c |   1 +
 4 files changed, 89 insertions(+), 32 deletions(-)

Comments

Lege Wang Dec. 28, 2023, 1:06 p.m. UTC | #1
hi,

This patch just shows the idea, to see if I'm in the right direction 
Miklos Szeredi Jan. 15, 2024, 8:48 a.m. UTC | #2
On Thu, 28 Dec 2023 at 14:06, Lege Wang <lege.wang@jaguarmicro.com> wrote:
>
> hi,
>
> This patch just shows the idea, to see if I'm in the right direction 
diff mbox series

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..3d14a291cd3e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -400,6 +400,7 @@  struct fuse_writepage_args {
 	struct fuse_writepage_args *next;
 	struct inode *inode;
 	struct fuse_sync_bucket *bucket;
+	bool has_temporary_page;
 };
 
 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@ -1660,8 +1661,10 @@  static void fuse_writepage_free(struct fuse_writepage_args *wpa)
 	if (wpa->bucket)
 		fuse_sync_bucket_dec(wpa->bucket);
 
-	for (i = 0; i < ap->num_pages; i++)
-		__free_page(ap->pages[i]);
+	if (wpa->has_temporary_page) {
+		for (i = 0; i < ap->num_pages; i++)
+			__free_page(ap->pages[i]);
+	}
 
 	if (wpa->ia.ff)
 		fuse_file_put(wpa->ia.ff, false, false);
@@ -1681,10 +1684,14 @@  static void fuse_writepage_finish(struct fuse_mount *fm,
 
 	for (i = 0; i < ap->num_pages; i++) {
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
+		if (wpa->has_temporary_page)
+			dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
+		else
+			end_page_writeback(ap->pages[i]);
 		wb_writeout_inc(&bdi->wb);
 	}
-	wake_up(&fi->page_waitq);
+	if (wpa->has_temporary_page)
+		wake_up(&fi->page_waitq);
 }
 
 /* Called under fi->lock, may release and reacquire it */
@@ -1823,6 +1830,9 @@  static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
 	if (!fc->writeback_cache)
 		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
 	spin_lock(&fi->lock);
+	if (!wpa->has_temporary_page)
+		goto skip;
+
 	rb_erase(&wpa->writepages_entry, &fi->writepages);
 	while (wpa->next) {
 		struct fuse_mount *fm = get_fuse_mount(inode);
@@ -1859,6 +1869,7 @@  static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
 		 */
 		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
 	}
+skip:
 	fi->writectr--;
 	fuse_writepage_finish(fm, wpa);
 	spin_unlock(&fi->lock);
@@ -1952,8 +1963,9 @@  static int fuse_writepage_locked(struct page *page)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_writepage_args *wpa;
 	struct fuse_args_pages *ap;
-	struct page *tmp_page;
+	struct page *tmp_page = NULL;
 	int error = -ENOMEM;
+	int needs_temporary_page = !fc->no_temporary_page;
 
 	set_page_writeback(page);
 
@@ -1962,9 +1974,11 @@  static int fuse_writepage_locked(struct page *page)
 		goto err;
 	ap = &wpa->ia.ap;
 
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
-		goto err_free;
+	if (needs_temporary_page) {
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!tmp_page)
+			goto err_free;
+	}
 
 	error = -EIO;
 	wpa->ia.ff = fuse_write_file_get(fi);
@@ -1974,19 +1988,24 @@  static int fuse_writepage_locked(struct page *page)
 	fuse_writepage_add_to_bucket(fc, wpa);
 	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 
-	copy_highpage(tmp_page, page);
+	if (needs_temporary_page) {
+		copy_highpage(tmp_page, page);
+		ap->pages[0] = tmp_page;
+		inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	} else {
+		ap->pages[0] = page;
+	}
 	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
 	wpa->next = NULL;
 	ap->args.in_pages = true;
 	ap->num_pages = 1;
-	ap->pages[0] = tmp_page;
 	ap->descs[0].offset = 0;
 	ap->descs[0].length = PAGE_SIZE;
 	ap->args.end = fuse_writepage_end;
 	wpa->inode = inode;
+	wpa->has_temporary_page = needs_temporary_page;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fi->lock);
 	tree_insert(&fi->writepages, wpa);
@@ -1994,12 +2013,14 @@  static int fuse_writepage_locked(struct page *page)
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
-	end_page_writeback(page);
+	if (needs_temporary_page)
+		end_page_writeback(page);
 
 	return 0;
 
 err_nofile:
-	__free_page(tmp_page);
+	if (needs_temporary_page)
+		__free_page(tmp_page);
 err_free:
 	kfree(wpa);
 err:
@@ -2013,7 +2034,7 @@  static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 	struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
 	int err;
 
-	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+	if ((!fc->no_temporary_page) && fuse_page_is_writeback(page->mapping->host, page->index)) {
 		/*
 		 * ->writepages() should be called for sync() and friends.  We
 		 * should only get here on direct reclaim and then we are
@@ -2084,8 +2105,11 @@  static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
-	for (i = 0; i < num_pages; i++)
-		end_page_writeback(data->orig_pages[i]);
+
+	if (wpa->has_temporary_page) {
+		for (i = 0; i < num_pages; i++)
+			end_page_writeback(data->orig_pages[i]);
+	}
 }
 
 /*
@@ -2156,7 +2180,8 @@  static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
 	 * the pages are faulted with get_user_pages(), and then after the read
 	 * completed.
 	 */
-	if (fuse_page_is_writeback(data->inode, page->index))
+	if (data->wpa->has_temporary_page &&
+	    fuse_page_is_writeback(data->inode, page->index))
 		return true;
 
 	/* Reached max pages */
@@ -2187,8 +2212,9 @@  static int fuse_writepages_fill(struct folio *folio,
 	struct inode *inode = data->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct page *tmp_page;
+	struct page *tmp_page = NULL;
 	int err;
+	int needs_temporary_page = !fc->no_temporary_page;
 
 	if (!data->ff) {
 		err = -EIO;
@@ -2202,10 +2228,12 @@  static int fuse_writepages_fill(struct folio *folio,
 		data->wpa = NULL;
 	}
 
-	err = -ENOMEM;
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
-		goto out_unlock;
+	if (needs_temporary_page) {
+		err = -ENOMEM;
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!tmp_page)
+			goto out_unlock;
+	}
 
 	/*
 	 * The page must not be redirtied until the writeout is completed
@@ -2223,7 +2251,7 @@  static int fuse_writepages_fill(struct folio *folio,
 	if (data->wpa == NULL) {
 		err = -ENOMEM;
 		wpa = fuse_writepage_args_alloc();
-		if (!wpa) {
+		if (!wpa && tmp_page) {
 			__free_page(tmp_page);
 			goto out_unlock;
 		}
@@ -2242,14 +2270,20 @@  static int fuse_writepages_fill(struct folio *folio,
 	}
 	folio_start_writeback(folio);
 
-	copy_highpage(tmp_page, &folio->page);
-	ap->pages[ap->num_pages] = tmp_page;
+	if (needs_temporary_page) {
+		copy_highpage(tmp_page, &folio->page);
+		ap->pages[ap->num_pages] = tmp_page;
+		data->orig_pages[ap->num_pages] = &folio->page;
+		inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	} else {
+		ap->pages[ap->num_pages] = &folio->page;
+		data->orig_pages[ap->num_pages] = &folio->page;
+	}
 	ap->descs[ap->num_pages].offset = 0;
 	ap->descs[ap->num_pages].length = PAGE_SIZE;
-	data->orig_pages[ap->num_pages] = &folio->page;
+	wpa->has_temporary_page = needs_temporary_page;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
 	if (data->wpa) {
@@ -2260,10 +2294,16 @@  static int fuse_writepages_fill(struct folio *folio,
 		spin_lock(&fi->lock);
 		ap->num_pages++;
 		spin_unlock(&fi->lock);
-	} else if (fuse_writepage_add(wpa, &folio->page)) {
-		data->wpa = wpa;
+	} else if (needs_temporary_page) {
+		if (fuse_writepage_add(wpa, &folio->page))
+			data->wpa = wpa;
+		else
+			folio_end_writeback(folio);
 	} else {
-		folio_end_writeback(folio);
+		spin_lock(&fi->lock);
+		data->wpa = wpa;
+		ap->num_pages++;
+		spin_unlock(&fi->lock);
 	}
 out_unlock:
 	folio_unlock(folio);
@@ -2436,6 +2476,9 @@  static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct folio *folio = page_folio(vmf->page);
+	int has_temporary_page = !fc->no_temporary_page;
 
 	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
@@ -2444,7 +2487,11 @@  static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_NOPAGE;
 	}
 
-	fuse_wait_on_page_writeback(inode, page->index);
+	if (has_temporary_page)
+		fuse_wait_on_page_writeback(inode, page->index);
+	else
+		folio_wait_stable(folio);
+
 	return VM_FAULT_LOCKED;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6e6e721f421b..9958f672ba47 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -646,9 +646,15 @@  struct fuse_conn {
 	/** Filesystem supports NFS exporting.  Only set in INIT */
 	unsigned export_support:1;
 
+	/** Indicate fuse req is passed on virtio_fs channel **/
+	unsigned virtio_fs_ch:1;
+
 	/** write-back cache policy (default is write-through) */
 	unsigned writeback_cache:1;
 
+	/** Indicate whether write-back cache policy should use temporary pages **/
+	unsigned no_temporary_page:1;
+
 	/** allow parallel lookups and readdir (default is serialized) */
 	unsigned parallel_dirops:1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 74d4f09d5827..69da39728ae0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1191,8 +1191,11 @@  static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 			}
 			if (flags & FUSE_ASYNC_DIO)
 				fc->async_dio = 1;
-			if (flags & FUSE_WRITEBACK_CACHE)
+			if (flags & FUSE_WRITEBACK_CACHE) {
 				fc->writeback_cache = 1;
+				if (fc->virtio_fs_ch)
+					fc->no_temporary_page = 1;
+			}
 			if (flags & FUSE_PARALLEL_DIROPS)
 				fc->parallel_dirops = 1;
 			if (flags & FUSE_HANDLE_KILLPRIV)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..dfa3806f979f 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1346,6 +1346,7 @@  static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
 
 	/* Previous unmount will stop all queues. Start these again */
 	virtio_fs_start_all_queues(fs);
+	fc->virtio_fs_ch = 1;
 	fuse_send_init(fm);
 	mutex_unlock(&virtio_fs_mutex);
 	return 0;