diff mbox series

[6/6] fuse: convert direct IO paths to use FOLL_PIN

Message ID 20220227093434.2889464-7-jhubbard@nvidia.com (mailing list archive)
State New
Headers show
Series block, fs: convert most Direct IO cases to FOLL_PIN | expand

Commit Message

jhubbard.send.patches@gmail.com Feb. 27, 2022, 9:34 a.m. UTC
From: John Hubbard <jhubbard@nvidia.com>

Convert the fuse filesystem to support the new iov_iter_get_pages()
behavior. That routine now invokes pin_user_pages_fast(), which means
that such pages must be released via unpin_user_page(), rather than via
put_page().

This commit also removes any possibility of kernel pages being handled,
in the fuse_get_user_pages() call. Although this may seem like a steep
price to pay, Christoph Hellwig actually recommended it a few years ago
for nearly the same situation [1].

[1] https://lore.kernel.org/kvm/20190724061750.GA19397@infradead.org/

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 fs/fuse/dev.c  |  7 +++++--
 fs/fuse/file.c | 38 +++++++++-----------------------------
 2 files changed, 14 insertions(+), 31 deletions(-)

Comments

Miklos Szeredi Feb. 28, 2022, 3:59 p.m. UTC | #1
On Sun, 27 Feb 2022 at 10:34, <jhubbard.send.patches@gmail.com> wrote:
>
> From: John Hubbard <jhubbard@nvidia.com>
>
> Convert the fuse filesystem to support the new iov_iter_get_pages()
> behavior. That routine now invokes pin_user_pages_fast(), which means
> that such pages must be released via unpin_user_page(), rather than via
> put_page().
>
> This commit also removes any possibility of kernel pages being handled,
> in the fuse_get_user_pages() call. Although this may seem like a steep
> price to pay, Christoph Hellwig actually recommended it a few years ago
> for nearly the same situation [1].

This might work for O_DIRECT, but fuse has this mode of operation
which turns normal "buffered" I/O into direct I/O.  And that in turn
will break execve of such files.

So AFAICS we need to keep kvec handling in some way.

Thanks,
Miklos
John Hubbard Feb. 28, 2022, 9:16 p.m. UTC | #2
On 2/28/22 07:59, Miklos Szeredi wrote:
> On Sun, 27 Feb 2022 at 10:34, <jhubbard.send.patches@gmail.com> wrote:
>>
>> From: John Hubbard <jhubbard@nvidia.com>
>>
>> Convert the fuse filesystem to support the new iov_iter_get_pages()
>> behavior. That routine now invokes pin_user_pages_fast(), which means
>> that such pages must be released via unpin_user_page(), rather than via
>> put_page().
>>
>> This commit also removes any possibility of kernel pages being handled,
>> in the fuse_get_user_pages() call. Although this may seem like a steep
>> price to pay, Christoph Hellwig actually recommended it a few years ago
>> for nearly the same situation [1].
> 
> This might work for O_DIRECT, but fuse has this mode of operation
> which turns normal "buffered" I/O into direct I/O.  And that in turn
> will break execve of such files.
> 
> So AFAICS we need to keep kvec handing in some way.
> 

Thanks for bringing that up! Do you have any hints for me, to jump start
a deeper look? And especially, sample programs that exercise this?


thanks,
Miklos Szeredi March 1, 2022, 9:41 a.m. UTC | #3
On Mon, 28 Feb 2022 at 22:16, John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 2/28/22 07:59, Miklos Szeredi wrote:
> > On Sun, 27 Feb 2022 at 10:34, <jhubbard.send.patches@gmail.com> wrote:
> >>
> >> From: John Hubbard <jhubbard@nvidia.com>
> >>
> >> Convert the fuse filesystem to support the new iov_iter_get_pages()
> >> behavior. That routine now invokes pin_user_pages_fast(), which means
> >> that such pages must be released via unpin_user_page(), rather than via
> >> put_page().
> >>
> >> This commit also removes any possibility of kernel pages being handled,
> >> in the fuse_get_user_pages() call. Although this may seem like a steep
> >> price to pay, Christoph Hellwig actually recommended it a few years ago
> >> for nearly the same situation [1].
> >
> > This might work for O_DIRECT, but fuse has this mode of operation
> > which turns normal "buffered" I/O into direct I/O.  And that in turn
> > will break execve of such files.
> >
> > So AFAICS we need to keep kvec handing in some way.
> >
>
> Thanks for bringing that up! Do you have any hints for me, to jump start

How about just leaving that special code in place?   It bypasses page
refs and directly copies to the kernel buffer, so it should not have
any effect on the user page code.

> a deeper look? And especially, sample programs that exercise this?

Here's one:
# uncomment as appropriate:
#sudo dnf install fuse3-devel
#sudo apt install libfuse3-dev

cat <<EOF > fuse-dio-exec.c
#define FUSE_USE_VERSION 31
#include <fuse.h>
#include <errno.h>
#include <unistd.h>

/* Path of the real file that the getattr and open handlers operate on. */
static const char *filename = "/bin/true";

/*
 * getattr handler: fill *stbuf with the attributes of the backing file.
 *
 * The path and fi arguments are not consulted; lstat() is always applied
 * to `filename`.
 *
 * Returns 0 on success, or the negated errno when lstat() fails.
 */
static int test_getattr(const char *path, struct stat *stbuf,
                        struct fuse_file_info *fi)
{
    if (lstat(filename, stbuf) == -1)
        return -errno;

    return 0;
}

/*
 * open handler: open the backing file with the flags supplied by the
 * caller and remember the descriptor in fi->fh for later callbacks.
 *
 * Also sets fi->direct_io, so the file is served in FUSE direct-I/O mode.
 *
 * Returns 0 on success, or the negated errno when open() fails.
 */
static int test_open(const char *path, struct fuse_file_info *fi)
{
    int fd = open(filename, fi->flags);

    if (fd == -1)
        return -errno;

    fi->direct_io = 1;
    fi->fh = fd;
    return 0;
}

/*
 * read handler: read up to `size` bytes at `offset` from the descriptor
 * stored in fi->fh into `buf`.
 *
 * Returns the number of bytes read, or the negated errno when pread()
 * fails.
 */
static int test_read(const char *path, char *buf, size_t size, off_t offset,
                     struct fuse_file_info *fi)
{
    /*
     * Keep pread()'s result in its native ssize_t so the -1 error check is
     * done on the untruncated value; narrow to int only when returning.
     */
    ssize_t nread = pread(fi->fh, buf, size, offset);

    if (nread == -1)
        return -errno;

    return (int)nread;
}

/*
 * release handler: close the descriptor that was stored in fi->fh.
 *
 * The result of close() is not checked; the handler always returns 0.
 */
static int test_release(const char *path, struct fuse_file_info *fi)
{
    int fd = fi->fh;

    close(fd);
    return 0;
}

/* Handler table passed to fuse_main(); only these four callbacks are set. */
static const struct fuse_operations test_oper = {
    .getattr = test_getattr,
    .open    = test_open,
    .read    = test_read,
    .release = test_release,
};

/*
 * Entry point: pass the command line and the test_oper handler table to
 * fuse_main() and use its return value as the exit status.
 */
int main(int argc, char *argv[])
{
    int status = fuse_main(argc, argv, &test_oper, NULL);

    return status;
}
EOF

# build the test filesystem; compiler and linker flags come from pkg-config
gcc -W fuse-dio-exec.c `pkg-config fuse3 --cflags --libs` -o fuse-dio-exec
# create the file that is used as the mount point below
touch /tmp/true

#run test:
# start the filesystem on /tmp/true
./fuse-dio-exec /tmp/true
# execute the file that the filesystem now serves
/tmp/true
# tear the mount down again
umount /tmp/true
John Hubbard March 2, 2022, 8:07 a.m. UTC | #4
On 3/1/22 01:41, Miklos Szeredi wrote:
...
>>> This might work for O_DIRECT, but fuse has this mode of operation
>>> which turns normal "buffered" I/O into direct I/O.  And that in turn
>>> will break execve of such files.
>>>
>>> So AFAICS we need to keep kvec handing in some way.
>>>
>>
>> Thanks for bringing that up! Do you have any hints for me, to jump start
> 
> How about just leaving that special code in place?   It bypasses page
> refs and directly copies to the kernel buffer, so it should not have
> any affect on the user page code.
> 

Good idea, I'll go that direction.

>> a deeper look? And especially, sample programs that exercise this?
> 
> Here's one:

This is really helpful, exactly what I was looking for.


thanks!
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e1b4a846c90d..9db85c4d549a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -675,7 +675,10 @@  static void fuse_copy_finish(struct fuse_copy_state *cs)
 			flush_dcache_page(cs->pg);
 			set_page_dirty_lock(cs->pg);
 		}
-		put_page(cs->pg);
+		if (cs->pipebufs)
+			put_page(cs->pg);
+		else
+			unpin_user_page(cs->pg);
 	}
 	cs->pg = NULL;
 }
@@ -730,7 +733,7 @@  static int fuse_copy_fill(struct fuse_copy_state *cs)
 		}
 	} else {
 		size_t off;
-		err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+		err = iov_iter_pin_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
 		if (err < 0)
 			return err;
 		BUG_ON(!err);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 94747bac3489..ecfa5bdde919 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -611,18 +611,6 @@  void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
 	args->out_args[0].size = count;
 }
 
-static void fuse_release_user_pages(struct fuse_args_pages *ap,
-				    bool should_dirty)
-{
-	unsigned int i;
-
-	for (i = 0; i < ap->num_pages; i++) {
-		if (should_dirty)
-			set_page_dirty_lock(ap->pages[i]);
-		put_page(ap->pages[i]);
-	}
-}
-
 static void fuse_io_release(struct kref *kref)
 {
 	kfree(container_of(kref, struct fuse_io_priv, refcnt));
@@ -720,7 +708,8 @@  static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
 	struct fuse_io_priv *io = ia->io;
 	ssize_t pos = -1;
 
-	fuse_release_user_pages(&ia->ap, io->should_dirty);
+	unpin_user_pages_dirty_lock(ia->ap.pages, ia->ap.num_pages,
+				    io->should_dirty);
 
 	if (err) {
 		/* Nothing */
@@ -1382,25 +1371,14 @@  static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
 	size_t nbytes = 0;  /* # bytes already packed in req */
 	ssize_t ret = 0;
 
-	/* Special case for kernel I/O: can copy directly into the buffer */
-	if (iov_iter_is_kvec(ii)) {
-		unsigned long user_addr = fuse_get_user_addr(ii);
-		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
-
-		if (write)
-			ap->args.in_args[1].value = (void *) user_addr;
-		else
-			ap->args.out_args[0].value = (void *) user_addr;
-
-		iov_iter_advance(ii, frag_size);
-		*nbytesp = frag_size;
-		return 0;
-	}
+	/* Only user space buffers are allowed with fuse Direct IO. */
+	if (WARN_ON_ONCE(!iter_is_iovec(ii)))
+		return -EOPNOTSUPP;
 
 	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+		ret = iov_iter_pin_pages(ii, &ap->pages[ap->num_pages],
 					*nbytesp - nbytes,
 					max_pages - ap->num_pages,
 					&start);
@@ -1484,7 +1462,9 @@  ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 		}
 
 		if (!io->async || nres < 0) {
-			fuse_release_user_pages(&ia->ap, io->should_dirty);
+			unpin_user_pages_dirty_lock(ia->ap.pages,
+						    ia->ap.num_pages,
+						    io->should_dirty);
 			fuse_io_free(ia);
 		}
 		ia = NULL;