diff mbox series

[v6,21/34] 9p: Pin pages rather than ref'ing if appropriate

Message ID 167391063242.2311931.3275290816918213423.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series iov_iter: Improve page extraction (ref, pin or just list) | expand

Commit Message

David Howells Jan. 16, 2023, 11:10 p.m. UTC
Convert the 9p filesystem to use iov_iter_extract_pages() instead of
iov_iter_get_pages().  This will pin pages or leave them unaltered rather
than getting a ref on them as appropriate to the iterator.

The pages need to be pinned for DIO-read rather than having refs taken on
them to prevent VM copy-on-write from malfunctioning during a concurrent
fork() (the result of the I/O would otherwise end up only visible to the
child process and not the parent).

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Eric Van Hensbergen <ericvh@gmail.com>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: v9fs-developer@lists.sourceforge.net
---

 net/9p/trans_common.c |    6 ++-
 net/9p/trans_common.h |    3 +-
 net/9p/trans_virtio.c |   89 ++++++++++++++-----------------------------------
 3 files changed, 31 insertions(+), 67 deletions(-)

Comments

Al Viro Jan. 19, 2023, 2:52 a.m. UTC | #1
On Mon, Jan 16, 2023 at 11:10:32PM +0000, David Howells wrote:

> @@ -310,73 +310,34 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
>  			       struct iov_iter *data,
>  			       int count,
>  			       size_t *offs,
> -			       int *need_drop,
> +			       int *cleanup_mode,
>  			       unsigned int gup_flags)
>  {
>  	int nr_pages;
>  	int err;
> +	int n;
>  
>  	if (!iov_iter_count(data))
>  		return 0;
>  
> -	if (!iov_iter_is_kvec(data)) {
> -		int n;
> -		/*
> -		 * We allow only p9_max_pages pinned. We wait for the
> -		 * Other zc request to finish here
> -		 */
> -		if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
> -			err = wait_event_killable(vp_wq,
> -			      (atomic_read(&vp_pinned) < chan->p9_max_pages));
> -			if (err == -ERESTARTSYS)
> -				return err;
> -		}
> -		n = iov_iter_get_pages_alloc(data, pages, count, offs,
> -					     gup_flags);
> -		if (n < 0)
> -			return n;
> -		*need_drop = 1;
> -		nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
> -		atomic_add(nr_pages, &vp_pinned);
> -		return n;
> -	} else {
> -		/* kernel buffer, no need to pin pages */
> -		int index;
> -		size_t len;
> -		void *p;
> -
> -		/* we'd already checked that it's non-empty */
> -		while (1) {
> -			len = iov_iter_single_seg_count(data);
> -			if (likely(len)) {
> -				p = data->kvec->iov_base + data->iov_offset;
> -				break;
> -			}
> -			iov_iter_advance(data, 0);
> -		}
> -		if (len > count)
> -			len = count;
> -
> -		nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
> -			   (unsigned long)p / PAGE_SIZE;
> -
> -		*pages = kmalloc_array(nr_pages, sizeof(struct page *),
> -				       GFP_NOFS);
> -		if (!*pages)
> -			return -ENOMEM;
> -
> -		*need_drop = 0;
> -		p -= (*offs = offset_in_page(p));
> -		for (index = 0; index < nr_pages; index++) {
> -			if (is_vmalloc_addr(p))
> -				(*pages)[index] = vmalloc_to_page(p);
> -			else
> -				(*pages)[index] = kmap_to_page(p);
> -			p += PAGE_SIZE;
> -		}
> -		iov_iter_advance(data, len);
> -		return len;
> +	/*
> +	 * We allow only p9_max_pages pinned. We wait for the
> +	 * Other zc request to finish here
> +	 */
> +	if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
> +		err = wait_event_killable(vp_wq,
> +					  (atomic_read(&vp_pinned) < chan->p9_max_pages));
> +		if (err == -ERESTARTSYS)
> +			return err;
>  	}
> +
> +	n = iov_iter_extract_pages(data, pages, count, offs, gup_flags);

Wait a sec; just how would that work for ITER_KVEC?  AFAICS, in your
tree that would blow with -EFAULT...

Yup; in p9_client_readdir() in your tree:

net/9p/client.c:2057:	iov_iter_kvec(&to, ITER_DEST, &kv, 1, count);

net/9p/client.c:2077:		req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0,
net/9p/client.c:2078:				       11, "dqd", fid->fid, offset, rsize);

where
net/9p/client.c:799:	err = c->trans_mod->zc_request(c, req, uidata, uodata,
net/9p/client.c:800:				       inlen, olen, in_hdrlen);

and in p9_virtio_zc_request(), which is a possible ->zc_request() instance
net/9p/trans_virtio.c:402:		int n = p9_get_mapped_pages(chan, &out_pages, uodata,
net/9p/trans_virtio.c:403:					    outlen, &offs, &cleanup_mode,
net/9p/trans_virtio.c:404:					    FOLL_DEST_BUF);

with p9_get_mapped_pages() hitting
net/9p/trans_virtio.c:334:	n = iov_iter_extract_pages(data, pages, count, offs, gup_flags);
net/9p/trans_virtio.c:335:	if (n < 0)
net/9p/trans_virtio.c:336:		return n;

and in iov_iter_extract_get_pages()
lib/iov_iter.c:2250:	if (likely(user_backed_iter(i)))
lib/iov_iter.c:2251:		return iov_iter_extract_user_pages(i, pages, maxsize,
lib/iov_iter.c:2252:						   maxpages, gup_flags,
lib/iov_iter.c:2253:						   offset0);
lib/iov_iter.c:2254:	if (iov_iter_is_bvec(i))
lib/iov_iter.c:2255:		return iov_iter_extract_bvec_pages(i, pages, maxsize,
lib/iov_iter.c:2256:						   maxpages, gup_flags,
lib/iov_iter.c:2257:						   offset0);
lib/iov_iter.c:2258:	if (iov_iter_is_pipe(i))
lib/iov_iter.c:2259:		return iov_iter_extract_pipe_pages(i, pages, maxsize,
lib/iov_iter.c:2260:						   maxpages, gup_flags,
lib/iov_iter.c:2261:						   offset0);
lib/iov_iter.c:2262:	if (iov_iter_is_xarray(i))
lib/iov_iter.c:2263:		return iov_iter_extract_xarray_pages(i, pages, maxsize,
lib/iov_iter.c:2264:						     maxpages, gup_flags,
lib/iov_iter.c:2265:						     offset0);
lib/iov_iter.c:2266:	return -EFAULT;

All quoted lines by your
https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/tree/
How could that possibly work?
David Howells Jan. 19, 2023, 4:44 p.m. UTC | #2
Al Viro <viro@zeniv.linux.org.uk> wrote:

> Wait a sec; just how would that work for ITER_KVEC?  AFAICS, in your
> tree that would blow with -EFAULT...

You're right.  I wonder if I should handle ITER_KVEC in
iov_iter_extract_pages(), though I'm sure I've been told that a kvec might
point to data that doesn't have a matching page struct.  Or maybe it's that
the refcount shouldn't be changed on it.

A question for the 9p devs:

Looking more into p9_virtio_zc_request(), it might be better to use
netfs_extract_iter_to_sg(), since the page list is going to get turned into
one, instead of calling p9_get_mapped_pages() and pack_sg_list().

This would, however, require that chan->sg[] be populated outside of the
spinlock'd section - is there any reason that this can't be the case?  There's
nothing inside the locked section that makes sure the chan can be used before
it launches into loading up the scatterlist.  There is a wait afterwards, but
it has to drop the lock first, so wouldn't stop a parallel op from clobbering
chan->sg[] anyway.

Further, if virtqueue_add_sgs() fails with -ENOSPC and we go round again to
req_retry_pinned, do we actually need to reload chan->sg[]?

David
Christoph Hellwig Jan. 19, 2023, 4:51 p.m. UTC | #3
On Thu, Jan 19, 2023 at 04:44:14PM +0000, David Howells wrote:
> Al Viro <viro@zeniv.linux.org.uk> wrote:
> You're right.  I wonder if I should handle ITER_KVEC in
> iov_iter_extract_pages(), though I'm sure I've been told that a kvec might
> point to data that doesn't have a matching page struct.  Or maybe it's that
> the refcount shouldn't be changed on it.

They could in theory contain non-page backed memory, even if I don't
think we currently have that in tree.  The worst case is probably
vmalloc()ed memory.  Many instance will have no good way to deal with
something that isn't page backed.  That's one reason why I'd relaly
love to see ITER_KVEC go away - for most use cases ITER_BVEC is the
right thing, and the others are probably broken for various combinations
already, but that's going to be a fair amount of work.  For now just
failing the I/O if the instance can't deal with it is probably the
right thing.
diff mbox series

Patch

diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
index c827f694551c..31d133412677 100644
--- a/net/9p/trans_common.c
+++ b/net/9p/trans_common.c
@@ -12,13 +12,15 @@ 
  * p9_release_pages - Release pages after the transaction.
  * @pages: array of pages to be put
  * @nr_pages: size of array
+ * @cleanup_mode: How to clean up the pages.
  */
-void p9_release_pages(struct page **pages, int nr_pages)
+void p9_release_pages(struct page **pages, int nr_pages,
+		      unsigned int cleanup_mode)
 {
 	int i;
 
 	for (i = 0; i < nr_pages; i++)
 		if (pages[i])
-			put_page(pages[i]);
+			page_put_unpin(pages[i], cleanup_mode);
 }
 EXPORT_SYMBOL(p9_release_pages);
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
index 32134db6abf3..9b20eb4f2359 100644
--- a/net/9p/trans_common.h
+++ b/net/9p/trans_common.h
@@ -4,4 +4,5 @@ 
  * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
  */
 
-void p9_release_pages(struct page **pages, int nr_pages);
+void p9_release_pages(struct page **pages, int nr_pages,
+		      unsigned int cleanup_mode);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index eb28b54fe5f6..561f7cbd79da 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -310,73 +310,34 @@  static int p9_get_mapped_pages(struct virtio_chan *chan,
 			       struct iov_iter *data,
 			       int count,
 			       size_t *offs,
-			       int *need_drop,
+			       int *cleanup_mode,
 			       unsigned int gup_flags)
 {
 	int nr_pages;
 	int err;
+	int n;
 
 	if (!iov_iter_count(data))
 		return 0;
 
-	if (!iov_iter_is_kvec(data)) {
-		int n;
-		/*
-		 * We allow only p9_max_pages pinned. We wait for the
-		 * Other zc request to finish here
-		 */
-		if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
-			err = wait_event_killable(vp_wq,
-			      (atomic_read(&vp_pinned) < chan->p9_max_pages));
-			if (err == -ERESTARTSYS)
-				return err;
-		}
-		n = iov_iter_get_pages_alloc(data, pages, count, offs,
-					     gup_flags);
-		if (n < 0)
-			return n;
-		*need_drop = 1;
-		nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
-		atomic_add(nr_pages, &vp_pinned);
-		return n;
-	} else {
-		/* kernel buffer, no need to pin pages */
-		int index;
-		size_t len;
-		void *p;
-
-		/* we'd already checked that it's non-empty */
-		while (1) {
-			len = iov_iter_single_seg_count(data);
-			if (likely(len)) {
-				p = data->kvec->iov_base + data->iov_offset;
-				break;
-			}
-			iov_iter_advance(data, 0);
-		}
-		if (len > count)
-			len = count;
-
-		nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
-			   (unsigned long)p / PAGE_SIZE;
-
-		*pages = kmalloc_array(nr_pages, sizeof(struct page *),
-				       GFP_NOFS);
-		if (!*pages)
-			return -ENOMEM;
-
-		*need_drop = 0;
-		p -= (*offs = offset_in_page(p));
-		for (index = 0; index < nr_pages; index++) {
-			if (is_vmalloc_addr(p))
-				(*pages)[index] = vmalloc_to_page(p);
-			else
-				(*pages)[index] = kmap_to_page(p);
-			p += PAGE_SIZE;
-		}
-		iov_iter_advance(data, len);
-		return len;
+	/*
+	 * We allow only p9_max_pages pinned. We wait for the
+	 * Other zc request to finish here
+	 */
+	if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
+		err = wait_event_killable(vp_wq,
+					  (atomic_read(&vp_pinned) < chan->p9_max_pages));
+		if (err == -ERESTARTSYS)
+			return err;
 	}
+
+	n = iov_iter_extract_pages(data, pages, count, offs, gup_flags);
+	if (n < 0)
+		return n;
+	*cleanup_mode = iov_iter_extract_mode(data, gup_flags);
+	nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
+	atomic_add(nr_pages, &vp_pinned);
+	return n;
 }
 
 static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
@@ -431,7 +392,7 @@  p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
 	struct virtio_chan *chan = client->trans;
 	struct scatterlist *sgs[4];
 	size_t offs;
-	int need_drop = 0;
+	int cleanup_mode = 0;
 	int kicked = 0;
 
 	p9_debug(P9_DEBUG_TRANS, "virtio request\n");
@@ -439,7 +400,7 @@  p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
 	if (uodata) {
 		__le32 sz;
 		int n = p9_get_mapped_pages(chan, &out_pages, uodata,
-					    outlen, &offs, &need_drop,
+					    outlen, &offs, &cleanup_mode,
 					    FOLL_DEST_BUF);
 		if (n < 0) {
 			err = n;
@@ -459,7 +420,7 @@  p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
 		memcpy(&req->tc.sdata[0], &sz, sizeof(sz));
 	} else if (uidata) {
 		int n = p9_get_mapped_pages(chan, &in_pages, uidata,
-					    inlen, &offs, &need_drop,
+					    inlen, &offs, &cleanup_mode,
 					    FOLL_SOURCE_BUF);
 		if (n < 0) {
 			err = n;
@@ -546,14 +507,14 @@  p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
 	 * Non kernel buffers are pinned, unpin them
 	 */
 err_out:
-	if (need_drop) {
+	if (cleanup_mode) {
 		if (in_pages) {
 			p9_release_pages(in_pages, in_nr_pages);
-			atomic_sub(in_nr_pages, &vp_pinned);
+			atomic_sub(in_nr_pages, &vp_pinned, cleanup_mode);
 		}
 		if (out_pages) {
 			p9_release_pages(out_pages, out_nr_pages);
-			atomic_sub(out_nr_pages, &vp_pinned);
+			atomic_sub(out_nr_pages, &vp_pinned, cleanup_mode);
 		}
 		/* wakeup anybody waiting for slots to pin pages */
 		wake_up(&vp_wq);