Message ID | 167391063242.2311931.3275290816918213423.stgit@warthog.procyon.org.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | iov_iter: Improve page extraction (ref, pin or just list) | expand |
On Mon, Jan 16, 2023 at 11:10:32PM +0000, David Howells wrote: > @@ -310,73 +310,34 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, > struct iov_iter *data, > int count, > size_t *offs, > - int *need_drop, > + int *cleanup_mode, > unsigned int gup_flags) > { > int nr_pages; > int err; > + int n; > > if (!iov_iter_count(data)) > return 0; > > - if (!iov_iter_is_kvec(data)) { > - int n; > - /* > - * We allow only p9_max_pages pinned. We wait for the > - * Other zc request to finish here > - */ > - if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { > - err = wait_event_killable(vp_wq, > - (atomic_read(&vp_pinned) < chan->p9_max_pages)); > - if (err == -ERESTARTSYS) > - return err; > - } > - n = iov_iter_get_pages_alloc(data, pages, count, offs, > - gup_flags); > - if (n < 0) > - return n; > - *need_drop = 1; > - nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); > - atomic_add(nr_pages, &vp_pinned); > - return n; > - } else { > - /* kernel buffer, no need to pin pages */ > - int index; > - size_t len; > - void *p; > - > - /* we'd already checked that it's non-empty */ > - while (1) { > - len = iov_iter_single_seg_count(data); > - if (likely(len)) { > - p = data->kvec->iov_base + data->iov_offset; > - break; > - } > - iov_iter_advance(data, 0); > - } > - if (len > count) > - len = count; > - > - nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - > - (unsigned long)p / PAGE_SIZE; > - > - *pages = kmalloc_array(nr_pages, sizeof(struct page *), > - GFP_NOFS); > - if (!*pages) > - return -ENOMEM; > - > - *need_drop = 0; > - p -= (*offs = offset_in_page(p)); > - for (index = 0; index < nr_pages; index++) { > - if (is_vmalloc_addr(p)) > - (*pages)[index] = vmalloc_to_page(p); > - else > - (*pages)[index] = kmap_to_page(p); > - p += PAGE_SIZE; > - } > - iov_iter_advance(data, len); > - return len; > + /* > + * We allow only p9_max_pages pinned. 
We wait for the > + * Other zc request to finish here > + */ > + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { > + err = wait_event_killable(vp_wq, > + (atomic_read(&vp_pinned) < chan->p9_max_pages)); > + if (err == -ERESTARTSYS) > + return err; > } > + > + n = iov_iter_extract_pages(data, pages, count, offs, gup_flags); Wait a sec; just how would that work for ITER_KVEC? AFAICS, in your tree that would blow with -EFAULT... Yup; in p9_client_readdir() in your tree: net/9p/client.c:2057: iov_iter_kvec(&to, ITER_DEST, &kv, 1, count); net/9p/client.c:2077: req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0, net/9p/client.c:2078: 11, "dqd", fid->fid, offset, rsize); where net/9p/client.c:799: err = c->trans_mod->zc_request(c, req, uidata, uodata, net/9p/client.c:800: inlen, olen, in_hdrlen); and in p9_virtio_zc_request(), which is a possible ->zc_request() instance net/9p/trans_virtio.c:402: int n = p9_get_mapped_pages(chan, &out_pages, uodata, net/9p/trans_virtio.c:403: outlen, &offs, &cleanup_mode, net/9p/trans_virtio.c:404: FOLL_DEST_BUF); with p9_get_mapped_pages() hitting net/9p/trans_virtio.c:334: n = iov_iter_extract_pages(data, pages, count, offs, gup_flags); net/9p/trans_virtio.c:335: if (n < 0) net/9p/trans_virtio.c:336: return n; and in iov_iter_extract_get_pages() lib/iov_iter.c:2250: if (likely(user_backed_iter(i))) lib/iov_iter.c:2251: return iov_iter_extract_user_pages(i, pages, maxsize, lib/iov_iter.c:2252: maxpages, gup_flags, lib/iov_iter.c:2253: offset0); lib/iov_iter.c:2254: if (iov_iter_is_bvec(i)) lib/iov_iter.c:2255: return iov_iter_extract_bvec_pages(i, pages, maxsize, lib/iov_iter.c:2256: maxpages, gup_flags, lib/iov_iter.c:2257: offset0); lib/iov_iter.c:2258: if (iov_iter_is_pipe(i)) lib/iov_iter.c:2259: return iov_iter_extract_pipe_pages(i, pages, maxsize, lib/iov_iter.c:2260: maxpages, gup_flags, lib/iov_iter.c:2261: offset0); lib/iov_iter.c:2262: if (iov_iter_is_xarray(i)) lib/iov_iter.c:2263: return 
iov_iter_extract_xarray_pages(i, pages, maxsize, lib/iov_iter.c:2264: maxpages, gup_flags, lib/iov_iter.c:2265: offset0); lib/iov_iter.c:2266: return -EFAULT; All quoted lines by your https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/tree/ How could that possibly work?
Al Viro <viro@zeniv.linux.org.uk> wrote: > Wait a sec; just how would that work for ITER_KVEC? AFAICS, in your > tree that would blow with -EFAULT... You're right. I wonder if I should handle ITER_KVEC in iov_iter_extract_pages(), though I'm sure I've been told that a kvec might point to data that doesn't have a matching page struct. Or maybe it's that the refcount shouldn't be changed on it. A question for the 9p devs: Looking more into p9_virtio_zc_request(), it might be better to use netfs_extract_iter_to_sg(), since the page list is going to get turned into one, instead of calling p9_get_mapped_pages() and pack_sg_list(). This would, however, require that chan->sg[] be populated outside of the spinlock'd section - is there any reason that this can't be the case? There's nothing inside the locked section that makes sure the chan can be used before it launches into loading up the scatterlist. There is a wait afterwards, but it has to drop the lock first, so wouldn't stop a parallel op from clobbering chan->sg[] anyway. Further, if virtqueue_add_sgs() fails with -ENOSPC and we go round again to req_retry_pinned, do we actually need to reload chan->sg[]? David
On Thu, Jan 19, 2023 at 04:44:14PM +0000, David Howells wrote: > Al Viro <viro@zeniv.linux.org.uk> wrote: > You're right. I wonder if I should handle ITER_KVEC in > iov_iter_extract_pages(), though I'm sure I've been told that a kvec might > point to data that doesn't have a matching page struct. Or maybe it's that > the refcount shouldn't be changed on it. They could in theory contain non-page backed memory, even if I don't think we currently have that in tree. The worst case is probably vmalloc()ed memory. Many instances will have no good way to deal with something that isn't page backed. That's one reason why I'd really love to see ITER_KVEC go away - for most use cases ITER_BVEC is the right thing, and the others are probably broken for various combinations already, but that's going to be a fair amount of work. For now just failing the I/O if the instance can't deal with it is probably the right thing.
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index c827f694551c..31d133412677 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -12,13 +12,15 @@ * p9_release_pages - Release pages after the transaction. * @pages: array of pages to be put * @nr_pages: size of array + * @cleanup_mode: How to clean up the pages. */ -void p9_release_pages(struct page **pages, int nr_pages) +void p9_release_pages(struct page **pages, int nr_pages, + unsigned int cleanup_mode) { int i; for (i = 0; i < nr_pages; i++) if (pages[i]) - put_page(pages[i]); + page_put_unpin(pages[i], cleanup_mode); } EXPORT_SYMBOL(p9_release_pages); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h index 32134db6abf3..9b20eb4f2359 100644 --- a/net/9p/trans_common.h +++ b/net/9p/trans_common.h @@ -4,4 +4,5 @@ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com> */ -void p9_release_pages(struct page **pages, int nr_pages); +void p9_release_pages(struct page **pages, int nr_pages, + unsigned int cleanup_mode); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index eb28b54fe5f6..561f7cbd79da 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -310,73 +310,34 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, struct iov_iter *data, int count, size_t *offs, - int *need_drop, + int *cleanup_mode, unsigned int gup_flags) { int nr_pages; int err; + int n; if (!iov_iter_count(data)) return 0; - if (!iov_iter_is_kvec(data)) { - int n; - /* - * We allow only p9_max_pages pinned. 
We wait for the - * Other zc request to finish here - */ - if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_killable(vp_wq, - (atomic_read(&vp_pinned) < chan->p9_max_pages)); - if (err == -ERESTARTSYS) - return err; - } - n = iov_iter_get_pages_alloc(data, pages, count, offs, - gup_flags); - if (n < 0) - return n; - *need_drop = 1; - nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); - atomic_add(nr_pages, &vp_pinned); - return n; - } else { - /* kernel buffer, no need to pin pages */ - int index; - size_t len; - void *p; - - /* we'd already checked that it's non-empty */ - while (1) { - len = iov_iter_single_seg_count(data); - if (likely(len)) { - p = data->kvec->iov_base + data->iov_offset; - break; - } - iov_iter_advance(data, 0); - } - if (len > count) - len = count; - - nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - - (unsigned long)p / PAGE_SIZE; - - *pages = kmalloc_array(nr_pages, sizeof(struct page *), - GFP_NOFS); - if (!*pages) - return -ENOMEM; - - *need_drop = 0; - p -= (*offs = offset_in_page(p)); - for (index = 0; index < nr_pages; index++) { - if (is_vmalloc_addr(p)) - (*pages)[index] = vmalloc_to_page(p); - else - (*pages)[index] = kmap_to_page(p); - p += PAGE_SIZE; - } - iov_iter_advance(data, len); - return len; + /* + * We allow only p9_max_pages pinned. 
We wait for the + * Other zc request to finish here + */ + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_killable(vp_wq, + (atomic_read(&vp_pinned) < chan->p9_max_pages)); + if (err == -ERESTARTSYS) + return err; } + + n = iov_iter_extract_pages(data, pages, count, offs, gup_flags); + if (n < 0) + return n; + *cleanup_mode = iov_iter_extract_mode(data, gup_flags); + nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); + atomic_add(nr_pages, &vp_pinned); + return n; } static void handle_rerror(struct p9_req_t *req, int in_hdr_len, @@ -431,7 +392,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct virtio_chan *chan = client->trans; struct scatterlist *sgs[4]; size_t offs; - int need_drop = 0; + int cleanup_mode = 0; int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -439,7 +400,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, if (uodata) { __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, - outlen, &offs, &need_drop, + outlen, &offs, &cleanup_mode, FOLL_DEST_BUF); if (n < 0) { err = n; @@ -459,7 +420,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, - inlen, &offs, &need_drop, + inlen, &offs, &cleanup_mode, FOLL_SOURCE_BUF); if (n < 0) { err = n; @@ -546,14 +507,14 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, * Non kernel buffers are pinned, unpin them */ err_out: - if (need_drop) { + if (cleanup_mode) { if (in_pages) { p9_release_pages(in_pages, in_nr_pages); - atomic_sub(in_nr_pages, &vp_pinned); + atomic_sub(in_nr_pages, &vp_pinned, cleanup_mode); } if (out_pages) { p9_release_pages(out_pages, out_nr_pages); - atomic_sub(out_nr_pages, &vp_pinned); + atomic_sub(out_nr_pages, &vp_pinned, cleanup_mode); } /* wakeup anybody waiting for slots to pin pages */ wake_up(&vp_wq);
Convert the 9p filesystem to use iov_iter_extract_pages() instead of iov_iter_get_pages(). This will pin pages or leave them unaltered rather than getting a ref on them as appropriate to the iterator. The pages need to be pinned for DIO-read rather than having refs taken on them to prevent VM copy-on-write from malfunctioning during a concurrent fork() (the result of the I/O would otherwise end up only visible to the child process and not the parent). Signed-off-by: David Howells <dhowells@redhat.com> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: v9fs-developer@lists.sourceforge.net --- net/9p/trans_common.c | 6 ++- net/9p/trans_common.h | 3 +- net/9p/trans_virtio.c | 89 ++++++++++++++----------------------------------- 3 files changed, 31 insertions(+), 67 deletions(-)