diff mbox series

[v2,16/19] netfs: Split fs/netfs/read_helper.c

Message ID 164678217075.1200972.5101072043126828757.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series netfs: Prep for write helpers | expand

Commit Message

David Howells March 8, 2022, 11:29 p.m. UTC
Split fs/netfs/read_helper.c into two pieces, one to deal with buffered
reads and one to deal with the I/O mechanism.

Changes
=======
ver #2)
 - Add kdoc reference to new file.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-cachefs@redhat.com

Link: https://lore.kernel.org/r/164623005586.3564931.6149556072728481767.stgit@warthog.procyon.org.uk/ # v1
---

 fs/netfs/Makefile        |    1 
 fs/netfs/buffered_read.c |  428 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/io.c            |  418 ---------------------------------------------
 3 files changed, 429 insertions(+), 418 deletions(-)
 create mode 100644 fs/netfs/buffered_read.c

Comments

Jeff Layton March 9, 2022, 8:27 p.m. UTC | #1
On Tue, 2022-03-08 at 23:29 +0000, David Howells wrote:
> Split fs/netfs/read_helper.c into two pieces, one to deal with buffered
> writes and one to deal with the I/O mechanism.
> 

I think you mean buffered reads here?

> Changes
> =======
> ver #2)
>  - Add kdoc reference to new file.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: linux-cachefs@redhat.com
> 
> Link: https://lore.kernel.org/r/164623005586.3564931.6149556072728481767.stgit@warthog.procyon.org.uk/ # v1
> ---
> 
>  fs/netfs/Makefile        |    1 
>  fs/netfs/buffered_read.c |  428 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/netfs/io.c            |  418 ---------------------------------------------
>  3 files changed, 429 insertions(+), 418 deletions(-)
>  create mode 100644 fs/netfs/buffered_read.c
> 
> diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
> index 51ece4f7bc77..88b904532bc7 100644
> --- a/fs/netfs/Makefile
> +++ b/fs/netfs/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  
>  netfs-y := \
> +	buffered_read.o \
>  	io.o \
>  	objects.o
>  
> diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
> new file mode 100644
> index 000000000000..09ba7097a970
> --- /dev/null
> +++ b/fs/netfs/buffered_read.c
> @@ -0,0 +1,428 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/* Network filesystem high-level buffered read support.
> + *
> + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
> + * Written by David Howells (dhowells@redhat.com)
> + */
> +
> +#include <linux/export.h>
> +#include <linux/task_io_accounting_ops.h>
> +#include "internal.h"
> +
> +/*
> + * Unlock the folios in a read operation.  We need to set PG_fscache on any
> + * folios we're going to write back before we unlock them.
> + */
> +void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
> +{
> +	struct netfs_io_subrequest *subreq;
> +	struct folio *folio;
> +	unsigned int iopos, account = 0;
> +	pgoff_t start_page = rreq->start / PAGE_SIZE;
> +	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
> +	bool subreq_failed = false;
> +
> +	XA_STATE(xas, &rreq->mapping->i_pages, start_page);
> +
> +	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
> +		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
> +		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
> +			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
> +		}
> +	}
> +
> +	/* Walk through the pagecache and the I/O request lists simultaneously.
> +	 * We may have a mixture of cached and uncached sections and we only
> +	 * really want to write out the uncached sections.  This is slightly
> +	 * complicated by the possibility that we might have huge pages with a
> +	 * mixture inside.
> +	 */
> +	subreq = list_first_entry(&rreq->subrequests,
> +				  struct netfs_io_subrequest, rreq_link);
> +	iopos = 0;
> +	subreq_failed = (subreq->error < 0);
> +
> +	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
> +
> +	rcu_read_lock();
> +	xas_for_each(&xas, folio, last_page) {
> +		unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
> +		unsigned int pgend = pgpos + folio_size(folio);
> +		bool pg_failed = false;
> +
> +		for (;;) {
> +			if (!subreq) {
> +				pg_failed = true;
> +				break;
> +			}
> +			if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
> +				folio_start_fscache(folio);
> +			pg_failed |= subreq_failed;
> +			if (pgend < iopos + subreq->len)
> +				break;
> +
> +			account += subreq->transferred;
> +			iopos += subreq->len;
> +			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
> +				subreq = list_next_entry(subreq, rreq_link);
> +				subreq_failed = (subreq->error < 0);
> +			} else {
> +				subreq = NULL;
> +				subreq_failed = false;
> +			}
> +			if (pgend == iopos)
> +				break;
> +		}
> +
> +		if (!pg_failed) {
> +			flush_dcache_folio(folio);
> +			folio_mark_uptodate(folio);
> +		}
> +
> +		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
> +			if (folio_index(folio) == rreq->no_unlock_folio &&
> +			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
> +				_debug("no unlock");
> +			else
> +				folio_unlock(folio);
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	task_io_account_read(account);
> +	if (rreq->netfs_ops->done)
> +		rreq->netfs_ops->done(rreq);
> +}
> +
> +static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
> +					 loff_t *_start, size_t *_len, loff_t i_size)
> +{
> +	struct netfs_cache_resources *cres = &rreq->cache_resources;
> +
> +	if (cres->ops && cres->ops->expand_readahead)
> +		cres->ops->expand_readahead(cres, _start, _len, i_size);
> +}
> +
> +static void netfs_rreq_expand(struct netfs_io_request *rreq,
> +			      struct readahead_control *ractl)
> +{
> +	/* Give the cache a chance to change the request parameters.  The
> +	 * resultant request must contain the original region.
> +	 */
> +	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
> +
> +	/* Give the netfs a chance to change the request parameters.  The
> +	 * resultant request must contain the original region.
> +	 */
> +	if (rreq->netfs_ops->expand_readahead)
> +		rreq->netfs_ops->expand_readahead(rreq);
> +
> +	/* Expand the request if the cache wants it to start earlier.  Note
> +	 * that the expansion may get further extended if the VM wishes to
> +	 * insert THPs and the preferred start and/or end wind up in the middle
> +	 * of THPs.
> +	 *
> +	 * If this is the case, however, the THP size should be an integer
> +	 * multiple of the cache granule size, so we get a whole number of
> +	 * granules to deal with.
> +	 */
> +	if (rreq->start  != readahead_pos(ractl) ||
> +	    rreq->len != readahead_length(ractl)) {
> +		readahead_expand(ractl, rreq->start, rreq->len);
> +		rreq->start  = readahead_pos(ractl);
> +		rreq->len = readahead_length(ractl);
> +
> +		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
> +				 netfs_read_trace_expanded);
> +	}
> +}
> +
> +/**
> + * netfs_readahead - Helper to manage a read request
> + * @ractl: The description of the readahead request
> + *
> + * Fulfil a readahead request by drawing data from the cache if possible, or
> + * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
> + * requests from different sources will get munged together.  If necessary, the
> + * readahead window can be expanded in either direction to a more convenient
> + * alighment for RPC efficiency or to make storage in the cache feasible.
> + *
> + * The calling netfs must initialise a netfs context contiguous to the vfs
> + * inode before calling this.
> + *
> + * This is usable whether or not caching is enabled.
> + */
> +void netfs_readahead(struct readahead_control *ractl)
> +{
> +	struct netfs_io_request *rreq;
> +	struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
> +	int ret;
> +
> +	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
> +
> +	if (readahead_count(ractl) == 0)
> +		return;
> +
> +	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
> +				   readahead_pos(ractl),
> +				   readahead_length(ractl),
> +				   NETFS_READAHEAD);
> +	if (IS_ERR(rreq))
> +		return;
> +
> +	if (ctx->ops->begin_cache_operation) {
> +		ret = ctx->ops->begin_cache_operation(rreq);
> +		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> +			goto cleanup_free;
> +	}
> +
> +	netfs_stat(&netfs_n_rh_readahead);
> +	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
> +			 netfs_read_trace_readahead);
> +
> +	netfs_rreq_expand(rreq, ractl);
> +
> +	/* Drop the refs on the folios here rather than in the cache or
> +	 * filesystem.  The locks will be dropped in netfs_rreq_unlock().
> +	 */
> +	while (readahead_folio(ractl))
> +		;
> +
> +	netfs_begin_read(rreq, false);
> +	return;
> +
> +cleanup_free:
> +	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
> +	return;
> +}
> +EXPORT_SYMBOL(netfs_readahead);
> +
> +/**
> + * netfs_readpage - Helper to manage a readpage request
> + * @file: The file to read from
> + * @subpage: A subpage of the folio to read
> + *
> + * Fulfil a readpage request by drawing data from the cache if possible, or the
> + * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
> + * from different sources will get munged together.
> + *
> + * The calling netfs must initialise a netfs context contiguous to the vfs
> + * inode before calling this.
> + *
> + * This is usable whether or not caching is enabled.
> + */
> +int netfs_readpage(struct file *file, struct page *subpage)
> +{
> +	struct folio *folio = page_folio(subpage);
> +	struct address_space *mapping = folio_file_mapping(folio);
> +	struct netfs_io_request *rreq;
> +	struct netfs_i_context *ctx = netfs_i_context(mapping->host);
> +	int ret;
> +
> +	_enter("%lx", folio_index(folio));
> +
> +	rreq = netfs_alloc_request(mapping, file,
> +				   folio_file_pos(folio), folio_size(folio),
> +				   NETFS_READPAGE);
> +	if (IS_ERR(rreq)) {
> +		ret = PTR_ERR(rreq);
> +		goto alloc_error;
> +	}
> +
> +	if (ctx->ops->begin_cache_operation) {
> +		ret = ctx->ops->begin_cache_operation(rreq);
> +		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> +			goto discard;
> +	}
> +
> +	netfs_stat(&netfs_n_rh_readpage);
> +	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
> +	return netfs_begin_read(rreq, true);
> +
> +discard:
> +	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
> +alloc_error:
> +	folio_unlock(folio);
> +	return ret;
> +}
> +EXPORT_SYMBOL(netfs_readpage);
> +
> +/*
> + * Prepare a folio for writing without reading first
> + * @folio: The folio being prepared
> + * @pos: starting position for the write
> + * @len: length of write
> + * @always_fill: T if the folio should always be completely filled/cleared
> + *
> + * In some cases, write_begin doesn't need to read at all:
> + * - full folio write
> + * - write that lies in a folio that is completely beyond EOF
> + * - write that covers the folio from start to EOF or beyond it
> + *
> + * If any of these criteria are met, then zero out the unwritten parts
> + * of the folio and return true. Otherwise, return false.
> + */
> +static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
> +				 bool always_fill)
> +{
> +	struct inode *inode = folio_inode(folio);
> +	loff_t i_size = i_size_read(inode);
> +	size_t offset = offset_in_folio(folio, pos);
> +	size_t plen = folio_size(folio);
> +
> +	if (unlikely(always_fill)) {
> +		if (pos - offset + len <= i_size)
> +			return false; /* Page entirely before EOF */
> +		zero_user_segment(&folio->page, 0, plen);
> +		folio_mark_uptodate(folio);
> +		return true;
> +	}
> +
> +	/* Full folio write */
> +	if (offset == 0 && len >= plen)
> +		return true;
> +
> +	/* Page entirely beyond the end of the file */
> +	if (pos - offset >= i_size)
> +		goto zero_out;
> +
> +	/* Write that covers from the start of the folio to EOF or beyond */
> +	if (offset == 0 && (pos + len) >= i_size)
> +		goto zero_out;
> +
> +	return false;
> +zero_out:
> +	zero_user_segments(&folio->page, 0, offset, offset + len, len);
> +	return true;
> +}
> +
> +/**
> + * netfs_write_begin - Helper to prepare for writing
> + * @file: The file to read from
> + * @mapping: The mapping to read from
> + * @pos: File position at which the write will begin
> + * @len: The length of the write (may extend beyond the end of the folio chosen)
> + * @aop_flags: AOP_* flags
> + * @_folio: Where to put the resultant folio
> + * @_fsdata: Place for the netfs to store a cookie
> + *
> + * Pre-read data for a write-begin request by drawing data from the cache if
> + * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
> + * Multiple I/O requests from different sources will get munged together.  If
> + * necessary, the readahead window can be expanded in either direction to a
> + * more convenient alighment for RPC efficiency or to make storage in the cache
> + * feasible.
> + *
> + * The calling netfs must provide a table of operations, only one of which,
> + * issue_op, is mandatory.
> + *
> + * The check_write_begin() operation can be provided to check for and flush
> + * conflicting writes once the folio is grabbed and locked.  It is passed a
> + * pointer to the fsdata cookie that gets returned to the VM to be passed to
> + * write_end.  It is permitted to sleep.  It should return 0 if the request
> + * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
> + * be regot; or return an error.
> + *
> + * The calling netfs must initialise a netfs context contiguous to the vfs
> + * inode before calling this.
> + *
> + * This is usable whether or not caching is enabled.
> + */
> +int netfs_write_begin(struct file *file, struct address_space *mapping,
> +		      loff_t pos, unsigned int len, unsigned int aop_flags,
> +		      struct folio **_folio, void **_fsdata)
> +{
> +	struct netfs_io_request *rreq;
> +	struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
> +	struct folio *folio;
> +	unsigned int fgp_flags;
> +	pgoff_t index = pos >> PAGE_SHIFT;
> +	int ret;
> +
> +	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
> +
> +retry:
> +	fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
> +	if (aop_flags & AOP_FLAG_NOFS)
> +		fgp_flags |= FGP_NOFS;
> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
> +				    mapping_gfp_mask(mapping));
> +	if (!folio)
> +		return -ENOMEM;
> +
> +	if (ctx->ops->check_write_begin) {
> +		/* Allow the netfs (eg. ceph) to flush conflicts. */
> +		ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
> +		if (ret < 0) {
> +			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
> +			if (ret == -EAGAIN)
> +				goto retry;
> +			goto error;
> +		}
> +	}
> +
> +	if (folio_test_uptodate(folio))
> +		goto have_folio;
> +
> +	/* If the page is beyond the EOF, we want to clear it - unless it's
> +	 * within the cache granule containing the EOF, in which case we need
> +	 * to preload the granule.
> +	 */
> +	if (!netfs_is_cache_enabled(ctx) &&
> +	    netfs_skip_folio_read(folio, pos, len, false)) {
> +		netfs_stat(&netfs_n_rh_write_zskip);
> +		goto have_folio_no_wait;
> +	}
> +
> +	rreq = netfs_alloc_request(mapping, file,
> +				   folio_file_pos(folio), folio_size(folio),
> +				   NETFS_READ_FOR_WRITE);
> +	if (IS_ERR(rreq)) {
> +		ret = PTR_ERR(rreq);
> +		goto error;
> +	}
> +	rreq->no_unlock_folio	= folio_index(folio);
> +	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
> +
> +	if (ctx->ops->begin_cache_operation) {
> +		ret = ctx->ops->begin_cache_operation(rreq);
> +		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> +			goto error_put;
> +	}
> +
> +	netfs_stat(&netfs_n_rh_write_begin);
> +	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
> +
> +	/* Expand the request to meet caching requirements and download
> +	 * preferences.
> +	 */
> +	ractl._nr_pages = folio_nr_pages(folio);
> +	netfs_rreq_expand(rreq, &ractl);
> +
> +	/* We hold the folio locks, so we can drop the references */
> +	folio_get(folio);
> +	while (readahead_folio(&ractl))
> +		;
> +
> +	ret = netfs_begin_read(rreq, true);
> +	if (ret < 0)
> +		goto error;
> +
> +have_folio:
> +	ret = folio_wait_fscache_killable(folio);
> +	if (ret < 0)
> +		goto error;
> +have_folio_no_wait:
> +	*_folio = folio;
> +	_leave(" = 0");
> +	return 0;
> +
> +error_put:
> +	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
> +error:
> +	folio_unlock(folio);
> +	folio_put(folio);
> +	_leave(" = %d", ret);
> +	return ret;
> +}
> +EXPORT_SYMBOL(netfs_write_begin);
> diff --git a/fs/netfs/io.c b/fs/netfs/io.c
> index 058a534ba917..1fe9706c58a5 100644
> --- a/fs/netfs/io.c
> +++ b/fs/netfs/io.c
> @@ -246,91 +246,6 @@ static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
>  		BUG();
>  }
>  
> -/*
> - * Unlock the folios in a read operation.  We need to set PG_fscache on any
> - * folios we're going to write back before we unlock them.
> - */
> -void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
> -{
> -	struct netfs_io_subrequest *subreq;
> -	struct folio *folio;
> -	unsigned int iopos, account = 0;
> -	pgoff_t start_page = rreq->start / PAGE_SIZE;
> -	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
> -	bool subreq_failed = false;
> -
> -	XA_STATE(xas, &rreq->mapping->i_pages, start_page);
> -
> -	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
> -		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
> -		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
> -			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
> -		}
> -	}
> -
> -	/* Walk through the pagecache and the I/O request lists simultaneously.
> -	 * We may have a mixture of cached and uncached sections and we only
> -	 * really want to write out the uncached sections.  This is slightly
> -	 * complicated by the possibility that we might have huge pages with a
> -	 * mixture inside.
> -	 */
> -	subreq = list_first_entry(&rreq->subrequests,
> -				  struct netfs_io_subrequest, rreq_link);
> -	iopos = 0;
> -	subreq_failed = (subreq->error < 0);
> -
> -	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
> -
> -	rcu_read_lock();
> -	xas_for_each(&xas, folio, last_page) {
> -		unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
> -		unsigned int pgend = pgpos + folio_size(folio);
> -		bool pg_failed = false;
> -
> -		for (;;) {
> -			if (!subreq) {
> -				pg_failed = true;
> -				break;
> -			}
> -			if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
> -				folio_start_fscache(folio);
> -			pg_failed |= subreq_failed;
> -			if (pgend < iopos + subreq->len)
> -				break;
> -
> -			account += subreq->transferred;
> -			iopos += subreq->len;
> -			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
> -				subreq = list_next_entry(subreq, rreq_link);
> -				subreq_failed = (subreq->error < 0);
> -			} else {
> -				subreq = NULL;
> -				subreq_failed = false;
> -			}
> -			if (pgend == iopos)
> -				break;
> -		}
> -
> -		if (!pg_failed) {
> -			flush_dcache_folio(folio);
> -			folio_mark_uptodate(folio);
> -		}
> -
> -		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
> -			if (folio_index(folio) == rreq->no_unlock_folio &&
> -			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
> -				_debug("no unlock");
> -			else
> -				folio_unlock(folio);
> -		}
> -	}
> -	rcu_read_unlock();
> -
> -	task_io_account_read(account);
> -	if (rreq->netfs_ops->done)
> -		rreq->netfs_ops->done(rreq);
> -}
> -
>  /*
>   * Handle a short read.
>   */
> @@ -750,336 +665,3 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
>  	}
>  	return ret;
>  }
> -
> -static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
> -					 loff_t *_start, size_t *_len, loff_t i_size)
> -{
> -	struct netfs_cache_resources *cres = &rreq->cache_resources;
> -
> -	if (cres->ops && cres->ops->expand_readahead)
> -		cres->ops->expand_readahead(cres, _start, _len, i_size);
> -}
> -
> -static void netfs_rreq_expand(struct netfs_io_request *rreq,
> -			      struct readahead_control *ractl)
> -{
> -	/* Give the cache a chance to change the request parameters.  The
> -	 * resultant request must contain the original region.
> -	 */
> -	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
> -
> -	/* Give the netfs a chance to change the request parameters.  The
> -	 * resultant request must contain the original region.
> -	 */
> -	if (rreq->netfs_ops->expand_readahead)
> -		rreq->netfs_ops->expand_readahead(rreq);
> -
> -	/* Expand the request if the cache wants it to start earlier.  Note
> -	 * that the expansion may get further extended if the VM wishes to
> -	 * insert THPs and the preferred start and/or end wind up in the middle
> -	 * of THPs.
> -	 *
> -	 * If this is the case, however, the THP size should be an integer
> -	 * multiple of the cache granule size, so we get a whole number of
> -	 * granules to deal with.
> -	 */
> -	if (rreq->start  != readahead_pos(ractl) ||
> -	    rreq->len != readahead_length(ractl)) {
> -		readahead_expand(ractl, rreq->start, rreq->len);
> -		rreq->start  = readahead_pos(ractl);
> -		rreq->len = readahead_length(ractl);
> -
> -		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
> -				 netfs_read_trace_expanded);
> -	}
> -}
> -
> -/**
> - * netfs_readahead - Helper to manage a read request
> - * @ractl: The description of the readahead request
> - *
> - * Fulfil a readahead request by drawing data from the cache if possible, or
> - * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
> - * requests from different sources will get munged together.  If necessary, the
> - * readahead window can be expanded in either direction to a more convenient
> - * alighment for RPC efficiency or to make storage in the cache feasible.
> - *
> - * The calling netfs must initialise a netfs context contiguous to the vfs
> - * inode before calling this.
> - *
> - * This is usable whether or not caching is enabled.
> - */
> -void netfs_readahead(struct readahead_control *ractl)
> -{
> -	struct netfs_io_request *rreq;
> -	struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
> -	int ret;
> -
> -	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
> -
> -	if (readahead_count(ractl) == 0)
> -		return;
> -
> -	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
> -				   readahead_pos(ractl),
> -				   readahead_length(ractl),
> -				   NETFS_READAHEAD);
> -	if (IS_ERR(rreq))
> -		return;
> -
> -	if (ctx->ops->begin_cache_operation) {
> -		ret = ctx->ops->begin_cache_operation(rreq);
> -		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> -			goto cleanup_free;
> -	}
> -
> -	netfs_stat(&netfs_n_rh_readahead);
> -	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
> -			 netfs_read_trace_readahead);
> -
> -	netfs_rreq_expand(rreq, ractl);
> -
> -	/* Drop the refs on the folios here rather than in the cache or
> -	 * filesystem.  The locks will be dropped in netfs_rreq_unlock().
> -	 */
> -	while (readahead_folio(ractl))
> -		;
> -
> -	netfs_begin_read(rreq, false);
> -	return;
> -
> -cleanup_free:
> -	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
> -	return;
> -}
> -EXPORT_SYMBOL(netfs_readahead);
> -
> -/**
> - * netfs_readpage - Helper to manage a readpage request
> - * @file: The file to read from
> - * @subpage: A subpage of the folio to read
> - *
> - * Fulfil a readpage request by drawing data from the cache if possible, or the
> - * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
> - * from different sources will get munged together.
> - *
> - * The calling netfs must initialise a netfs context contiguous to the vfs
> - * inode before calling this.
> - *
> - * This is usable whether or not caching is enabled.
> - */
> -int netfs_readpage(struct file *file, struct page *subpage)
> -{
> -	struct folio *folio = page_folio(subpage);
> -	struct address_space *mapping = folio->mapping;
> -	struct netfs_io_request *rreq;
> -	struct netfs_i_context *ctx = netfs_i_context(mapping->host);
> -	int ret;
> -
> -	_enter("%lx", folio_index(folio));
> -
> -	rreq = netfs_alloc_request(mapping, file,
> -				   folio_file_pos(folio), folio_size(folio),
> -				   NETFS_READPAGE);
> -	if (IS_ERR(rreq)) {
> -		ret = PTR_ERR(rreq);
> -		goto alloc_error;
> -	}
> -
> -	if (ctx->ops->begin_cache_operation) {
> -		ret = ctx->ops->begin_cache_operation(rreq);
> -		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> -			goto discard;
> -	}
> -
> -	netfs_stat(&netfs_n_rh_readpage);
> -	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
> -	return netfs_begin_read(rreq, true);
> -
> -discard:
> -	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
> -alloc_error:
> -	folio_unlock(folio);
> -	return ret;
> -}
> -EXPORT_SYMBOL(netfs_readpage);
> -
> -/*
> - * Prepare a folio for writing without reading first
> - * @folio: The folio being prepared
> - * @pos: starting position for the write
> - * @len: length of write
> - * @always_fill: T if the folio should always be completely filled/cleared
> - *
> - * In some cases, write_begin doesn't need to read at all:
> - * - full folio write
> - * - write that lies in a folio that is completely beyond EOF
> - * - write that covers the folio from start to EOF or beyond it
> - *
> - * If any of these criteria are met, then zero out the unwritten parts
> - * of the folio and return true. Otherwise, return false.
> - */
> -static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
> -				 bool always_fill)
> -{
> -	struct inode *inode = folio_inode(folio);
> -	loff_t i_size = i_size_read(inode);
> -	size_t offset = offset_in_folio(folio, pos);
> -	size_t plen = folio_size(folio);
> -
> -	if (unlikely(always_fill)) {
> -		if (pos - offset + len <= i_size)
> -			return false; /* Page entirely before EOF */
> -		zero_user_segment(&folio->page, 0, plen);
> -		folio_mark_uptodate(folio);
> -		return true;
> -	}
> -
> -	/* Full folio write */
> -	if (offset == 0 && len >= plen)
> -		return true;
> -
> -	/* Page entirely beyond the end of the file */
> -	if (pos - offset >= i_size)
> -		goto zero_out;
> -
> -	/* Write that covers from the start of the folio to EOF or beyond */
> -	if (offset == 0 && (pos + len) >= i_size)
> -		goto zero_out;
> -
> -	return false;
> -zero_out:
> -	zero_user_segments(&folio->page, 0, offset, offset + len, len);
> -	return true;
> -}
> -
> -/**
> - * netfs_write_begin - Helper to prepare for writing
> - * @file: The file to read from
> - * @mapping: The mapping to read from
> - * @pos: File position at which the write will begin
> - * @len: The length of the write (may extend beyond the end of the folio chosen)
> - * @aop_flags: AOP_* flags
> - * @_folio: Where to put the resultant folio
> - * @_fsdata: Place for the netfs to store a cookie
> - *
> - * Pre-read data for a write-begin request by drawing data from the cache if
> - * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
> - * Multiple I/O requests from different sources will get munged together.  If
> - * necessary, the readahead window can be expanded in either direction to a
> - * more convenient alighment for RPC efficiency or to make storage in the cache
> - * feasible.
> - *
> - * The calling netfs must provide a table of operations, only one of which,
> - * issue_op, is mandatory.
> - *
> - * The check_write_begin() operation can be provided to check for and flush
> - * conflicting writes once the folio is grabbed and locked.  It is passed a
> - * pointer to the fsdata cookie that gets returned to the VM to be passed to
> - * write_end.  It is permitted to sleep.  It should return 0 if the request
> - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
> - * be regot; or return an error.
> - *
> - * The calling netfs must initialise a netfs context contiguous to the vfs
> - * inode before calling this.
> - *
> - * This is usable whether or not caching is enabled.
> - */
> -int netfs_write_begin(struct file *file, struct address_space *mapping,
> -		      loff_t pos, unsigned int len, unsigned int aop_flags,
> -		      struct folio **_folio, void **_fsdata)
> -{
> -	struct netfs_io_request *rreq;
> -	struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
> -	struct folio *folio;
> -	unsigned int fgp_flags;
> -	pgoff_t index = pos >> PAGE_SHIFT;
> -	int ret;
> -
> -	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
> -
> -retry:
> -	fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
> -	if (aop_flags & AOP_FLAG_NOFS)
> -		fgp_flags |= FGP_NOFS;
> -	folio = __filemap_get_folio(mapping, index, fgp_flags,
> -				    mapping_gfp_mask(mapping));
> -	if (!folio)
> -		return -ENOMEM;
> -
> -	if (ctx->ops->check_write_begin) {
> -		/* Allow the netfs (eg. ceph) to flush conflicts. */
> -		ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
> -		if (ret < 0) {
> -			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
> -			if (ret == -EAGAIN)
> -				goto retry;
> -			goto error;
> -		}
> -	}
> -
> -	if (folio_test_uptodate(folio))
> -		goto have_folio;
> -
> -	/* If the page is beyond the EOF, we want to clear it - unless it's
> -	 * within the cache granule containing the EOF, in which case we need
> -	 * to preload the granule.
> -	 */
> -	if (!netfs_is_cache_enabled(ctx) &&
> -	    netfs_skip_folio_read(folio, pos, len, false)) {
> -		netfs_stat(&netfs_n_rh_write_zskip);
> -		goto have_folio_no_wait;
> -	}
> -
> -	rreq = netfs_alloc_request(mapping, file,
> -				   folio_file_pos(folio), folio_size(folio),
> -				   NETFS_READ_FOR_WRITE);
> -	if (IS_ERR(rreq)) {
> -		ret = PTR_ERR(rreq);
> -		goto error;
> -	}
> -	rreq->no_unlock_folio	= folio_index(folio);
> -	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
> -
> -	if (ctx->ops->begin_cache_operation) {
> -		ret = ctx->ops->begin_cache_operation(rreq);
> -		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
> -			goto error_put;
> -	}
> -
> -	netfs_stat(&netfs_n_rh_write_begin);
> -	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
> -
> -	/* Expand the request to meet caching requirements and download
> -	 * preferences.
> -	 */
> -	ractl._nr_pages = folio_nr_pages(folio);
> -	netfs_rreq_expand(rreq, &ractl);
> -
> -	/* We hold the folio locks, so we can drop the references */
> -	folio_get(folio);
> -	while (readahead_folio(&ractl))
> -		;
> -
> -	ret = netfs_begin_read(rreq, true);
> -	if (ret < 0)
> -		goto error;
> -
> -have_folio:
> -	ret = folio_wait_fscache_killable(folio);
> -	if (ret < 0)
> -		goto error;
> -have_folio_no_wait:
> -	*_folio = folio;
> -	_leave(" = 0");
> -	return 0;
> -
> -error_put:
> -	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
> -error:
> -	folio_unlock(folio);
> -	folio_put(folio);
> -	_leave(" = %d", ret);
> -	return ret;
> -}
> -EXPORT_SYMBOL(netfs_write_begin);
> 
> 

Patch itself is fine though.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
diff mbox series

Patch

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 51ece4f7bc77..88b904532bc7 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0
 
 netfs-y := \
+	buffered_read.o \
 	io.o \
 	objects.o
 
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
new file mode 100644
index 000000000000..09ba7097a970
--- /dev/null
+++ b/fs/netfs/buffered_read.c
@@ -0,0 +1,428 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level buffered read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Unlock the folios in a read operation.  We need to set PG_fscache on any
+ * folios we're going to write back before we unlock them.
+ */
+void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+	struct netfs_io_subrequest *subreq;
+	struct folio *folio;
+	unsigned int iopos, account = 0;
+	pgoff_t start_page = rreq->start / PAGE_SIZE;
+	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+	bool subreq_failed = false;
+
+	XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
+		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
+		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+		}
+	}
+
+	/* Walk through the pagecache and the I/O request lists simultaneously.
+	 * We may have a mixture of cached and uncached sections and we only
+	 * really want to write out the uncached sections.  This is slightly
+	 * complicated by the possibility that we might have huge pages with a
+	 * mixture inside.
+	 */
+	subreq = list_first_entry(&rreq->subrequests,
+				  struct netfs_io_subrequest, rreq_link);
+	iopos = 0;
+	subreq_failed = (subreq->error < 0);
+
+	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, last_page) {
+		unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
+		unsigned int pgend = pgpos + folio_size(folio);
+		bool pg_failed = false;
+
+		for (;;) {
+			if (!subreq) {
+				pg_failed = true;
+				break;
+			}
+			if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+				folio_start_fscache(folio);
+			pg_failed |= subreq_failed;
+			if (pgend < iopos + subreq->len)
+				break;
+
+			account += subreq->transferred;
+			iopos += subreq->len;
+			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+				subreq = list_next_entry(subreq, rreq_link);
+				subreq_failed = (subreq->error < 0);
+			} else {
+				subreq = NULL;
+				subreq_failed = false;
+			}
+			if (pgend == iopos)
+				break;
+		}
+
+		if (!pg_failed) {
+			flush_dcache_folio(folio);
+			folio_mark_uptodate(folio);
+		}
+
+		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+			if (folio_index(folio) == rreq->no_unlock_folio &&
+			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
+				_debug("no unlock");
+			else
+				folio_unlock(folio);
+		}
+	}
+	rcu_read_unlock();
+
+	task_io_account_read(account);
+	if (rreq->netfs_ops->done)
+		rreq->netfs_ops->done(rreq);
+}
+
+static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
+					 loff_t *_start, size_t *_len, loff_t i_size)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+	if (cres->ops && cres->ops->expand_readahead)
+		cres->ops->expand_readahead(cres, _start, _len, i_size);
+}
+
+static void netfs_rreq_expand(struct netfs_io_request *rreq,
+			      struct readahead_control *ractl)
+{
+	/* Give the cache a chance to change the request parameters.  The
+	 * resultant request must contain the original region.
+	 */
+	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
+
+	/* Give the netfs a chance to change the request parameters.  The
+	 * resultant request must contain the original region.
+	 */
+	if (rreq->netfs_ops->expand_readahead)
+		rreq->netfs_ops->expand_readahead(rreq);
+
+	/* Expand the request if the cache wants it to start earlier.  Note
+	 * that the expansion may get further extended if the VM wishes to
+	 * insert THPs and the preferred start and/or end wind up in the middle
+	 * of THPs.
+	 *
+	 * If this is the case, however, the THP size should be an integer
+	 * multiple of the cache granule size, so we get a whole number of
+	 * granules to deal with.
+	 */
+	if (rreq->start  != readahead_pos(ractl) ||
+	    rreq->len != readahead_length(ractl)) {
+		readahead_expand(ractl, rreq->start, rreq->len);
+		rreq->start  = readahead_pos(ractl);
+		rreq->len = readahead_length(ractl);
+
+		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+				 netfs_read_trace_expanded);
+	}
+}
+
+/**
+ * netfs_readahead - Helper to manage a read request
+ * @ractl: The description of the readahead request
+ *
+ * Fulfil a readahead request by drawing data from the cache if possible, or
+ * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
+ * requests from different sources will get munged together.  If necessary, the
+ * readahead window can be expanded in either direction to a more convenient
+ * alignment for RPC efficiency or to make storage in the cache feasible.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+void netfs_readahead(struct readahead_control *ractl)
+{
+	struct netfs_io_request *rreq;
+	struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+	int ret;
+
+	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+
+	if (readahead_count(ractl) == 0)
+		return;
+
+	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
+				   readahead_pos(ractl),
+				   readahead_length(ractl),
+				   NETFS_READAHEAD);
+	if (IS_ERR(rreq))
+		return;
+
+	if (ctx->ops->begin_cache_operation) {
+		ret = ctx->ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+			goto cleanup_free;
+	}
+
+	netfs_stat(&netfs_n_rh_readahead);
+	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+			 netfs_read_trace_readahead);
+
+	netfs_rreq_expand(rreq, ractl);
+
+	/* Drop the refs on the folios here rather than in the cache or
+	 * filesystem.  The locks will be dropped in netfs_rreq_unlock_folios().
+	 */
+	while (readahead_folio(ractl))
+		;
+
+	netfs_begin_read(rreq, false);
+	return;
+
+cleanup_free:
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+	return;
+}
+EXPORT_SYMBOL(netfs_readahead);
+
+/**
+ * netfs_readpage - Helper to manage a readpage request
+ * @file: The file to read from
+ * @subpage: A subpage of the folio to read
+ *
+ * Fulfil a readpage request by drawing data from the cache if possible, or the
+ * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
+ * from different sources will get munged together.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_readpage(struct file *file, struct page *subpage)
+{
+	struct folio *folio = page_folio(subpage);
+	struct address_space *mapping = folio_file_mapping(folio);
+	struct netfs_io_request *rreq;
+	struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+	int ret;
+
+	_enter("%lx", folio_index(folio));
+
+	rreq = netfs_alloc_request(mapping, file,
+				   folio_file_pos(folio), folio_size(folio),
+				   NETFS_READPAGE);
+	if (IS_ERR(rreq)) {
+		ret = PTR_ERR(rreq);
+		goto alloc_error;
+	}
+
+	if (ctx->ops->begin_cache_operation) {
+		ret = ctx->ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+			goto discard;
+	}
+
+	netfs_stat(&netfs_n_rh_readpage);
+	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+	return netfs_begin_read(rreq, true);
+
+discard:
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+alloc_error:
+	folio_unlock(folio);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_readpage);
+
+/*
+ * Prepare a folio for writing without reading first
+ * @folio: The folio being prepared
+ * @pos: starting position for the write
+ * @len: length of write
+ * @always_fill: T if the folio should always be completely filled/cleared
+ *
+ * In some cases, write_begin doesn't need to read at all:
+ * - full folio write
+ * - write that lies in a folio that is completely beyond EOF
+ * - write that covers the folio from start to EOF or beyond it
+ *
+ * If any of these criteria are met, then zero out the unwritten parts of the
+ * folio (head and tail around the write) and return true, else return false.
+ */
+static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
+				 bool always_fill)
+{
+	struct inode *inode = folio_inode(folio);
+	loff_t i_size = i_size_read(inode);
+	size_t offset = offset_in_folio(folio, pos);
+	size_t plen = folio_size(folio);
+
+	if (unlikely(always_fill)) {
+		if (pos - offset + len <= i_size)
+			return false; /* Page entirely before EOF */
+		zero_user_segment(&folio->page, 0, plen);
+		folio_mark_uptodate(folio);
+		return true;
+	}
+
+	/* Full folio write */
+	if (offset == 0 && len >= plen)
+		return true;
+
+	/* Page entirely beyond the end of the file */
+	if (pos - offset >= i_size)
+		goto zero_out;
+
+	/* Write that covers from the start of the folio to EOF or beyond */
+	if (offset == 0 && (pos + len) >= i_size)
+		goto zero_out;
+
+	return false;
+zero_out:
+	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+	return true;
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write (may extend beyond the end of the folio chosen)
+ * @aop_flags: AOP_* flags
+ * @_folio: Where to put the resultant folio
+ * @_fsdata: Place for the netfs to store a cookie
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.  If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the folio is grabbed and locked.  It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end.  It is permitted to sleep.  It should return 0 if the request
+ * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
+ * be regot; or return an error.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned int len, unsigned int aop_flags,
+		      struct folio **_folio, void **_fsdata)
+{
+	struct netfs_io_request *rreq;
+	struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
+	struct folio *folio;
+	unsigned int fgp_flags;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	int ret;
+
+	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
+
+retry:
+	fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+	if (aop_flags & AOP_FLAG_NOFS)
+		fgp_flags |= FGP_NOFS;
+	folio = __filemap_get_folio(mapping, index, fgp_flags,
+				    mapping_gfp_mask(mapping));
+	if (!folio)
+		return -ENOMEM;
+
+	if (ctx->ops->check_write_begin) {
+		/* Allow the netfs (eg. ceph) to flush conflicts. */
+		ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+		if (ret < 0) {
+			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
+			if (ret == -EAGAIN)
+				goto retry;
+			goto error;
+		}
+	}
+
+	if (folio_test_uptodate(folio))
+		goto have_folio;
+
+	/* If the page is beyond the EOF, we want to clear it - unless it's
+	 * within the cache granule containing the EOF, in which case we need
+	 * to preload the granule.
+	 */
+	if (!netfs_is_cache_enabled(ctx) &&
+	    netfs_skip_folio_read(folio, pos, len, false)) {
+		netfs_stat(&netfs_n_rh_write_zskip);
+		goto have_folio_no_wait;
+	}
+
+	rreq = netfs_alloc_request(mapping, file,
+				   folio_file_pos(folio), folio_size(folio),
+				   NETFS_READ_FOR_WRITE);
+	if (IS_ERR(rreq)) {
+		ret = PTR_ERR(rreq);
+		goto error;
+	}
+	rreq->no_unlock_folio	= folio_index(folio);
+	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+
+	if (ctx->ops->begin_cache_operation) {
+		ret = ctx->ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+			goto error_put;
+	}
+
+	netfs_stat(&netfs_n_rh_write_begin);
+	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
+
+	/* Expand the request to meet caching requirements and download
+	 * preferences.
+	 */
+	ractl._nr_pages = folio_nr_pages(folio);
+	netfs_rreq_expand(rreq, &ractl);
+
+	/* We hold the folio locks, so we can drop the references */
+	folio_get(folio);
+	while (readahead_folio(&ractl))
+		;
+
+	ret = netfs_begin_read(rreq, true);
+	if (ret < 0)
+		goto error;
+
+have_folio:
+	ret = folio_wait_fscache_killable(folio);
+	if (ret < 0)
+		goto error;
+have_folio_no_wait:
+	*_folio = folio;
+	_leave(" = 0");
+	return 0;
+
+error_put:
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+error:
+	folio_unlock(folio);
+	folio_put(folio);
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_write_begin);
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 058a534ba917..1fe9706c58a5 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -246,91 +246,6 @@  static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
 		BUG();
 }
 
-/*
- * Unlock the folios in a read operation.  We need to set PG_fscache on any
- * folios we're going to write back before we unlock them.
- */
-void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
-{
-	struct netfs_io_subrequest *subreq;
-	struct folio *folio;
-	unsigned int iopos, account = 0;
-	pgoff_t start_page = rreq->start / PAGE_SIZE;
-	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
-	bool subreq_failed = false;
-
-	XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
-	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
-		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
-		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
-			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
-		}
-	}
-
-	/* Walk through the pagecache and the I/O request lists simultaneously.
-	 * We may have a mixture of cached and uncached sections and we only
-	 * really want to write out the uncached sections.  This is slightly
-	 * complicated by the possibility that we might have huge pages with a
-	 * mixture inside.
-	 */
-	subreq = list_first_entry(&rreq->subrequests,
-				  struct netfs_io_subrequest, rreq_link);
-	iopos = 0;
-	subreq_failed = (subreq->error < 0);
-
-	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
-
-	rcu_read_lock();
-	xas_for_each(&xas, folio, last_page) {
-		unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
-		unsigned int pgend = pgpos + folio_size(folio);
-		bool pg_failed = false;
-
-		for (;;) {
-			if (!subreq) {
-				pg_failed = true;
-				break;
-			}
-			if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
-				folio_start_fscache(folio);
-			pg_failed |= subreq_failed;
-			if (pgend < iopos + subreq->len)
-				break;
-
-			account += subreq->transferred;
-			iopos += subreq->len;
-			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
-				subreq = list_next_entry(subreq, rreq_link);
-				subreq_failed = (subreq->error < 0);
-			} else {
-				subreq = NULL;
-				subreq_failed = false;
-			}
-			if (pgend == iopos)
-				break;
-		}
-
-		if (!pg_failed) {
-			flush_dcache_folio(folio);
-			folio_mark_uptodate(folio);
-		}
-
-		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
-			if (folio_index(folio) == rreq->no_unlock_folio &&
-			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
-				_debug("no unlock");
-			else
-				folio_unlock(folio);
-		}
-	}
-	rcu_read_unlock();
-
-	task_io_account_read(account);
-	if (rreq->netfs_ops->done)
-		rreq->netfs_ops->done(rreq);
-}
-
 /*
  * Handle a short read.
  */
@@ -750,336 +665,3 @@  int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 	}
 	return ret;
 }
-
-static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
-					 loff_t *_start, size_t *_len, loff_t i_size)
-{
-	struct netfs_cache_resources *cres = &rreq->cache_resources;
-
-	if (cres->ops && cres->ops->expand_readahead)
-		cres->ops->expand_readahead(cres, _start, _len, i_size);
-}
-
-static void netfs_rreq_expand(struct netfs_io_request *rreq,
-			      struct readahead_control *ractl)
-{
-	/* Give the cache a chance to change the request parameters.  The
-	 * resultant request must contain the original region.
-	 */
-	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
-
-	/* Give the netfs a chance to change the request parameters.  The
-	 * resultant request must contain the original region.
-	 */
-	if (rreq->netfs_ops->expand_readahead)
-		rreq->netfs_ops->expand_readahead(rreq);
-
-	/* Expand the request if the cache wants it to start earlier.  Note
-	 * that the expansion may get further extended if the VM wishes to
-	 * insert THPs and the preferred start and/or end wind up in the middle
-	 * of THPs.
-	 *
-	 * If this is the case, however, the THP size should be an integer
-	 * multiple of the cache granule size, so we get a whole number of
-	 * granules to deal with.
-	 */
-	if (rreq->start  != readahead_pos(ractl) ||
-	    rreq->len != readahead_length(ractl)) {
-		readahead_expand(ractl, rreq->start, rreq->len);
-		rreq->start  = readahead_pos(ractl);
-		rreq->len = readahead_length(ractl);
-
-		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
-				 netfs_read_trace_expanded);
-	}
-}
-
-/**
- * netfs_readahead - Helper to manage a read request
- * @ractl: The description of the readahead request
- *
- * Fulfil a readahead request by drawing data from the cache if possible, or
- * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
- * requests from different sources will get munged together.  If necessary, the
- * readahead window can be expanded in either direction to a more convenient
- * alighment for RPC efficiency or to make storage in the cache feasible.
- *
- * The calling netfs must initialise a netfs context contiguous to the vfs
- * inode before calling this.
- *
- * This is usable whether or not caching is enabled.
- */
-void netfs_readahead(struct readahead_control *ractl)
-{
-	struct netfs_io_request *rreq;
-	struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
-	int ret;
-
-	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
-
-	if (readahead_count(ractl) == 0)
-		return;
-
-	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
-				   readahead_pos(ractl),
-				   readahead_length(ractl),
-				   NETFS_READAHEAD);
-	if (IS_ERR(rreq))
-		return;
-
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto cleanup_free;
-	}
-
-	netfs_stat(&netfs_n_rh_readahead);
-	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
-			 netfs_read_trace_readahead);
-
-	netfs_rreq_expand(rreq, ractl);
-
-	/* Drop the refs on the folios here rather than in the cache or
-	 * filesystem.  The locks will be dropped in netfs_rreq_unlock().
-	 */
-	while (readahead_folio(ractl))
-		;
-
-	netfs_begin_read(rreq, false);
-	return;
-
-cleanup_free:
-	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
-	return;
-}
-EXPORT_SYMBOL(netfs_readahead);
-
-/**
- * netfs_readpage - Helper to manage a readpage request
- * @file: The file to read from
- * @subpage: A subpage of the folio to read
- *
- * Fulfil a readpage request by drawing data from the cache if possible, or the
- * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
- * from different sources will get munged together.
- *
- * The calling netfs must initialise a netfs context contiguous to the vfs
- * inode before calling this.
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_readpage(struct file *file, struct page *subpage)
-{
-	struct folio *folio = page_folio(subpage);
-	struct address_space *mapping = folio->mapping;
-	struct netfs_io_request *rreq;
-	struct netfs_i_context *ctx = netfs_i_context(mapping->host);
-	int ret;
-
-	_enter("%lx", folio_index(folio));
-
-	rreq = netfs_alloc_request(mapping, file,
-				   folio_file_pos(folio), folio_size(folio),
-				   NETFS_READPAGE);
-	if (IS_ERR(rreq)) {
-		ret = PTR_ERR(rreq);
-		goto alloc_error;
-	}
-
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto discard;
-	}
-
-	netfs_stat(&netfs_n_rh_readpage);
-	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
-	return netfs_begin_read(rreq, true);
-
-discard:
-	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
-alloc_error:
-	folio_unlock(folio);
-	return ret;
-}
-EXPORT_SYMBOL(netfs_readpage);
-
-/*
- * Prepare a folio for writing without reading first
- * @folio: The folio being prepared
- * @pos: starting position for the write
- * @len: length of write
- * @always_fill: T if the folio should always be completely filled/cleared
- *
- * In some cases, write_begin doesn't need to read at all:
- * - full folio write
- * - write that lies in a folio that is completely beyond EOF
- * - write that covers the folio from start to EOF or beyond it
- *
- * If any of these criteria are met, then zero out the unwritten parts
- * of the folio and return true. Otherwise, return false.
- */
-static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
-				 bool always_fill)
-{
-	struct inode *inode = folio_inode(folio);
-	loff_t i_size = i_size_read(inode);
-	size_t offset = offset_in_folio(folio, pos);
-	size_t plen = folio_size(folio);
-
-	if (unlikely(always_fill)) {
-		if (pos - offset + len <= i_size)
-			return false; /* Page entirely before EOF */
-		zero_user_segment(&folio->page, 0, plen);
-		folio_mark_uptodate(folio);
-		return true;
-	}
-
-	/* Full folio write */
-	if (offset == 0 && len >= plen)
-		return true;
-
-	/* Page entirely beyond the end of the file */
-	if (pos - offset >= i_size)
-		goto zero_out;
-
-	/* Write that covers from the start of the folio to EOF or beyond */
-	if (offset == 0 && (pos + len) >= i_size)
-		goto zero_out;
-
-	return false;
-zero_out:
-	zero_user_segments(&folio->page, 0, offset, offset + len, len);
-	return true;
-}
-
-/**
- * netfs_write_begin - Helper to prepare for writing
- * @file: The file to read from
- * @mapping: The mapping to read from
- * @pos: File position at which the write will begin
- * @len: The length of the write (may extend beyond the end of the folio chosen)
- * @aop_flags: AOP_* flags
- * @_folio: Where to put the resultant folio
- * @_fsdata: Place for the netfs to store a cookie
- *
- * Pre-read data for a write-begin request by drawing data from the cache if
- * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
- * Multiple I/O requests from different sources will get munged together.  If
- * necessary, the readahead window can be expanded in either direction to a
- * more convenient alighment for RPC efficiency or to make storage in the cache
- * feasible.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory.
- *
- * The check_write_begin() operation can be provided to check for and flush
- * conflicting writes once the folio is grabbed and locked.  It is passed a
- * pointer to the fsdata cookie that gets returned to the VM to be passed to
- * write_end.  It is permitted to sleep.  It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
- *
- * The calling netfs must initialise a netfs context contiguous to the vfs
- * inode before calling this.
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_write_begin(struct file *file, struct address_space *mapping,
-		      loff_t pos, unsigned int len, unsigned int aop_flags,
-		      struct folio **_folio, void **_fsdata)
-{
-	struct netfs_io_request *rreq;
-	struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
-	struct folio *folio;
-	unsigned int fgp_flags;
-	pgoff_t index = pos >> PAGE_SHIFT;
-	int ret;
-
-	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
-
-retry:
-	fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
-	if (aop_flags & AOP_FLAG_NOFS)
-		fgp_flags |= FGP_NOFS;
-	folio = __filemap_get_folio(mapping, index, fgp_flags,
-				    mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
-
-	if (ctx->ops->check_write_begin) {
-		/* Allow the netfs (eg. ceph) to flush conflicts. */
-		ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
-		if (ret < 0) {
-			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
-			if (ret == -EAGAIN)
-				goto retry;
-			goto error;
-		}
-	}
-
-	if (folio_test_uptodate(folio))
-		goto have_folio;
-
-	/* If the page is beyond the EOF, we want to clear it - unless it's
-	 * within the cache granule containing the EOF, in which case we need
-	 * to preload the granule.
-	 */
-	if (!netfs_is_cache_enabled(ctx) &&
-	    netfs_skip_folio_read(folio, pos, len, false)) {
-		netfs_stat(&netfs_n_rh_write_zskip);
-		goto have_folio_no_wait;
-	}
-
-	rreq = netfs_alloc_request(mapping, file,
-				   folio_file_pos(folio), folio_size(folio),
-				   NETFS_READ_FOR_WRITE);
-	if (IS_ERR(rreq)) {
-		ret = PTR_ERR(rreq);
-		goto error;
-	}
-	rreq->no_unlock_folio	= folio_index(folio);
-	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
-
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto error_put;
-	}
-
-	netfs_stat(&netfs_n_rh_write_begin);
-	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
-
-	/* Expand the request to meet caching requirements and download
-	 * preferences.
-	 */
-	ractl._nr_pages = folio_nr_pages(folio);
-	netfs_rreq_expand(rreq, &ractl);
-
-	/* We hold the folio locks, so we can drop the references */
-	folio_get(folio);
-	while (readahead_folio(&ractl))
-		;
-
-	ret = netfs_begin_read(rreq, true);
-	if (ret < 0)
-		goto error;
-
-have_folio:
-	ret = folio_wait_fscache_killable(folio);
-	if (ret < 0)
-		goto error;
-have_folio_no_wait:
-	*_folio = folio;
-	_leave(" = 0");
-	return 0;
-
-error_put:
-	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
-error:
-	folio_unlock(folio);
-	folio_put(folio);
-	_leave(" = %d", ret);
-	return ret;
-}
-EXPORT_SYMBOL(netfs_write_begin);