From patchwork Sun Dec 31 20:40:09 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507457 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BA9DCBA22 for ; Sun, 31 Dec 2023 20:40:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="a+4Tv0Ix" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8342CC433C8; Sun, 31 Dec 2023 20:40:09 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704055209; bh=rAKU0IZIOe/w1YZitDWbnhBkKm1U5ovMtNbzTT+PrkQ=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=a+4Tv0IxQH3uvabSjdCHqFQQWVpuKQIM+kyqgP85zapOzf6NRoXSmEZC1tskvxtQe KqeIZum8p+pLvSGt1SKgDzYJxQDMdx/P6GyqhOjlGecCXpCFeDZZHRBDHdXdPBCZLB THkR6EOHKCu6sy1tGoATGIjLdjdFAjPZWdFUq0QnaipJGMHL4ko5uueB5DgcvS42kM RdQ/XQmJZRIgSBkJoRumdT5A2hiQdXzzVxGFg33hHyRjjLvLqogrcqLhuO3iPeaDTK +cqSbcCWf5I4RQBbn+/f+rwiF7RDavv/W/DPlvdJLHobQN2eO8Tkv5CzcKVPz0pXS3 U2Tsywnh/GYcg== Date: Sun, 31 Dec 2023 12:40:09 -0800 Subject: [PATCH 1/3] xfs: map xfile pages directly into xfs_buf From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404837613.1754104.1414992009596163702.stgit@frogsfrogsfrogs> In-Reply-To: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> References: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Map the xfile pages directly into xfs_buf to reduce memory overhead. It's silly to use memory to stage changes to shmem pages for ephemeral btrees that don't care about transactionality. Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree_mem.h | 6 ++ fs/xfs/libxfs/xfs_rmap_btree.c | 1 fs/xfs/scrub/rcbag_btree.c | 1 fs/xfs/scrub/xfbtree.c | 23 +++++- fs/xfs/xfs_buf.c | 110 ++++++++++++++++++++++------- fs/xfs/xfs_buf.h | 16 ++++ fs/xfs/xfs_buf_xfile.c | 152 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_buf_xfile.h | 11 +++ 8 files changed, 292 insertions(+), 28 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h index 1f961f3f55444..cfb30cb1aabc6 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.h +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -17,8 +17,14 @@ struct xfbtree_config { /* Owner of this btree. 
*/ unsigned long long owner; + + /* XFBTREE_* flags */ + unsigned int flags; }; +/* buffers should be directly mapped from memory */ +#define XFBTREE_DIRECT_MAP (1U << 0) + #ifdef CONFIG_XFS_BTREE_IN_XFILE unsigned int xfs_btree_mem_head_nlevels(struct xfs_buf *head_bp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 23841ee6e2ff6..71d32f9fee14d 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -672,6 +672,7 @@ xfs_rmapbt_mem_create( .btree_ops = &xfs_rmapbt_mem_ops, .target = target, .owner = agno, + .flags = XFBTREE_DIRECT_MAP, }; return xfbtree_create(mp, &cfg, xfbtreep); diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c index 3d66e80b7bc25..9807e08129fe4 100644 --- a/fs/xfs/scrub/rcbag_btree.c +++ b/fs/xfs/scrub/rcbag_btree.c @@ -233,6 +233,7 @@ rcbagbt_mem_create( struct xfbtree_config cfg = { .btree_ops = &rcbagbt_mem_ops, .target = target, + .flags = XFBTREE_DIRECT_MAP, }; return xfbtree_create(mp, &cfg, xfbtreep); diff --git a/fs/xfs/scrub/xfbtree.c b/fs/xfs/scrub/xfbtree.c index 016026947019a..9e557d87d1c9c 100644 --- a/fs/xfs/scrub/xfbtree.c +++ b/fs/xfs/scrub/xfbtree.c @@ -501,6 +501,9 @@ xfbtree_create( if (!xfbt) return -ENOMEM; xfbt->target = cfg->target; + if (cfg->flags & XFBTREE_DIRECT_MAP) + xfbt->target->bt_flags |= XFS_BUFTARG_DIRECT_MAP; + xfboff_bitmap_init(&xfbt->freespace); /* Set up min/maxrecs for this btree. */ @@ -753,7 +756,7 @@ xfbtree_trans_commit( dirty = xfbtree_trans_bdetach(tp, bp); if (dirty && !corrupt) { - xfs_failaddr_t fa = bp->b_ops->verify_struct(bp); + xfs_failaddr_t fa; /* * Because this btree is ephemeral, validate the buffer @@ -761,16 +764,30 @@ xfbtree_trans_commit( * corruption errors to the caller without shutting * down the filesystem. * + * Buffers that are directly mapped to the xfile do not + * need to be queued for IO at all. Check if the DRAM + * has been poisoned, however. + * * If the buffer fails verification, log the failure * but continue walking the transaction items so that * we remove all ephemeral btree buffers. 
*/ + if (xfs_buf_check_poisoned(bp)) { + corrupt = true; + xfs_verifier_error(bp, -EFSCORRUPTED, + __this_address); + continue; + } + + fa = bp->b_ops->verify_struct(bp); if (fa) { corrupt = true; xfs_verifier_error(bp, -EFSCORRUPTED, fa); - } else { + continue; + } + + if (!(bp->b_flags & _XBF_DIRECT_MAP)) xfs_buf_delwri_queue_here(bp, &buffer_list); - } } xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b62518968e784..ca7657d0ea592 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -280,19 +280,26 @@ xfs_buf_free_pages( ASSERT(bp->b_flags & _XBF_PAGES); - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); - for (i = 0; i < bp->b_page_count; i++) { if (bp->b_pages[i]) __free_page(bp->b_pages[i]); } mm_account_reclaimed_pages(bp->b_page_count); + xfs_buf_free_page_array(bp); +} + +void +xfs_buf_free_page_array( + struct xfs_buf *bp) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); bp->b_pages = NULL; bp->b_flags &= ~_XBF_PAGES; + bp->b_page_count = 0; } static void @@ -313,7 +320,12 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr, bp->b_page_count); + + if (bp->b_flags & _XBF_DIRECT_MAP) + xfile_buf_unmap_pages(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); @@ -352,20 +364,14 @@ xfs_buf_alloc_kmem( return 0; } -static int -xfs_buf_alloc_pages( +/* Make sure that we have a page list */ +int +xfs_buf_alloc_page_array( struct xfs_buf *bp, - xfs_buf_flags_t flags) + gfp_t gfp_mask) { - gfp_t gfp_mask = __GFP_NOWARN; - long filled = 0; + ASSERT(!(bp->b_flags & _XBF_PAGES)); - if (flags & XBF_READ_AHEAD) - gfp_mask |= __GFP_NORETRY; - else - gfp_mask |= GFP_NOFS; - - /* Make sure that we have a page list */ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); if (bp->b_page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; @@ -375,7 +381,28 @@ xfs_buf_alloc_pages( if (!bp->b_pages) return -ENOMEM; } + bp->b_flags |= _XBF_PAGES; + return 0; +} + +static int +xfs_buf_alloc_pages( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + gfp_t gfp_mask = __GFP_NOWARN; + long filled = 0; + int error; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + else + gfp_mask |= GFP_NOFS; + + error = xfs_buf_alloc_page_array(bp, gfp_mask); + if (error) + return error; /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) @@ -418,7 +445,8 @@ _xfs_buf_map_pages( struct xfs_buf *bp, xfs_buf_flags_t flags) { - ASSERT(bp->b_flags & _XBF_PAGES); + ASSERT(bp->b_flags & (_XBF_PAGES | _XBF_DIRECT_MAP)); + if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]); @@ -569,7 +597,7 @@ xfs_buf_find_lock( return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES | _XBF_DIRECT_MAP; bp->b_ops = NULL; } return 0; @@ -628,18 +656,36 @@ xfs_buf_find_insert( goto out_drop_pag; /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. + * If the caller is ok with direct maps to xfile pages, try that. + * ENOTBLK is the magic code to fall back to allocating memory. 
*/ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - error = xfs_buf_alloc_pages(new_bp, flags); - if (error) + if (xfile_buftarg_can_direct_map(btp)) { + error = xfile_buf_map_pages(new_bp, flags); + if (error && error != -ENOTBLK) goto out_free_buf; + if (!error) + goto insert; } + /* + * For buffers that fit entirely within a single page, first attempt to + * allocate the memory from the heap to minimise memory usage. + */ + if (BBTOB(new_bp->b_length) < PAGE_SIZE) { + error = xfs_buf_alloc_kmem(new_bp, flags); + if (!error) + goto insert; + } + + /* + * For larger buffers or if we can't get heap memory for these small + * buffers, fall back to using the page allocator. + */ + error = xfs_buf_alloc_pages(new_bp, flags); + if (error) + goto out_free_buf; + +insert: spin_lock(&bch->bc_lock); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); @@ -1584,6 +1630,20 @@ xfs_buf_end_sync_io( xfs_buf_ioend(bp); } +bool +xfs_buf_check_poisoned( + struct xfs_buf *bp) +{ + unsigned int i; + + for (i = 0; i < bp->b_page_count; i++) { + if (PageHWPoison(bp->b_pages[i])) + return true; + } + + return false; +} + STATIC void _xfs_buf_ioapply( struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5a6cf3d5a9f53..9d05c376d9dd8 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,11 @@ struct xfile; #define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ +#ifdef CONFIG_XFS_IN_MEMORY_FILE +# define _XBF_DIRECT_MAP (1u << 23)/* pages directly mapped to storage */ +#else +# define _XBF_DIRECT_MAP (0) +#endif /* flags used only as arguments to access routines */ /* @@ -72,6 +77,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_DIRECT_MAP, "DIRECT_MAP" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ @@ -131,8 +137,14 @@ typedef struct xfs_buftarg { #ifdef CONFIG_XFS_IN_MEMORY_FILE /* in-memory buftarg via bt_xfile */ # define XFS_BUFTARG_XFILE (1U << 0) +/* + * Buffer pages are direct-mapped to the xfile; caller does not care about + * transactional updates. + */ +# define XFS_BUFTARG_DIRECT_MAP (1U << 1) #else # define XFS_BUFTARG_XFILE (0) +# define XFS_BUFTARG_DIRECT_MAP (0) #endif #define XB_PAGES 2 @@ -382,6 +394,9 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) cksum_offset); } +int xfs_buf_alloc_page_array(struct xfs_buf *bp, gfp_t gfp_mask); +void xfs_buf_free_page_array(struct xfs_buf *bp); + /* * Handling of buftargs. 
*/ @@ -453,5 +468,6 @@ xfs_buftarg_verify_daddr( int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); +bool xfs_buf_check_poisoned(struct xfs_buf *bp); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_xfile.c b/fs/xfs/xfs_buf_xfile.c index 51c5c692156b1..be1e54be070ce 100644 --- a/fs/xfs/xfs_buf_xfile.c +++ b/fs/xfs/xfs_buf_xfile.c @@ -18,6 +18,11 @@ xfile_buf_ioapply( loff_t pos = BBTOB(xfs_buf_daddr(bp)); size_t size = BBTOB(bp->b_length); + if (bp->b_target->bt_flags & XFS_BUFTARG_DIRECT_MAP) { + /* direct mapping means no io necessary */ + return 0; + } + if (bp->b_map_count > 1) { /* We don't need or support multi-map buffers. */ ASSERT(0); @@ -95,3 +100,150 @@ xfile_buftarg_nr_sectors( { return xfile_size(btp->bt_xfile) >> SECTOR_SHIFT; } + +/* Free an xfile page that was directly mapped into the buffer cache. */ +static int +xfile_buf_put_page( + struct xfile *xfile, + loff_t pos, + struct page *page) +{ + struct xfile_page xfpage = { + .page = page, + .pos = round_down(pos, PAGE_SIZE), + }; + + lock_page(xfpage.page); + + return xfile_put_page(xfile, &xfpage); +} + +/* Grab the xfile page for this part of the xfile. */ +static int +xfile_buf_get_page( + struct xfile *xfile, + loff_t pos, + unsigned int len, + struct page **pagep) +{ + struct xfile_page xfpage = { NULL }; + int error; + + error = xfile_get_page(xfile, pos, len, &xfpage); + if (error) + return error; + + /* + * Fall back to regular DRAM buffers if tmpfs gives us fsdata or the + * page pos isn't what we were expecting. + */ + if (xfpage.fsdata || xfpage.pos != round_down(pos, PAGE_SIZE)) { + xfile_put_page(xfile, &xfpage); + return -ENOTBLK; + } + + /* Unlock the page before we start using them for the buffer cache. */ + ASSERT(PageUptodate(xfpage.page)); + unlock_page(xfpage.page); + + *pagep = xfpage.page; + return 0; +} + +/* + * Try to map storage directly, if the target supports it. Returns 0 for + * success, -ENOTBLK to mean "not supported", or the usual negative errno. + */ +int +xfile_buf_map_pages( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + struct xfs_buf_map *map; + gfp_t gfp_mask = __GFP_NOWARN; + const unsigned int page_align_mask = PAGE_SIZE - 1; + unsigned int m, p, n; + int error; + + ASSERT(xfile_buftarg_can_direct_map(bp->b_target)); + + /* For direct-map buffers, each map has to be page aligned. */ + for (m = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) + if (BBTOB(map->bm_bn | map->bm_len) & page_align_mask) + return -ENOTBLK; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + else + gfp_mask |= GFP_NOFS; + + error = xfs_buf_alloc_page_array(bp, gfp_mask); + if (error) + return error; + + /* Map in the xfile pages. */ + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + unsigned int len; + + len = min_t(unsigned int, BBTOB(map->bm_len - n), + PAGE_SIZE); + + error = xfile_buf_get_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), len, + &bp->b_pages[p++]); + if (error) + goto fail; + } + } + + bp->b_flags |= _XBF_DIRECT_MAP; + return 0; + +fail: + /* + * Release all the xfile pages and free the page array, we're falling + * back to a DRAM buffer, which could be pages or a slab allocation. 
+ */ + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + if (bp->b_pages[p] == NULL) + continue; + + xfile_buf_put_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), + bp->b_pages[p++]); + } + } + + xfs_buf_free_page_array(bp); + return error; +} + +/* Unmap all the direct-mapped buffer pages. */ +void +xfile_buf_unmap_pages( + struct xfs_buf *bp) +{ + struct xfs_buf_map *map; + unsigned int m, p, n; + int error = 0, err2; + + ASSERT(xfile_buftarg_can_direct_map(bp->b_target)); + + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + err2 = xfile_buf_put_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), + bp->b_pages[p++]); + if (!error && err2) + error = err2; + } + } + + if (error) + xfs_err(bp->b_mount, "%s failed errno %d", __func__, error); + + bp->b_flags &= ~_XBF_DIRECT_MAP; + xfs_buf_free_page_array(bp); +} diff --git a/fs/xfs/xfs_buf_xfile.h b/fs/xfs/xfs_buf_xfile.h index c8d78d01ea5df..6ff2104780010 100644 --- a/fs/xfs/xfs_buf_xfile.h +++ b/fs/xfs/xfs_buf_xfile.h @@ -12,9 +12,20 @@ int xfile_alloc_buftarg(struct xfs_mount *mp, const char *descr, struct xfs_buftarg **btpp); void xfile_free_buftarg(struct xfs_buftarg *btp); xfs_daddr_t xfile_buftarg_nr_sectors(struct xfs_buftarg *btp); +int xfile_buf_map_pages(struct xfs_buf *bp, xfs_buf_flags_t flags); +void xfile_buf_unmap_pages(struct xfs_buf *bp); + +static inline bool xfile_buftarg_can_direct_map(const struct xfs_buftarg *btp) +{ + return (btp->bt_flags & XFS_BUFTARG_XFILE) && + (btp->bt_flags & XFS_BUFTARG_DIRECT_MAP); +} #else # define xfile_buf_ioapply(bp) (-EOPNOTSUPP) # define xfile_buftarg_nr_sectors(btp) (0) +# define xfile_buf_map_pages(b,f) (-ENOTBLK) +# define xfile_buf_unmap_pages(bp) ((void)0) +# define xfile_buftarg_can_direct_map(btp) (false) #endif /* CONFIG_XFS_IN_MEMORY_FILE */ #endif /* __XFS_BUF_XFILE_H__ */ From patchwork Sun Dec 31 20:40:24 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507458 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 596A0BA2E for ; Sun, 31 Dec 2023 20:40:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="gUzIw4uJ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 248D8C433C7; Sun, 31 Dec 2023 20:40:25 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704055225; bh=o++nJb3VA5Fqb5PqXPCQy2pn9EcDfOV1L0YtJ73cEoc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=gUzIw4uJG0kDzJ6ZZ4Oo7oTK4GKDgC+egFSvoqNgJeDqVvgJV3X7ZZysVHjNLE/L9 oBH5VCpHeakxbjyntLXz/Gqioa8hNkdMC6lOFIfoNr3jQTTg/GQDRd2ah41pxPrjoB PqJj/5GsDlE4o6Ut+HpyOTypARVCoRp74bIL2Z79H4Hr0BZ1C0WzdMquATy6/9sxCe Swj2OLObTN3W52y1I1+vZMNDAtkPSEPpow2RTrvpz4TWNOgv/9zFx0byN0xvmfU6gV 2PZBfBDPfJDWRY+4vLph73Bd+47XYW6Zqj9UW2ZbNMrji5cgrfhXNTrEUptvREDRXF RsY3OKvOHCY5g== Date: Sun, 31 Dec 2023 12:40:24 -0800 Subject: [PATCH 2/3] xfs: use b_offset to support direct-mapping pages when blocksize < pagesize From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404837630.1754104.9143395380611692112.stgit@frogsfrogsfrogs> In-Reply-To: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> References: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Support using directly-mapped pages in the buffer cache when the fs blocksize is less than the page size. This is not strictly necessary since the only user of direct-map buffers always uses page-sized buffers, but I included it here for completeness. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 8 ++++++-- fs/xfs/xfs_buf_xfile.c | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca7657d0ea592..d86227e852b7f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -321,7 +321,7 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); + vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); if (bp->b_flags & _XBF_DIRECT_MAP) xfile_buf_unmap_pages(bp); @@ -434,6 +434,8 @@ xfs_buf_alloc_pages( XFS_STATS_INC(bp->b_mount, xb_page_retries); memalloc_retry_wait(gfp_mask); } + + bp->b_offset = 0; return 0; } @@ -449,7 +451,7 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]); + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; } else if (flags & XBF_UNMAPPED) { bp->b_addr = NULL; } else { @@ -476,6 +478,8 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; + + bp->b_addr += bp->b_offset; } return 0; diff --git a/fs/xfs/xfs_buf_xfile.c b/fs/xfs/xfs_buf_xfile.c index be1e54be070ce..58469a91e72bc 100644 --- a/fs/xfs/xfs_buf_xfile.c +++ b/fs/xfs/xfs_buf_xfile.c @@ -163,15 +163,27 @@ xfile_buf_map_pages( gfp_t gfp_mask = __GFP_NOWARN; const unsigned int page_align_mask = PAGE_SIZE - 1; unsigned int m, p, n; + unsigned int first_page_offset; int error; ASSERT(xfile_buftarg_can_direct_map(bp->b_target)); - /* For direct-map buffers, each map has to be page aligned. */ - for (m = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) - if (BBTOB(map->bm_bn | map->bm_len) & page_align_mask) + /* + * For direct-map buffer targets with multiple mappings, the first map + * must end on a page boundary and the rest of the mappings must start + * and end on a page boundary. For single-mapping buffers, we don't + * care. + */ + if (bp->b_map_count > 1) { + map = &bp->b_maps[0]; + if (BBTOB(map->bm_bn + map->bm_len) & page_align_mask) return -ENOTBLK; + for (m = 1, map++; m < bp->b_map_count - 1; m++, map++) + if (BBTOB(map->bm_bn | map->bm_len) & page_align_mask) + return -ENOTBLK; + } + if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; else @@ -182,6 +194,7 @@ xfile_buf_map_pages( return error; /* Map in the xfile pages. 
*/ + first_page_offset = offset_in_page(BBTOB(xfs_buf_daddr(bp))); for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { unsigned int len; @@ -198,6 +211,7 @@ xfile_buf_map_pages( } bp->b_flags |= _XBF_DIRECT_MAP; + bp->b_offset = first_page_offset; return 0; fail: From patchwork Sun Dec 31 20:40:40 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507459 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 01165BA2E for ; Sun, 31 Dec 2023 20:40:40 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZiY9q4Rp" Received: by smtp.kernel.org (Postfix) with ESMTPSA id BBD38C433C8; Sun, 31 Dec 2023 20:40:40 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704055240; bh=8RS8CyCipd+oYAfa5kaPVsOPZLQjGk1tj6A8FV8dIGw=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ZiY9q4RpUpxcN8ZcWPjjf2RHKKPmCpjTJekZKRdZoBt8w6Ojg2vk6SmlSk+GbPhQB eO9LiK6N8h1hA8Nln2BLnkobIk5clQhArMrK7JyBaaF4HAdf5UVOArZulSkZDI+Dma kMoeTyjH4uGqa2pa5LnG7HwI8OicGl9S2Z5sDc3T90838iDK0X8o9g8QjDdYxuGMRL DTPwhQQIWQRPtmHD4MvOsq23ErweIddkmwjCUevQ/0SmDSU8N7bd9tanF3gDw6Rn20 4pBsegi+ZgEFsqACb5Jm/9x6NBSF9E3QQ1cV6WkSDYKkO7y3h75xu2qe2FGDRvmAEY mnMAYK+VdVmvw== Date: Sun, 31 Dec 2023 12:40:40 -0800 Subject: [PATCH 3/3] xfile: implement write caching From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404837645.1754104.3271871045806193458.stgit@frogsfrogsfrogs> In-Reply-To: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> References: <170404837590.1754104.3601847870577015044.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Mapping a page into the kernel's address space is expensive. Since the xfile contains an xfs_buf_cache object for xfbtrees and xfbtrees aren't the only user of xfiles, we could reuse that space for a simple MRU cache. When there's enough metadata records being put in an xfarray/xfblob and the fsck scans aren't IO bound, this cuts the runtime of online fsck by about 5%. Signed-off-by: Darrick J. 
Wong --- fs/xfs/scrub/trace.h | 44 +++++++ fs/xfs/scrub/xfile.c | 307 +++++++++++++++++++++++++++++++----------------- fs/xfs/scrub/xfile.h | 23 +++- fs/xfs/xfs_buf_xfile.c | 7 + 4 files changed, 273 insertions(+), 108 deletions(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3aa1ef6a371dd..8d863f4737e90 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -964,10 +964,52 @@ DEFINE_XFILE_EVENT(xfile_pread); DEFINE_XFILE_EVENT(xfile_pwrite); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_page); -DEFINE_XFILE_EVENT(xfile_put_page); DEFINE_XFILE_EVENT(xfile_discard); DEFINE_XFILE_EVENT(xfile_prealloc); +DECLARE_EVENT_CLASS(xfile_page_class, + TP_PROTO(struct xfile *xf, loff_t pos, struct page *page), + TP_ARGS(xf, pos, page), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes_used) + __field(loff_t, pos) + __field(loff_t, size) + __field(unsigned long long, bytecount) + __field(pgoff_t, pgoff) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes_used = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes_used = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + __entry->pos = pos; + __entry->bytecount = page_size(page); + __entry->pgoff = page_offset(page); + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx pgoff 0x%lx isize 0x%llx", + __entry->ino, + __entry->bytes_used, + __entry->pos, + __entry->bytecount, + __entry->pgoff, + __entry->size) +); +#define DEFINE_XFILE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfile_page_class, name, \ + TP_PROTO(struct xfile *xf, loff_t pos, struct page *page), \ + TP_ARGS(xf, pos, page)) +DEFINE_XFILE_PAGE_EVENT(xfile_got_page); +DEFINE_XFILE_PAGE_EVENT(xfile_put_page); + TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), TP_ARGS(xfa, required_capacity), diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 9ab5d87963be2..ccef7fdcd7d9f 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -64,7 +64,7 @@ xfile_create( struct xfile *xf; int error = -ENOMEM; - xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); + xf = kzalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); if (!xf) return -ENOMEM; @@ -103,6 +103,129 @@ xfile_create( return error; } +/* Evict a cache entry and release the page. */ +static inline int +xfile_cache_evict( + struct xfile *xf, + struct xfile_cache *entry) +{ + int error; + + if (!entry->xfpage.page) + return 0; + + lock_page(entry->xfpage.page); + kunmap(entry->kaddr); + + error = xfile_put_page(xf, &entry->xfpage); + memset(entry, 0, sizeof(struct xfile_cache)); + return error; +} + +/* + * Grab a page, map it into the kernel address space, and fill out the cache + * entry. + */ +static int +xfile_cache_fill( + struct xfile *xf, + loff_t key, + struct xfile_cache *entry) +{ + int error; + + error = xfile_get_page(xf, key, PAGE_SIZE, &entry->xfpage); + if (error) + return error; + + entry->kaddr = kmap(entry->xfpage.page); + unlock_page(entry->xfpage.page); + return 0; +} + +/* + * Return the kernel address of a cached position in the xfile. If the cache + * misses, the relevant page will be brought into memory, mapped, and returned. + * If the cache is disabled, returns NULL. 
+ */ +static void * +xfile_cache_lookup( + struct xfile *xf, + loff_t pos) +{ + loff_t key = round_down(pos, PAGE_SIZE); + unsigned int i; + int ret; + + if (!(xf->flags & XFILE_INTERNAL_CACHE)) + return NULL; + + /* Is it already in the cache? */ + for (i = 0; i < XFILE_CACHE_ENTRIES; i++) { + if (!xf->cached[i].xfpage.page) + continue; + if (page_offset(xf->cached[i].xfpage.page) != key) + continue; + + goto found; + } + + /* Find the least-used slot here so we can evict it. */ + for (i = 0; i < XFILE_CACHE_ENTRIES; i++) { + if (!xf->cached[i].xfpage.page) + goto insert; + } + i = min_t(unsigned int, i, XFILE_CACHE_ENTRIES - 1); + + ret = xfile_cache_evict(xf, &xf->cached[i]); + if (ret) + return ERR_PTR(ret); + +insert: + ret = xfile_cache_fill(xf, key, &xf->cached[i]); + if (ret) + return ERR_PTR(ret); + +found: + /* Stupid MRU moves this cache entry to the front. */ + if (i != 0) + swap(xf->cached[0], xf->cached[i]); + + return xf->cached[0].kaddr; +} + +/* Drop all cached xfile pages. */ +static void +xfile_cache_drop( + struct xfile *xf) +{ + unsigned int i; + + if (!(xf->flags & XFILE_INTERNAL_CACHE)) + return; + + for (i = 0; i < XFILE_CACHE_ENTRIES; i++) + xfile_cache_evict(xf, &xf->cached[i]); +} + +/* Enable the internal xfile cache. */ +void +xfile_cache_enable( + struct xfile *xf) +{ + xf->flags |= XFILE_INTERNAL_CACHE; + memset(xf->cached, 0, sizeof(struct xfile_cache) * XFILE_CACHE_ENTRIES); +} + +/* Disable the internal xfile cache. */ +void +xfile_cache_disable( + struct xfile *xf) +{ + xfile_cache_drop(xf); + xf->flags &= ~XFILE_INTERNAL_CACHE; +} + /* Close the file and release all resources. */ void xfile_destroy( @@ -112,11 +235,41 @@ xfile_destroy( trace_xfile_destroy(xf); + xfile_cache_drop(xf); + lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); fput(xf->file); kfree(xf); } +/* Get a mapped page in the xfile, do not use internal cache. */ +static void * +xfile_uncached_get( + struct xfile *xf, + loff_t pos, + struct xfile_page *xfpage) +{ + loff_t key = round_down(pos, PAGE_SIZE); + int error; + + error = xfile_get_page(xf, key, PAGE_SIZE, xfpage); + if (error) + return ERR_PTR(error); + + return kmap_local_page(xfpage->page); +} + +/* Release a mapped page that was obtained via xfile_uncached_get. */ +static int +xfile_uncached_put( + struct xfile *xf, + struct xfile_page *xfpage, + void *kaddr) +{ + kunmap_local(kaddr); + return xfile_put_page(xf, xfpage); +} + /* * Read a memory object directly from the xfile's page cache. Unlike regular * pread, we return -E2BIG and -EFBIG for reads that are too large or at too @@ -131,8 +284,6 @@ xfile_pread( loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - struct page *page = NULL; ssize_t read = 0; unsigned int pflags; int error = 0; @@ -146,42 +297,32 @@ xfile_pread( pflags = memalloc_nofs_save(); while (count > 0) { + struct xfile_page xfpage; void *p, *kaddr; unsigned int len; + bool cached = true; len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - /* - * In-kernel reads of a shmem file cause it to allocate a page - * if the mapping shows a hole. Therefore, if we hit ENOMEM - * we can continue by zeroing the caller's buffer. 
- */ - page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, - __GFP_NOWARN); - if (IS_ERR(page)) { - error = PTR_ERR(page); - if (error != -ENOMEM) + kaddr = xfile_cache_lookup(xf, pos); + if (!kaddr) { + cached = false; + kaddr = xfile_uncached_get(xf, pos, &xfpage); + } + if (IS_ERR(kaddr)) { + error = PTR_ERR(kaddr); + break; + } + + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + + if (!cached) { + error = xfile_uncached_put(xf, &xfpage, kaddr); + if (error) break; - - memset(buf, 0, len); - goto advance; - } - - if (PageUptodate(page)) { - /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush. - */ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); - } else { - memset(buf, 0, len); } - put_page(page); -advance: count -= len; pos += len; buf += len; @@ -208,9 +349,6 @@ xfile_pwrite( loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; ssize_t written = 0; unsigned int pflags; int error = 0; @@ -224,52 +362,36 @@ xfile_pwrite( pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; + struct xfile_page xfpage; void *p, *kaddr; unsigned int len; - int ret; + bool cached = true; len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. - * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); - if (error) + kaddr = xfile_cache_lookup(xf, pos); + if (!kaddr) { + cached = false; + kaddr = xfile_uncached_get(xf, pos, &xfpage); + } + if (IS_ERR(kaddr)) { + error = PTR_ERR(kaddr); break; - - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it - * before writing data. - */ - kaddr = kmap_local_page(page); - if (!PageUptodate(page)) { - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); } + p = kaddr + offset_in_page(pos); memcpy(p, buf, len); - kunmap_local(kaddr); - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = ret; - break; + if (!cached) { + error = xfile_uncached_put(xf, &xfpage, kaddr); + if (error) + break; } - written += ret; - if (ret != len) - break; - - count -= ret; - pos += ret; - buf += ret; + written += len; + count -= len; + pos += len; + buf += len; } memalloc_nofs_restore(pflags); @@ -286,6 +408,7 @@ xfile_discard( u64 count) { trace_xfile_discard(xf, pos, count); + xfile_cache_drop(xf); shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1); } @@ -297,9 +420,6 @@ xfile_prealloc( u64 count) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; unsigned int pflags; int error = 0; @@ -312,47 +432,22 @@ xfile_prealloc( pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; + struct xfile_page xfpage; + void *kaddr; unsigned int len; - int ret; len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. 
- * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); + kaddr = xfile_uncached_get(xf, pos, &xfpage); + if (IS_ERR(kaddr)) { + error = PTR_ERR(kaddr); + break; + } + + error = xfile_uncached_put(xf, &xfpage, kaddr); if (error) break; - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it to - * ensure we never go lacking for space here. - */ - if (!PageUptodate(page)) { - void *kaddr = kmap_local_page(page); - - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); - kunmap_local(kaddr); - } - - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = ret; - break; - } - if (ret != len) { - error = -EIO; - break; - } - count -= len; pos += len; } @@ -483,7 +578,7 @@ xfile_put_page( unsigned int pflags; int ret; - trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + trace_xfile_put_page(xf, xfpage->pos, xfpage->page); /* Give back the reference that we took in xfile_get_page. */ put_page(xfpage->page); diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index 849f59da6a184..4bb10829f7a07 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -24,11 +24,32 @@ static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) return xfpage->page->index; } +struct xfile_cache { + struct xfile_page xfpage; + void *kaddr; +}; + +#define XFILE_CACHE_ENTRIES (sizeof(struct xfs_buf_cache) / \ + sizeof(struct xfile_cache)) + struct xfile { struct file *file; - struct xfs_buf_cache bcache; + + union { + struct xfs_buf_cache bcache; + struct xfile_cache cached[XFILE_CACHE_ENTRIES]; + }; + + /* XFILE_* flags */ + unsigned int flags; }; +/* Use the internal cache for faster access. */ +#define XFILE_INTERNAL_CACHE (1U << 0) + +void xfile_cache_enable(struct xfile *xf); +void xfile_cache_disable(struct xfile *xf); + int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); void xfile_destroy(struct xfile *xf); diff --git a/fs/xfs/xfs_buf_xfile.c b/fs/xfs/xfs_buf_xfile.c index 58469a91e72bc..cc670e8bafc4a 100644 --- a/fs/xfs/xfs_buf_xfile.c +++ b/fs/xfs/xfs_buf_xfile.c @@ -49,6 +49,13 @@ xfile_alloc_buftarg( if (error) return error; + /* + * We're hooking the xfile up to the buffer cache, so disable its + * internal page caching because all callers should be using xfs_buf + * functions. + */ + xfile_cache_disable(xfile); + error = xfs_buf_cache_init(&xfile->bcache); if (error) goto out_xfile;
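
[Editor's note, not part of the patch series] Patch 3/3 replaces the per-access shmem page mapping with a tiny move-to-front cache: xfile_cache_lookup() scans a handful of slots, promotes a hit to slot 0, and on a miss evicts the last slot and fills it. The following is a minimal, hedged userspace sketch of that same move-to-front idea, compiled outside the kernel; every name here (cache_lookup, backing_fetch, CACHE_ENTRIES, KEY_ALIGN) is illustrative and does not exist in the XFS tree, and backing_fetch() merely stands in for xfile_get_page()+kmap().

/*
 * Simplified userspace model of a move-to-front MRU cache, assuming a
 * fixed slot count and page-aligned keys.  Not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_ENTRIES	4
#define KEY_ALIGN	4096		/* stands in for PAGE_SIZE */

struct cache_entry {
	long long	 key;		/* pos rounded down to KEY_ALIGN */
	void		*data;		/* NULL means the slot is empty */
};

static struct cache_entry cache[CACHE_ENTRIES];

/* Stand-in for fetching and mapping the backing page for @key. */
static void *backing_fetch(long long key)
{
	long long *p = malloc(KEY_ALIGN);

	if (p)
		*p = key;		/* pretend the page contents encode the key */
	return p;
}

/*
 * Look up @pos.  On a hit, swap the entry into slot 0 so repeated lookups
 * of the same page stay cheap; on a miss, reuse an empty slot or evict the
 * last slot (the least recently promoted entry) and refill it.
 */
static void *cache_lookup(long long pos)
{
	long long key = pos & ~(long long)(KEY_ALIGN - 1);
	unsigned int i;

	for (i = 0; i < CACHE_ENTRIES; i++) {
		if (cache[i].data && cache[i].key == key)
			goto found;
	}

	/* Prefer an empty slot; otherwise evict the last one. */
	for (i = 0; i < CACHE_ENTRIES; i++) {
		if (!cache[i].data)
			break;
	}
	if (i == CACHE_ENTRIES) {
		i = CACHE_ENTRIES - 1;
		free(cache[i].data);
		cache[i].data = NULL;
	}

	cache[i].key = key;
	cache[i].data = backing_fetch(key);
	if (!cache[i].data)
		return NULL;

found:
	if (i != 0) {
		struct cache_entry tmp = cache[0];

		cache[0] = cache[i];
		cache[i] = tmp;
	}
	return cache[0].data;
}

int main(void)
{
	/* Both offsets fall in the same 4096-byte "page": second call is a hit. */
	printf("%lld\n", *(long long *)cache_lookup(5000));
	printf("%lld\n", *(long long *)cache_lookup(6000));
	return 0;
}

The point of the sketch is only the promotion-on-hit behavior ("Stupid MRU moves this cache entry to the front" in the patch); the real code additionally pins, kmaps, and unlocks shmem pages and is disabled when the xfile is hooked up to the buffer cache.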