From patchwork Wed Dec 27 13:40:14 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508399 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0CD687ED for ; Mon, 1 Jan 2024 00:40:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="j6t5D7aY" Received: by smtp.kernel.org (Postfix) with ESMTPSA id CD3F9C433C8; Mon, 1 Jan 2024 00:40:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069614; bh=bAzPVhvBi/HYhV/uSt+3+w675yPt1hbABxTLtUXLH3w=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=j6t5D7aYtBSrGdEtx1A6oYV4db7Pf2SG6Wu2jEy8+wsfZiGQg+Kb8U3ReWN1lYY3c fGnIUuZexqRX6ikRA36ORXNGbdN1O9ZrjirUZrlZP5pLak53LOQZxaHP/ZgClZ8oyW 4YLeDzoLTgc1xgPn2gDJEWS68Ko+VSqSP5tzEatbQ6L8+anlogh3J4ngJtaCgpumMa 7pSsI1ROkYvX/axcgEdLBr7W6ulkGDH/1+jdHrCeTjmGt/f1sYx5QZyUE95mfJq8Wu ObSQuZWJ6t5MEcymG3RXt9EMOxumQJmebbEW1oqWl1yb3ckSp+ecsa+ZN8moCfQEmh u4123N+wKM9bQ== Date: Sun, 31 Dec 2023 16:40:14 +9900 Subject: [PATCH 01/10] xfs: add an ioctl to map free space into a file From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020337.1820796.14472849118957827138.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a new ioctl to map free physical space into a file, at the same file offset as if the file were a sparse image of the physical device backing the filesystem. The intent here is to use this to prototype a free space defragmentation tool. Signed-off-by: Darrick J. Wong --- include/xfs_trace.h | 4 + libxfs/libxfs_priv.h | 10 +++ libxfs/xfs_alloc.c | 88 +++++++++++++++++++++++ libxfs/xfs_alloc.h | 3 + libxfs/xfs_bmap.c | 149 +++++++++++++++++++++++++++++++++++++++ libxfs/xfs_bmap.h | 3 + libxfs/xfs_fs.h | 1 libxfs/xfs_fs_staging.h | 15 ++++ man/man2/ioctl_xfs_map_freesp.2 | 76 ++++++++++++++++++++ 9 files changed, 349 insertions(+) create mode 100644 man/man2/ioctl_xfs_map_freesp.2 diff --git a/include/xfs_trace.h b/include/xfs_trace.h index c8368227705..328f276d498 100644 --- a/include/xfs_trace.h +++ b/include/xfs_trace.h @@ -26,6 +26,8 @@ #define trace_xfs_alloc_exact_done(a) ((void) 0) #define trace_xfs_alloc_exact_notfound(a) ((void) 0) #define trace_xfs_alloc_exact_error(a) ((void) 0) +#define trace_xfs_alloc_find_freesp(...) ((void) 0) +#define trace_xfs_alloc_find_freesp_done(...) ((void) 0) #define trace_xfs_alloc_near_first(a) ((void) 0) #define trace_xfs_alloc_near_greater(a) ((void) 0) #define trace_xfs_alloc_near_lesser(a) ((void) 0) @@ -196,6 +198,8 @@ #define trace_xfs_bmap_pre_update(a,b,c,d) ((void) 0) #define trace_xfs_bmap_post_update(a,b,c,d) ((void) 0) +#define trace_xfs_bmapi_freesp(...) ((void) 0) +#define trace_xfs_bmapi_freesp_done(...) 
((void) 0) #define trace_xfs_bunmap(a,b,c,d,e) ((void) 0) #define trace_xfs_read_extent(a,b,c,d) ((void) 0) diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index bbe7dd63443..bc8f66aa986 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -501,6 +501,16 @@ void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_filestream_new_ag(ip,ag) (0) #define xfs_filestream_select_ag(...) (-ENOSYS) +struct xfs_trans; + +static inline int +xfs_rtallocate_extent(struct xfs_trans *tp, xfs_rtblock_t bno, + xfs_extlen_t minlen, xfs_extlen_t maxlen, xfs_extlen_t *len, + int wasdel, xfs_extlen_t prod, xfs_rtblock_t *rtblock) +{ + return -EOPNOTSUPP; +} + #define xfs_trans_inode_buf(tp, bp) ((void) 0) /* quota bits */ diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c index 589e9ef3003..50626d79c3f 100644 --- a/libxfs/xfs_alloc.c +++ b/libxfs/xfs_alloc.c @@ -4092,3 +4092,91 @@ xfs_extfree_intent_destroy_cache(void) kmem_cache_destroy(xfs_extfree_item_cache); xfs_extfree_item_cache = NULL; } + +/* + * Find the next chunk of free space in @pag starting at @agbno and going no + * higher than @end_agbno. Set @agbno and @len to whatever free space we find, + * or to @end_agbno if we find no space. + */ +int +xfs_alloc_find_freesp( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t *agbno, + xfs_agblock_t end_agbno, + xfs_extlen_t *len) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agblock_t found_agbno; + xfs_extlen_t found_len; + int found; + int error; + + trace_xfs_alloc_find_freesp(mp, pag->pag_agno, *agbno, + end_agbno - *agbno); + + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + return error; + + cur = xfs_allocbt_init_cursor(mp, tp, agf_bp, pag, XFS_BTNUM_BNO); + + /* Try to find a free extent that starts before here. */ + error = xfs_alloc_lookup_le(cur, *agbno, 0, &found); + if (error) + goto out_cur; + if (found) { + error = xfs_alloc_get_rec(cur, &found_agbno, &found_len, + &found); + if (error) + goto out_cur; + if (XFS_IS_CORRUPT(mp, !found)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto out_cur; + } + + if (found_agbno + found_len > *agbno) + goto found; + } + + /* Examine the next record if free extent not in range. */ + error = xfs_btree_increment(cur, 0, &found); + if (error) + goto out_cur; + if (!found) + goto next_ag; + + error = xfs_alloc_get_rec(cur, &found_agbno, &found_len, &found); + if (error) + goto out_cur; + if (XFS_IS_CORRUPT(mp, !found)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto out_cur; + } + + if (found_agbno >= end_agbno) + goto next_ag; + +found: + /* Found something, so update the mapping. */ + trace_xfs_alloc_find_freesp_done(mp, pag->pag_agno, found_agbno, + found_len); + if (found_agbno < *agbno) { + found_len -= *agbno - found_agbno; + found_agbno = *agbno; + } + *len = found_len; + *agbno = found_agbno; + goto out_cur; +next_ag: + /* Found nothing, so advance the cursor beyond the end of the range. 
*/ + *agbno = end_agbno; + *len = 0; +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h index 130026e981e..fedb6dc0443 100644 --- a/libxfs/xfs_alloc.h +++ b/libxfs/xfs_alloc.h @@ -290,5 +290,8 @@ void xfs_extfree_intent_destroy_cache(void); xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno, uint32_t length); +int xfs_alloc_find_freesp(struct xfs_trans *tp, struct xfs_perag *pag, + xfs_agblock_t *agbno, xfs_agblock_t end_agbno, + xfs_extlen_t *len); #endif /* __XFS_ALLOC_H__ */ diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c index 13bcf146d08..94640a5077c 100644 --- a/libxfs/xfs_bmap.c +++ b/libxfs/xfs_bmap.c @@ -6436,3 +6436,152 @@ xfs_get_cowextsz_hint( return XFS_DEFAULT_COWEXTSZ_HINT; return a; } + +static inline xfs_fileoff_t +xfs_fsblock_to_fileoff( + struct xfs_mount *mp, + xfs_fsblock_t fsbno) +{ + xfs_daddr_t daddr = XFS_FSB_TO_DADDR(mp, fsbno); + + return XFS_B_TO_FSB(mp, BBTOB(daddr)); +} + +/* + * Given a file and a free physical extent, map it into the file at the same + * offset if the file were a sparse image of the physical device. Set @mval to + * whatever mapping we added to the file. + */ +int +xfs_bmapi_freesp( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fsblock_t fsbno, + xfs_extlen_t len, + struct xfs_bmbt_irec *mval) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t startoff; + bool isrt = XFS_IS_REALTIME_INODE(ip); + int nimaps; + int error; + + trace_xfs_bmapi_freesp(ip, fsbno, len); + + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + return error; + + if (isrt) + startoff = fsbno; + else + startoff = xfs_fsblock_to_fileoff(mp, fsbno); + + /* Make sure the entire range is a hole. */ + nimaps = 1; + error = xfs_bmapi_read(ip, startoff, len, &irec, &nimaps, 0); + if (error) + return error; + + if (irec.br_startoff != startoff || + irec.br_startblock != HOLESTARTBLOCK || + irec.br_blockcount < len) + return -EINVAL; + + /* + * Allocate the physical extent. We should not have dropped the lock + * since the scan of the free space metadata, so this should work, + * though the length may be adjusted to play nicely with metadata space + * reservations. + */ + if (isrt) { + xfs_rtxnum_t rtx_in, rtx_out; + xfs_extlen_t rtxlen_in, rtxlen_out; + uint32_t mod; + + rtx_in = xfs_rtb_to_rtxrem(mp, fsbno, &mod); + if (mod) { + ASSERT(mod == 0); + return -EFSCORRUPTED; + } + + rtxlen_in = xfs_rtb_to_rtxrem(mp, len, &mod); + if (mod) { + ASSERT(mod == 0); + return -EFSCORRUPTED; + } + + error = xfs_rtallocate_extent(tp, rtx_in, 1, rtxlen_in, + &rtxlen_out, 0, 1, &rtx_out); + if (error) + return error; + if (rtx_out == NULLRTEXTNO) { + /* + * We were promised the space! In theory there aren't + * any reserve lists that would prevent us from getting + * the space. 
+ */ + return -ENOSPC; + } + if (rtx_out != rtx_in) { + ASSERT(0); + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + mval->br_blockcount = rtxlen_out * mp->m_sb.sb_rextsize; + } else { + struct xfs_alloc_arg args = { + .mp = mp, + .tp = tp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .resv = XFS_AG_RESV_NONE, + .prod = 1, + .datatype = XFS_ALLOC_USERDATA, + .maxlen = len, + .minlen = 1, + }; + args.pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno)); + error = xfs_alloc_vextent_exact_bno(&args, fsbno); + xfs_perag_put(args.pag); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) { + /* + * We were promised the space, but failed to get it. + * This could be because the space is reserved for + * metadata expansion, or it could be because the AGFL + * fixup grabbed the first block we wanted. Either + * way, if the transaction is dirty we must commit it + * and tell the caller to try again. + */ + if (tp->t_flags & XFS_TRANS_DIRTY) + return -EAGAIN; + return -ENOSPC; + } + if (args.fsbno != fsbno) { + ASSERT(0); + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + mval->br_blockcount = args.len; + } + + /* Map extent into file, update quota. */ + mval->br_startblock = fsbno; + mval->br_startoff = startoff; + mval->br_state = XFS_EXT_UNWRITTEN; + + trace_xfs_bmapi_freesp_done(ip, mval); + + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, mval); + if (isrt) + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_RTBCOUNT, + mval->br_blockcount); + else + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + mval->br_blockcount); + + return 0; +} diff --git a/libxfs/xfs_bmap.h b/libxfs/xfs_bmap.h index 61c195198db..afb54a517f1 100644 --- a/libxfs/xfs_bmap.h +++ b/libxfs/xfs_bmap.h @@ -197,6 +197,9 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); +int xfs_bmapi_freesp(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fsblock_t fsbno, xfs_extlen_t len, + struct xfs_bmbt_irec *mval); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h index 96688f9301e..922e9acfdc3 100644 --- a/libxfs/xfs_fs.h +++ b/libxfs/xfs_fs.h @@ -864,6 +864,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) /* XFS_IOC_GETPARENTS ---- staging 62 */ /* XFS_IOC_RTGROUP_GEOMETRY - staging 63 */ +/* XFS_IOC_MAP_FREESP ---- staging 64 */ /* * ioctl commands that replace IRIX syssgi()'s diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h index f7d872f8a88..2a9effb7a84 100644 --- a/libxfs/xfs_fs_staging.h +++ b/libxfs/xfs_fs_staging.h @@ -303,4 +303,19 @@ xfs_getfsrefs_advance( /* XXX stealing XFS_IOC_GETBIOSIZE */ #define XFS_IOC_GETFSREFCOUNTS _IOWR('X', 47, struct xfs_getfsrefs_head) +/* map free space to file */ + +struct xfs_map_freesp { + __s64 offset; /* disk address to map, in bytes */ + __s64 len; /* length in bytes */ + __u64 flags; /* must be zero */ + __u64 pad; /* must be zero */ +}; + +/* + * XFS_IOC_MAP_FREESP maps all the free physical space in the filesystem into + * the file at the same offsets. This ioctl requires CAP_SYS_ADMIN. 
+ */ +#define XFS_IOC_MAP_FREESP _IOWR('X', 64, struct xfs_map_freesp) + +#endif /* __XFS_FS_STAGING_H__ */ diff --git a/man/man2/ioctl_xfs_map_freesp.2 b/man/man2/ioctl_xfs_map_freesp.2 new file mode 100644 index 00000000000..ca1d9882437 --- /dev/null +++ b/man/man2/ioctl_xfs_map_freesp.2 @@ -0,0 +1,76 @@ +.\" Copyright (c) 2023-2024 Oracle. All rights reserved. +.\" +.\" %%%LICENSE_START(GPLv2+_DOC_FULL) +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" %%%LICENSE_END +.TH IOCTL-XFS-MAP-FREESP 2 2023-11-17 "XFS" +.SH NAME +ioctl_xfs_map_freesp \- map free space into a file +.SH SYNOPSIS +.br +.B #include +.PP +.BI "int ioctl(int " fd ", XFS_IOC_MAP_FREESP, struct xfs_map_freesp *" arg ); +.SH DESCRIPTION +Maps free space into the sparse ranges of a regular file. +This ioctl uses +.B struct xfs_map_freesp +to specify the range of free space to be mapped: +.PP +.in +4n +.nf +struct xfs_map_freesp { + __s64 offset; + __s64 len; + __u64 flags; + __u64 pad; +}; +.fi +.in +.PP +.I offset +is the physical disk address, in bytes, of the start of the range to scan. +Each free space extent in this range will be mapped to the file if the +corresponding range of the file is sparse. +.PP +.I len +is the number of bytes in the range to scan. +.PP +.I flags +must be zero; there are no flags defined yet. +.PP +.I pad +must be zero. +.SH RETURN VALUE +On error, \-1 is returned, and +.I errno +is set to indicate the error. +.PP +.SH ERRORS +Error codes can be one of, but are not limited to, the following: +.TP +.B EFAULT +The kernel was not able to copy into the userspace buffer. +.TP +.B EFSBADCRC +Metadata checksum validation failed while performing the query. +.TP +.B EFSCORRUPTED +Metadata corruption was encountered while performing the query. +.TP +.B EINVAL +One of the arguments was not valid, +or the file was not sparse. +.TP +.B EIO +An I/O error was encountered while performing the query. +.TP +.B ENOMEM +There was insufficient memory to perform the query. +.TP +.B ENOSPC +There was insufficient disk space to commit the space mappings. +.SH CONFORMING TO +This API is specific to the XFS filesystem on the Linux kernel. +.SH SEE ALSO +.BR ioctl (2)
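As a concrete illustration of the interface documented above, the following minimal sketch shows how a userspace program might drive the ioctl; it is modeled on the xfs_io "fmapfree" command added later in this series, and it re-declares struct xfs_map_freesp and XFS_IOC_MAP_FREESP locally (mirroring the staging header above) only so that the example is self-contained.

/*
 * Illustrative sketch only: map the free space in a physical byte range of
 * an XFS filesystem into a sparse regular file.  The structure and ioctl
 * number mirror the definitions added to xfs_fs_staging.h by this patch;
 * a real program would get them from the XFS headers instead.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

struct xfs_map_freesp {
	__s64	offset;	/* disk address to map, in bytes */
	__s64	len;	/* length in bytes */
	__u64	flags;	/* must be zero */
	__u64	pad;	/* must be zero */
};
#define XFS_IOC_MAP_FREESP	_IOWR('X', 64, struct xfs_map_freesp)

int
main(int argc, char **argv)
{
	struct xfs_map_freesp	args = { 0 };
	int			fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s file phys_offset length\n", argv[0]);
		return 1;
	}

	/* Physical byte range of the filesystem to scan for free space. */
	args.offset = strtoll(argv[2], NULL, 0);
	args.len = strtoll(argv[3], NULL, 0);

	/*
	 * The fd must refer to a (sparse) regular file on the filesystem
	 * whose free space is being mapped; the caller needs CAP_SYS_ADMIN.
	 */
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror(argv[1]);
		return 1;
	}

	if (ioctl(fd, XFS_IOC_MAP_FREESP, &args) < 0) {
		perror("XFS_IOC_MAP_FREESP");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}

Each free extent in the given range that corresponds to a hole in the file is allocated and mapped at the matching file offset, which is the building block for the free space defragmentation experiments in the rest of this series.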
Wong" X-Patchwork-Id: 13508400 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 113E57F9 for ; Mon, 1 Jan 2024 00:40:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="r+XrR94k" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 84CA9C433C8; Mon, 1 Jan 2024 00:40:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069630; bh=ytI1yiozzpMvl5cxYBwwexdPS8/X1qnNlJV0hyfluow=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=r+XrR94k+rTTF5dC46rqQ9V38ikuMil48AOtgY3AjZGIpDDgsFkn2xcpbs2lWGqo9 alhsL4bDYbsCpx6F7MI8zrnOsWNrs34zyfHUhOwlTC3Y7UtUFJVHbH0BO0rKo43tjJ 26z+UxHz1Y4AG116mLbApbi8KdMvrf4WtXP2L0y6M1K0+viw+tCZNhTcRawuNtvjMj pXu5cZG69bYq0IM/lqXY3EiPbpJ820kSrBeuqqN1pXOk1/OP5s3AgQ6BRsI7zQnQKE BD64XLpY6OlNpaSobaPiTDIUxvTsgps3XK0OOdS16lcQ3ETqdBsEKDsW0/Eg1i/gXs GLdscxgl0SkvA== Date: Sun, 31 Dec 2023 16:40:30 +9900 Subject: [PATCH 02/10] xfs_io: support using XFS_IOC_MAP_FREESP to map free space From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020351.1820796.4340744744634500451.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a command to call XFS_IOC_MAP_FREESP. This is experimental code to see if we can build a free space defragmenter out of this. Signed-off-by: Darrick J. 
Wong --- io/prealloc.c | 35 +++++++++++++++++++++++++++++++++++ man/man8/xfs_io.8 | 8 +++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/io/prealloc.c b/io/prealloc.c index 5805897a4a0..0de3e142de1 100644 --- a/io/prealloc.c +++ b/io/prealloc.c @@ -45,6 +45,7 @@ static cmdinfo_t finsert_cmd; static cmdinfo_t fzero_cmd; static cmdinfo_t funshare_cmd; #endif +static cmdinfo_t fmapfree_cmd; static int offset_length( @@ -383,6 +384,30 @@ funshare_f( } #endif /* HAVE_FALLOCATE */ +static int +fmapfree_f( + int argc, + char **argv) +{ + struct xfs_flock64 segment; + struct xfs_map_freesp args = { }; + + if (!offset_length(argv[1], argv[2], &segment)) { + exitcode = 1; + return 0; + } + + args.offset = segment.l_start; + args.len = segment.l_len; + + if (ioctl(file->fd, XFS_IOC_MAP_FREESP, &args)) { + perror("XFS_IOC_MAP_FREESP"); + exitcode = 1; + return 0; + } + return 0; +} + void prealloc_init(void) { @@ -497,4 +522,14 @@ prealloc_init(void) _("unshares shared blocks within the range"); add_command(&funshare_cmd); #endif /* HAVE_FALLOCATE */ + + fmapfree_cmd.name = "fmapfree"; + fmapfree_cmd.cfunc = fmapfree_f; + fmapfree_cmd.argmin = 2; + fmapfree_cmd.argmax = 2; + fmapfree_cmd.flags = CMD_NOMAP_OK | CMD_FOREIGN_OK; + fmapfree_cmd.args = _("off len"); + fmapfree_cmd.oneline = + _("maps free space into a file"); + add_command(&fmapfree_cmd); } diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8 index 411144151a1..e360d22dc38 100644 --- a/man/man8/xfs_io.8 +++ b/man/man8/xfs_io.8 @@ -513,8 +513,14 @@ Call fallocate with FALLOC_FL_INSERT_RANGE flag as described in the .BR fallocate (2) manual page to create the hole by shifting data blocks. .TP +.BI fmapfree " offset length" +Maps free physical space into the file by calling XFS_IOC_MAP_FREESP as +described in the +.BR XFS_IOC_MAP_FREESP (2) +manual page. +.TP .BI fpunch " offset length" -Punches (de-allocates) blocks in the file by calling fallocate with +Punches (de-allocates) blocks in the file by calling fallocate with the FALLOC_FL_PUNCH_HOLE flag as described in the .BR fallocate (2) manual page. From patchwork Wed Dec 27 13:40:45 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508401 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 652407EE for ; Mon, 1 Jan 2024 00:40:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="TX+k2W1h" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 34A8AC433C7; Mon, 1 Jan 2024 00:40:46 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069646; bh=uR6U5qNJR2PHkazOLWPItQVFrMbUjauWJmA0/2ojXE4=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=TX+k2W1habeejAu+5y+ACxtNZ11ahLCxUitmtly50QLD/GZPW8mM8yUAH5wUD8das uKZJGzyBNbODs2XKCKp5/hUl2AJRIetrpfKnzZf0mimEU4MZVn5gXmk6BIUpcxz6Py RZfVXJ5eidYZ8YrnJxbT85q0dQTbisAK+NtOhChW6GSph1/HbZcETKhPzxj4/D/w2G PtUnpCHS5ojHh6musikk9iXP+xWvD0mFCIesi0DBCp9ljQGjqgmBwAeC8tRAHRtcPX Q4bSx16Ji0kt5TY5VpJlbvwIiBQhaireq6Dq5yLKaB1EWpZZFdj+Q+MAyzt+ohVNb8 GekHIhk+NkdfQ== Date: Sun, 31 Dec 2023 16:40:45 +9900 Subject: [PATCH 03/10] xfs_db: get and put blocks on the AGFL From: "Darrick J. 
Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020364.1820796.4627927059186718750.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a new xfs_db command to let people add and remove blocks from an AGFL. This isn't really related to rmap btree reconstruction, other than enabling debugging code to mess around with the AGFL to exercise various odd scenarios. Signed-off-by: Darrick J. Wong --- db/agfl.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++ libxfs/libxfs_api_defs.h | 4 + man/man8/xfs_db.8 | 11 ++ 3 files changed, 308 insertions(+), 4 deletions(-) diff --git a/db/agfl.c b/db/agfl.c index f0f3f21a64d..662b6403cb2 100644 --- a/db/agfl.c +++ b/db/agfl.c @@ -15,13 +15,14 @@ #include "output.h" #include "init.h" #include "agfl.h" +#include "libfrog/bitmap.h" static int agfl_bno_size(void *obj, int startoff); static int agfl_f(int argc, char **argv); static void agfl_help(void); static const cmdinfo_t agfl_cmd = - { "agfl", NULL, agfl_f, 0, 1, 1, N_("[agno]"), + { "agfl", NULL, agfl_f, 0, -1, 1, N_("[agno] [-g nr] [-p nr]"), N_("set address to agfl block"), agfl_help }; const field_t agfl_hfld[] = { { @@ -77,10 +78,280 @@ agfl_help(void) " for each allocation group. This acts as a reserved pool of space\n" " separate from the general filesystem freespace (not used for user data).\n" "\n" +" -g quantity\tRemove this many blocks from the AGFL.\n" +" -p quantity\tAdd this many blocks to the AGFL.\n" +"\n" )); } +struct dump_info { + struct xfs_perag *pag; + bool leak; +}; + +/* Return blocks freed from the AGFL to the free space btrees. */ +static int +free_grabbed( + uint64_t start, + uint64_t length, + void *data) +{ + struct dump_info *di = data; + struct xfs_perag *pag = di->pag; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; + struct xfs_buf *agf_bp; + int error; + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); + if (error) + return error; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + goto out_cancel; + + error = -libxfs_free_extent(tp, pag, start, length, &XFS_RMAP_OINFO_AG, + XFS_AG_RESV_AGFL); + if (error) + goto out_cancel; + + return -libxfs_trans_commit(tp); + +out_cancel: + libxfs_trans_cancel(tp); + return error; +} + +/* Report blocks freed from the AGFL. */ +static int +dump_grabbed( + uint64_t start, + uint64_t length, + void *data) +{ + struct dump_info *di = data; + const char *fmt; + + if (length == 1) + fmt = di->leak ? _("agfl %u: leaked agbno %u\n") : + _("agfl %u: removed agbno %u\n"); + else + fmt = di->leak ? _("agfl %u: leaked agbno %u-%u\n") : + _("agfl %u: removed agbno %u-%u\n"); + + printf(fmt, di->pag->pag_agno, (unsigned int)start, + (unsigned int)(start + length - 1)); + return 0; +} + +/* Remove blocks from the AGFL. 
*/ +static int +agfl_get( + struct xfs_perag *pag, + int quantity) +{ + struct dump_info di = { + .pag = pag, + .leak = quantity < 0, + }; + struct xfs_agf *agf; + struct xfs_buf *agf_bp; + struct xfs_trans *tp; + struct bitmap *grabbed; + const unsigned int agfl_size = libxfs_agfl_size(pag->pag_mount); + unsigned int i; + int error; + + if (!quantity) + return 0; + + if (di.leak) + quantity = -quantity; + quantity = min(quantity, agfl_size); + + error = bitmap_alloc(&grabbed); + if (error) + goto out; + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, quantity, 0, + 0, &tp); + if (error) + goto out_bitmap; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + goto out_cancel; + + agf = agf_bp->b_addr; + quantity = min(quantity, be32_to_cpu(agf->agf_flcount)); + + for (i = 0; i < quantity; i++) { + xfs_agblock_t agbno; + + error = -libxfs_alloc_get_freelist(pag, tp, agf_bp, &agbno, 0); + if (error) + goto out_cancel; + + if (agbno == NULLAGBLOCK) { + error = ENOSPC; + goto out_cancel; + } + + error = bitmap_set(grabbed, agbno, 1); + if (error) + goto out_cancel; + } + + error = -libxfs_trans_commit(tp); + if (error) + goto out_bitmap; + + error = bitmap_iterate(grabbed, dump_grabbed, &di); + if (error) + goto out_bitmap; + + if (!di.leak) { + error = bitmap_iterate(grabbed, free_grabbed, &di); + if (error) + goto out_bitmap; + } + + bitmap_free(&grabbed); + return 0; + +out_cancel: + libxfs_trans_cancel(tp); +out_bitmap: + bitmap_free(&grabbed); +out: + if (error) + printf(_("agfl %u: %s\n"), pag->pag_agno, strerror(error)); + return error; +} + +/* Add blocks to the AGFL. */ +static int +agfl_put( + struct xfs_perag *pag, + int quantity) +{ + struct xfs_alloc_arg args = { + .mp = pag->pag_mount, + .alignment = 1, + .minlen = 1, + .prod = 1, + .resv = XFS_AG_RESV_AGFL, + .oinfo = XFS_RMAP_OINFO_AG, + }; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + struct xfs_trans *tp; + xfs_fsblock_t target; + const unsigned int agfl_size = libxfs_agfl_size(pag->pag_mount); + unsigned int i; + bool eoag = quantity < 0; + int error; + + if (!quantity) + return 0; + + if (eoag) + quantity = -quantity; + quantity = min(quantity, agfl_size); + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, quantity, 0, + 0, &tp); + if (error) + return error; + args.tp = tp; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &args.agbp); + if (error) + goto out_cancel; + + agf = args.agbp->b_addr; + args.maxlen = min(quantity, agfl_size - be32_to_cpu(agf->agf_flcount)); + + if (eoag) + target = XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, + be32_to_cpu(agf->agf_length) - 1); + else + target = XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, 0); + + error = -libxfs_alloc_read_agfl(pag, tp, &agfl_bp); + if (error) + goto out_cancel; + + error = -libxfs_alloc_vextent_near_bno(&args, target); + if (error) + goto out_cancel; + + if (args.agbno == NULLAGBLOCK) { + error = ENOSPC; + goto out_cancel; + } + + for (i = 0; i < args.len; i++) { + error = -libxfs_alloc_put_freelist(pag, tp, args.agbp, + agfl_bp, args.agbno + i, 0); + if (error) + goto out_cancel; + } + + if (i == 1) + printf(_("agfl %u: added agbno %u\n"), pag->pag_agno, + args.agbno); + else if (i > 1) + printf(_("agfl %u: added agbno %u-%u\n"), pag->pag_agno, + args.agbno, args.agbno + i - 1); + + error = -libxfs_trans_commit(tp); + if (error) + goto out; + + return 0; + +out_cancel: + libxfs_trans_cancel(tp); +out: + if (error) + printf(_("agfl %u: %s\n"), pag->pag_agno, strerror(error)); + return error; +} + +static void 
+agfl_adjust( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int gblocks, + int pblocks) +{ + struct xfs_perag *pag; + int error; + + if (!expert_mode) { + printf(_("AGFL get/put only supported in expert mode.\n")); + exitcode = 1; + return; + } + + pag = libxfs_perag_get(mp, agno); + + error = agfl_get(pag, gblocks); + if (error) + goto out_pag; + + error = agfl_put(pag, pblocks); + +out_pag: + libxfs_perag_put(pag); + if (error) + exitcode = 1; +} + static int agfl_f( int argc, @@ -88,9 +359,25 @@ agfl_f( { xfs_agnumber_t agno; char *p; + int c; + int gblocks = 0, pblocks = 0; - if (argc > 1) { - agno = (xfs_agnumber_t)strtoul(argv[1], &p, 0); + while ((c = getopt(argc, argv, "g:p:")) != -1) { + switch (c) { + case 'g': + gblocks = atoi(optarg); + break; + case 'p': + pblocks = atoi(optarg); + break; + default: + agfl_help(); + return 1; + } + } + + if (argc > optind) { + agno = (xfs_agnumber_t)strtoul(argv[optind], &p, 0); if (*p != '\0' || agno >= mp->m_sb.sb_agcount) { dbprintf(_("bad allocation group number %s\n"), argv[1]); return 0; @@ -98,6 +385,10 @@ agfl_f( cur_agno = agno; } else if (cur_agno == NULLAGNUMBER) cur_agno = 0; + + if (gblocks || pblocks) + agfl_adjust(mp, cur_agno, gblocks, pblocks); + ASSERT(typtab[TYP_AGFL].typnm == TYP_AGFL); set_cur(&typtab[TYP_AGFL], XFS_AG_DADDR(mp, cur_agno, XFS_AGFL_DADDR(mp)), diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h index 4e7b3caba4b..52616086ef0 100644 --- a/libxfs/libxfs_api_defs.h +++ b/libxfs/libxfs_api_defs.h @@ -30,8 +30,12 @@ #define xfs_allocbt_maxrecs libxfs_allocbt_maxrecs #define xfs_allocbt_stage_cursor libxfs_allocbt_stage_cursor #define xfs_alloc_fix_freelist libxfs_alloc_fix_freelist +#define xfs_alloc_get_freelist libxfs_alloc_get_freelist #define xfs_alloc_min_freelist libxfs_alloc_min_freelist +#define xfs_alloc_put_freelist libxfs_alloc_put_freelist #define xfs_alloc_read_agf libxfs_alloc_read_agf +#define xfs_alloc_read_agfl libxfs_alloc_read_agfl +#define xfs_alloc_vextent_near_bno libxfs_alloc_vextent_near_bno #define xfs_alloc_vextent_start_ag libxfs_alloc_vextent_start_ag #define xfs_ascii_ci_hashname libxfs_ascii_ci_hashname diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8 index 3e80bcc57de..39461398c6a 100644 --- a/man/man8/xfs_db.8 +++ b/man/man8/xfs_db.8 @@ -182,10 +182,19 @@ Set current address to the AGF block for allocation group .IR agno . If no argument is given, use the current allocation group. .TP -.BI "agfl [" agno ] +.BI "agfl [" agno "] [\-g " " quantity" "] [\-p " quantity ] Set current address to the AGFL block for allocation group .IR agno . If no argument is given, use the current allocation group. +If the +.B -g +option is specified with a positive quantity, remove that many blocks from the +AGFL and put them in the free space btrees. +If the quantity is negative, remove the blocks and leak them. +If the +.B -p +option is specified, add that many blocks to the AGFL. +If the quantity is negative, the blocks are selected from the end of the AG. .TP .BI "agi [" agno ] Set current address to the AGI block for allocation group From patchwork Wed Dec 27 13:41:01 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13508402 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7730D7EE for ; Mon, 1 Jan 2024 00:41:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="J/qyNQA2" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D8156C433C8; Mon, 1 Jan 2024 00:41:01 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069662; bh=CIB/kOXF/4fN5a8KHE9f7GLyw6bp7qiycEBrJSFvyEA=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=J/qyNQA2LJORgHQNOHHw/nAkxHbxDI0c025ZlFMQgnhvAhang0oQIKgw5UbIWpPS+ 64l1mLoo784nLWk33UbJDxfoOpLLnuKo9+Nni30IAs0xOxCLgpSDtxyW6A09p+yjfi FUMY27/z/8kugWZvs6R/EIyHYFMGZFubFYO2M1nJpf7g4y4z2CnB+6yOvOs34KETtT ca3cIIDKbKIRprzIl2H35sflSbHmS7fpIE90Wn0L0Su88cL9wiJUBk791K35c8MWmy VhoxXkODpbAbiGMCgek71E51PbXI5tfde+BVZejGtmoiy15PL1qU4bREYJDAGEN2Eq SmGs6/twLzGDQ== Date: Sun, 31 Dec 2023 16:41:01 +9900 Subject: [PATCH 04/10] xfs_spaceman: implement clearing free space From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020377.1820796.2819816737247793332.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong First attempt at evacuating all the used blocks from part of a filesystem. Signed-off-by: Darrick J. Wong --- Makefile | 2 libfrog/Makefile | 5 libfrog/clearspace.c | 3103 +++++++++++++++++++++++++++++++++++++++++++++++ libfrog/clearspace.h | 72 + man/man8/xfs_spaceman.8 | 17 spaceman/Makefile | 2 spaceman/clearfree.c | 164 ++ spaceman/init.c | 1 spaceman/space.h | 2 9 files changed, 3366 insertions(+), 2 deletions(-) create mode 100644 libfrog/clearspace.c create mode 100644 libfrog/clearspace.h create mode 100644 spaceman/clearfree.c diff --git a/Makefile b/Makefile index c12df98dbef..f3a0249979c 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ quota: libxcmd repair: libxlog libxcmd copy: libxlog mkfs: libxcmd -spaceman: libxcmd +spaceman: libhandle libxcmd scrub: libhandle libxcmd rtcp: libfrog diff --git a/libfrog/Makefile b/libfrog/Makefile index 8c2d040bc29..9f2cf25ac9f 100644 --- a/libfrog/Makefile +++ b/libfrog/Makefile @@ -64,6 +64,11 @@ ifeq ($(HAVE_GETMNTENT),yes) LCFLAGS += -DHAVE_GETMNTENT endif +ifeq ($(HAVE_GETFSMAP),yes) +CFILES+=clearspace.c +HFILES+=clearspace.h +endif + LDIRT = gen_crc32table crc32table.h default: ltdepend $(LTLIBRARY) diff --git a/libfrog/clearspace.c b/libfrog/clearspace.c new file mode 100644 index 00000000000..2cbb260546b --- /dev/null +++ b/libfrog/clearspace.c @@ -0,0 +1,3103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include +#include "paths.h" +#include "fsgeom.h" +#include "logging.h" +#include "bulkstat.h" +#include "bitmap.h" +#include "file_exchange.h" +#include "clearspace.h" +#include "handle.h" + +/* + * Filesystem Space Balloons + * ========================= + * + * NOTE: Due to the evolving identity of this code, the "space_fd" or "space + * file" in the codebase are the same as the balloon file in this introduction. + * The introduction was written much later than the code. + * + * The goal of this code is to create a balloon file that is mapped to a range + * of the physical space that is managed by a filesystem. There are several + * uses envisioned for balloon files: + * + * 1. Defragmenting free space. Once the balloon is created, freeing it leaves + * a large chunk of contiguous free space ready for reallocation. + * + * 2. Shrinking the filesystem. If the balloon is inflated at the end of the + * filesystem, the file can be handed to the shrink code. The shrink code + * can then reduce the filesystem size by the size of the balloon. + * + * 3. Constraining usage of underlying thin provisioning pools. The space + * assigned to a balloon can be DISCARDed, which prevents the filesystem + * from using that space until the balloon is freed. This can be done more + * efficiently with the standard fallocate call, unless the balloon must + * target specific LBA ranges. + * + * Inflating a balloon is performed in five phases: claiming unused space; + * freezing used space; migrating file mappings away from frozen space; moving + * inodes; and rebuilding metadata elsewhere. + * + * Claiming Unused Space + * --------------------- + * + * The first step of inflating a file balloon is to define the range of + * physical space to be added to the balloon and claim as much of the free + * space inside that range as possible. Dirty data are flushed to disk and + * the block and inode garbage collectors are run to remove any speculative + * preallocations that might be occupying space in the target range. + * + * Second, the new XFS_IOC_MAP_FREESP ioctl is used to map free space in the + * target range to the balloon file. This step will be repeated after every + * space-clearing step below to capture that cleared space. Concurrent writer + * threads will (hopefully) be allocated space outside the target range. + * + * Freezing Used Space + * ------------------- + * + * The second phase of inflating the balloon is to freeze as much of the + * allocated space within the target range as possible. The purpose of this + * step is to grab a second reference to the used space, thereby preventing it + * from being reused elsewhere. + * + * Freezing of a physical space extent starts by using GETFSMAP to find the + * file owner of the space, and opening the file by handle. The fsmap record + * is used to create a FICLONERANGE request to link the file range into a work + * file. Once the reflink is made, any subsequent writes to any of the owners + * of that space are staged via copy on write. The balloon file prevents the + * copy on write from being staged within the target range. The frozen space + * mapping is moved from the work file to the balloon file, where it remains + * until the balloon file is freed. + * + * If reflink is not supported on the filesystem, used space cannot be frozen. + * This phase is skipped. 
+ * + * Migrating File Mappings + * ----------------------- + * + * Once the balloon file has been populated with as much of the target range as + * possible, it is time to remap file ranges that point to the frozen space. + * + * It is advantageous to remap as many blocks as can be done with as few system + * calls as possible to avoid fragmenting files. Furthermore, it is preferable + * to remap heavily shared extents before lightly shared extents to preserve + * reflinks when possible. The new GETFSREFCOUNTS call is used to rank + * physical space extents by size and sharing factor so that the library always + * tries to relocate the highest ranking space extent. + * + * Once a space extent has been selected for relocation, it is reflinked from + * the balloon file into the work file. Next, fallocate is called with the + * FALLOC_FL_UNSHARE_RANGE mode to persist a new copy of the file data and + * update the mapping in the work file. The GETFSMAP call is used to find the + * remaining owners of the target space. For each owner, FIDEDUPERANGE is + * used to change the owner file's mapping to the space in the work file if the + * owner has not been changed. + * + * If the filesystem does not support reflink, FIDEDUPERANGE will not be + * available. Fortunately, there will only be one owner of the frozen space. + * The file range contents are instead copied through the page cache to the + * work file, and EXCHANGE_RANGE is used to swap the mappings if the owner + * file has not been modified. + * + * When the only remaining owner of the space is the balloon file, return to + * the GETFSREFCOUNTS step to find a new target. This phase is complete when + * there are no more targets. + * + * Moving Inodes + * ------------- + * + * NOTE: This part is not written. + * + * When GETFSMAP tells us about an inode chunk, it is necessary to move the + * inodes allocated in that inode chunk to a new chunk. The first step is to + * create a new donor file whose inode record is not in the target range. This + * file must be created in a donor directory. Next, the file contents should + * be cloned, either via FICLONE for regular files or by copying the directory + * entries for directories. The caller must ensure that no programs write to + * the victim inode while this process is ongoing. + * + * Finally, the new inode must be mapped into the same points in the directory + * tree as the old inode. For each parent pointer accessible by the file, + * perform a RENAME_EXCHANGE operation to update the directory entry. One + * obvious flaw of this method is that we cannot specify (parent, name, child) + * pairs to renameat, which means that the rename does the wrong thing if + * either directory is updated concurrently. + * + * If parent pointers are not available, this phase could be performed slowly + * by iterating all directories looking for entries of interest and swapping + * them. + * + * It is required that the caller guarantee that other applications cannot + * update the filesystem concurrently. + * + * Rebuilding Metadata + * ------------------- + * + * The final phase identifies filesystem metadata occupying the target range + * and uses the online filesystem repair facility to rebuild the metadata + * structures. Assuming that the balloon file now maps most of the space in + * the target range, the new structures should be located outside of the target + * range. This phase runs in a loop until there is no more metadata to + * relocate or no progress can be made on relocating metadata. 
+ * + * Limitations and Bugs + * -------------------- + * + * - This code must be able to find the owners of a range of physical space. + * If GETFSMAP does not return owner information, this code cannot succeed. + * In other words, reverse mapping must be enabled. + * + * - We cannot freeze EOF blocks because the FICLONERANGE code does not allow + * us to remap an EOF block into the middle of the balloon file. I think we + * actually succeed at reflinking the EOF block into the work file during the + * freeze step, but we need to dedupe/exchange the real owners' mappings + * without waiting for the freeze step. OTOH, we /also/ want to freeze as + * much space as quickly as we can. + * + * - Freeze cannot use FIECLONERANGE to reflink unwritten extents into the work + * file because FICLONERANGE ignores unwritten extents. We could create the + * work file as a sparse file and use EXCHANGE_RANGE to swap the unwritten + * extent with the hole, extend EOF to be allocunit aligned, and use + * EXCHANGE_RANGE to move it to the balloon file. That first exchange must + * be careful to sample the owner file's bulkstat data, re-measure the file + * range to confirm that the unwritten extent is still the one we want, and + * only exchange if the owner file has not changed. + * + * - csp_buffercopy seems to hang if pread returns zero bytes read. Do we dare + * use copy_file_range for this instead? + * + * - None of this code knows how to move inodes. Phase 4 is entirely + * speculative fiction rooted in Dave Chinner's earlier implementation. + * + * - Does this work for realtime files? Even for large rt extent sizes? + */ + +/* VFS helpers */ + +/* Remap the file range described by @fcr into fd, or return an errno. */ +static inline int +clonerange(int fd, struct file_clone_range *fcr) +{ + int ret; + + ret = ioctl(fd, FICLONERANGE, fcr); + if (ret) + return errno; + + return 0; +} + +/* Exchange the file ranges described by @xchg into fd, or return an errno. */ +static inline int +exchangerange(int fd, struct xfs_exch_range *xchg) +{ + int ret; + + ret = ioctl(fd, XFS_IOC_EXCHANGE_RANGE, xchg); + if (ret) + return errno; + + return 0; +} + +/* + * Deduplicate part of fd into the file range described by fdr. If the + * operation succeeded, we set @same to whether or not we deduped the data and + * return zero. If not, return an errno. + */ +static inline int +deduperange(int fd, struct file_dedupe_range *fdr, bool *same) +{ + struct file_dedupe_range_info *info = &fdr->info[0]; + int ret; + + assert(fdr->dest_count == 1); + *same = false; + + ret = ioctl(fd, FIDEDUPERANGE, fdr); + if (ret) + return errno; + + if (info->status < 0) + return -info->status; + + if (info->status == FILE_DEDUPE_RANGE_DIFFERS) + return 0; + + /* The kernel should never dedupe more than it was asked. 
*/ + assert(fdr->src_length >= info->bytes_deduped); + + *same = true; + return 0; +} + +/* Space clearing operation control */ + +#define QUERY_BATCH_SIZE 1024 + +struct clearspace_tgt { + unsigned long long start; + unsigned long long length; + unsigned long long owners; + unsigned long long prio; + unsigned long long evacuated; + bool try_again; +}; + +struct clearspace_req { + struct xfs_fd *xfd; + + /* all the blocks that we've tried to clear */ + struct bitmap *visited; + + /* stat buffer of the open file */ + struct stat statbuf; + struct stat temp_statbuf; + struct stat space_statbuf; + + /* handle to this filesystem */ + void *fshandle; + size_t fshandle_sz; + + /* physical storage that we want to clear */ + unsigned long long start; + unsigned long long length; + dev_t dev; + + /* convenience variable */ + bool realtime:1; + bool use_reflink:1; + bool can_evac_metadata:1; + + /* + * The "space capture" file. Each extent in this file must be mapped + * to the same byte offset as the byte address of the physical space. + */ + int space_fd; + + /* work file for migrating file data */ + int work_fd; + + /* preallocated buffers for queries */ + struct getbmapx *bhead; + struct fsmap_head *mhead; + struct xfs_getfsrefs_head *rhead; + + /* buffer for copying data */ + char *buf; + + /* buffer for deduping data */ + struct file_dedupe_range *fdr; + + /* tracing mask and indent level */ + unsigned int trace_mask; + unsigned int trace_indent; +}; + +static inline bool +csp_is_internal_owner( + const struct clearspace_req *req, + unsigned long long owner) +{ + return owner == req->temp_statbuf.st_ino || + owner == req->space_statbuf.st_ino; +} + +/* Debugging stuff */ + +static const struct csp_errstr { + unsigned int mask; + const char *tag; +} errtags[] = { + { CSP_TRACE_FREEZE, "freeze" }, + { CSP_TRACE_GRAB, "grab" }, + { CSP_TRACE_PREP, "prep" }, + { CSP_TRACE_TARGET, "target" }, + { CSP_TRACE_DEDUPE, "dedupe" }, + { CSP_TRACE_EXCHANGE, "exchange_range" }, + { CSP_TRACE_XREBUILD, "rebuild" }, + { CSP_TRACE_EFFICACY, "efficacy" }, + { CSP_TRACE_SETUP, "setup" }, + { CSP_TRACE_DUMPFILE, "dumpfile" }, + { CSP_TRACE_BITMAP, "bitmap" }, + + /* prioritize high level functions over low level queries for tagging */ + { CSP_TRACE_FSMAP, "fsmap" }, + { CSP_TRACE_FSREFS, "fsrefs" }, + { CSP_TRACE_BMAPX, "bmapx" }, + { CSP_TRACE_FALLOC, "falloc" }, + { CSP_TRACE_STATUS, "status" }, + { 0, NULL }, +}; + +static void +csp_debug( + struct clearspace_req *req, + unsigned int mask, + const char *func, + int line, + const char *format, + ...) +{ + const struct csp_errstr *et = errtags; + bool debug = (req->trace_mask & ~CSP_TRACE_STATUS); + int indent = req->trace_indent; + va_list args; + + if ((req->trace_mask & mask) != mask) + return; + + if (debug) { + while (indent > 0) { + fprintf(stderr, " "); + indent--; + } + + for (; et->tag; et++) { + if (et->mask & mask) { + fprintf(stderr, "%s: ", et->tag); + break; + } + } + } + + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + + if (debug) + fprintf(stderr, " (line %d)\n", line); + else + fprintf(stderr, "\n"); + fflush(stderr); +} + +#define trace_freeze(req, format, ...) \ + csp_debug((req), CSP_TRACE_FREEZE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_grabfree(req, format, ...) \ + csp_debug((req), CSP_TRACE_GRAB, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsmap(req, format, ...) 
\ + csp_debug((req), CSP_TRACE_FSMAP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsmap_rec(req, mask, mrec) \ + while (!csp_is_internal_owner((req), (mrec)->fmr_owner)) { \ + csp_debug((req), (mask) | CSP_TRACE_FSMAP, __func__, __LINE__, \ +"fsmap phys 0x%llx owner 0x%llx offset 0x%llx bytecount 0x%llx flags 0x%x", \ + (unsigned long long)(mrec)->fmr_physical, \ + (unsigned long long)(mrec)->fmr_owner, \ + (unsigned long long)(mrec)->fmr_offset, \ + (unsigned long long)(mrec)->fmr_length, \ + (mrec)->fmr_flags); \ + break; \ + } + +#define trace_fsrefs(req, format, ...) \ + csp_debug((req), CSP_TRACE_FSREFS, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsrefs_rec(req, mask, rrec) \ + csp_debug((req), (mask) | CSP_TRACE_FSREFS, __func__, __LINE__, \ +"fsref phys 0x%llx bytecount 0x%llx owners %llu flags 0x%x", \ + (unsigned long long)(rrec)->fcr_physical, \ + (unsigned long long)(rrec)->fcr_length, \ + (unsigned long long)(rrec)->fcr_owners, \ + (rrec)->fcr_flags) + +#define trace_bmapx(req, format, ...) \ + csp_debug((req), CSP_TRACE_BMAPX, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_bmapx_rec(req, mask, brec) \ + csp_debug((req), (mask) | CSP_TRACE_BMAPX, __func__, __LINE__, \ +"bmapx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%x", \ + (unsigned long long)BBTOB((brec)->bmv_offset), \ + (unsigned long long)BBTOB((brec)->bmv_length), \ + (unsigned long long)BBTOB((brec)->bmv_block), \ + (brec)->bmv_oflags) + +#define trace_prep(req, format, ...) \ + csp_debug((req), CSP_TRACE_PREP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_target(req, format, ...) \ + csp_debug((req), CSP_TRACE_TARGET, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_dedupe(req, format, ...) \ + csp_debug((req), CSP_TRACE_DEDUPE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_falloc(req, format, ...) \ + csp_debug((req), CSP_TRACE_FALLOC, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_exchange(req, format, ...) \ + csp_debug((req), CSP_TRACE_EXCHANGE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_xrebuild(req, format, ...) \ + csp_debug((req), CSP_TRACE_XREBUILD, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_setup(req, format, ...) \ + csp_debug((req), CSP_TRACE_SETUP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_status(req, format, ...) \ + csp_debug((req), CSP_TRACE_STATUS, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_dumpfile(req, format, ...) \ + csp_debug((req), CSP_TRACE_DUMPFILE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_bitmap(req, format, ...) \ + csp_debug((req), CSP_TRACE_BITMAP, __func__, __LINE__, format, __VA_ARGS__) + +/* VFS Iteration helpers */ + +static inline void +start_spacefd_iter(struct clearspace_req *req) +{ + req->trace_indent++; +} + +static inline void +end_spacefd_iter(struct clearspace_req *req) +{ + req->trace_indent--; +} + +/* + * Iterate each hole in the space-capture file. Returns 1 if holepos/length + * has been set to a hole; 0 if there aren't any holes left, or -1 for error. 
+ */ +static inline int +spacefd_hole_iter( + const struct clearspace_req *req, + loff_t *holepos, + loff_t *length) +{ + loff_t end = req->start + req->length; + loff_t h; + loff_t d; + + if (*length == 0) + d = req->start; + else + d = *holepos + *length; + if (d >= end) + return 0; + + h = lseek(req->space_fd, d, SEEK_HOLE); + if (h < 0) { + perror(_("finding start of hole in space capture file")); + return h; + } + if (h >= end) + return 0; + + d = lseek(req->space_fd, h, SEEK_DATA); + if (d < 0 && errno == ENXIO) + d = end; + if (d < 0) { + perror(_("finding end of hole in space capture file")); + return d; + } + if (d > end) + d = end; + + *holepos = h; + *length = d - h; + return 1; +} + +/* + * Iterate each written region in the space-capture file. Returns 1 if + * datapos/length have been set to a data area; 0 if there isn't any data left, + * or -1 for error. + */ +static int +spacefd_data_iter( + const struct clearspace_req *req, + loff_t *datapos, + loff_t *length) +{ + loff_t end = req->start + req->length; + loff_t d; + loff_t h; + + if (*length == 0) + h = req->start; + else + h = *datapos + *length; + if (h >= end) + return 0; + + d = lseek(req->space_fd, h, SEEK_DATA); + if (d < 0 && errno == ENXIO) + return 0; + if (d < 0) { + perror(_("finding start of data in space capture file")); + return d; + } + if (d >= end) + return 0; + + h = lseek(req->space_fd, d, SEEK_HOLE); + if (h < 0) { + perror(_("finding end of data in space capture file")); + return h; + } + if (h > end) + h = end; + + *datapos = d; + *length = h - d; + return 1; +} + +/* Filesystem space usage queries */ + +/* Allocate the structures needed for a fsmap query. */ +static void +start_fsmap_query( + struct clearspace_req *req, + dev_t dev, + unsigned long long physical, + unsigned long long length) +{ + struct fsmap_head *mhead = req->mhead; + + assert(req->mhead->fmh_count == 0); + memset(mhead, 0, sizeof(struct fsmap_head)); + mhead->fmh_count = QUERY_BATCH_SIZE; + mhead->fmh_keys[0].fmr_device = dev; + mhead->fmh_keys[0].fmr_physical = physical; + mhead->fmh_keys[1].fmr_device = dev; + mhead->fmh_keys[1].fmr_physical = physical + length; + mhead->fmh_keys[1].fmr_owner = ULLONG_MAX; + mhead->fmh_keys[1].fmr_flags = UINT_MAX; + mhead->fmh_keys[1].fmr_offset = ULLONG_MAX; + + trace_fsmap(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx", + major(dev), minor(dev), + (unsigned long long)physical, + (unsigned long long)length, + (unsigned long long)mhead->fmh_keys[1].fmr_physical); + req->trace_indent++; +} + +static inline void +end_fsmap_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->mhead->fmh_count = 0; +} + +/* Set us up for the next run_fsmap_query, or return false. */ +static inline bool +advance_fsmap_cursor(struct fsmap_head *mhead) +{ + struct fsmap *mrec; + + mrec = &mhead->fmh_recs[mhead->fmh_entries - 1]; + if (mrec->fmr_flags & FMR_OF_LAST) + return false; + + fsmap_advance(mhead); + return true; +} + +/* + * Run a GETFSMAP query. Returns 1 if there are rows, 0 if there are no rows, + * or -1 for error. 
+ */ +static inline int +run_fsmap_query( + struct clearspace_req *req) +{ + struct fsmap_head *mhead = req->mhead; + int ret; + + if (mhead->fmh_entries > 0 && !advance_fsmap_cursor(mhead)) + return 0; + + trace_fsmap(req, + "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx", + major(mhead->fmh_keys[0].fmr_device), + minor(mhead->fmh_keys[0].fmr_device), + (unsigned long long)mhead->fmh_keys[0].fmr_physical, + (unsigned long long)mhead->fmh_keys[0].fmr_length, + (unsigned long long)mhead->fmh_keys[1].fmr_physical); + + ret = ioctl(req->xfd->fd, FS_IOC_GETFSMAP, mhead); + if (ret) { + perror(_("querying fsmap data")); + return -1; + } + + if (!(mhead->fmh_oflags & FMH_OF_DEV_T)) { + fprintf(stderr, _("fsmap does not return dev_t.\n")); + return -1; + } + + if (mhead->fmh_entries == 0) + return 0; + + return 1; +} + +#define for_each_fsmap_row(req, rec) \ + for ((rec) = (req)->mhead->fmh_recs; \ + (rec) < (req)->mhead->fmh_recs + (req)->mhead->fmh_entries; \ + (rec)++) + +/* Allocate the structures needed for a fsrefcounts query. */ +static void +start_fsrefs_query( + struct clearspace_req *req, + dev_t dev, + unsigned long long physical, + unsigned long long length) +{ + struct xfs_getfsrefs_head *rhead = req->rhead; + + assert(req->rhead->fch_count == 0); + memset(rhead, 0, sizeof(struct xfs_getfsrefs_head)); + rhead->fch_count = QUERY_BATCH_SIZE; + rhead->fch_keys[0].fcr_device = dev; + rhead->fch_keys[0].fcr_physical = physical; + rhead->fch_keys[1].fcr_device = dev; + rhead->fch_keys[1].fcr_physical = physical + length; + rhead->fch_keys[1].fcr_owners = ULLONG_MAX; + rhead->fch_keys[1].fcr_flags = UINT_MAX; + + trace_fsrefs(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx", + major(dev), minor(dev), + (unsigned long long)physical, + (unsigned long long)length, + (unsigned long long)rhead->fch_keys[1].fcr_physical); + req->trace_indent++; +} + +static inline void +end_fsrefs_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->rhead->fch_count = 0; +} + +/* Set us up for the next run_fsrefs_query, or return false. */ +static inline bool +advance_fsrefs_query(struct xfs_getfsrefs_head *rhead) +{ + struct xfs_getfsrefs *rrec; + + rrec = &rhead->fch_recs[rhead->fch_entries - 1]; + if (rrec->fcr_flags & FCR_OF_LAST) + return false; + + xfs_getfsrefs_advance(rhead); + return true; +} + +/* + * Run a GETFSREFCOUNTS query. Returns 1 if there are rows, 0 if there are + * no rows, or -1 for error. 
+ */ +static inline int +run_fsrefs_query( + struct clearspace_req *req) +{ + struct xfs_getfsrefs_head *rhead = req->rhead; + int ret; + + if (rhead->fch_entries > 0 && !advance_fsrefs_query(rhead)) + return 0; + + trace_fsrefs(req, + "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx", + major(rhead->fch_keys[0].fcr_device), + minor(rhead->fch_keys[0].fcr_device), + (unsigned long long)rhead->fch_keys[0].fcr_physical, + (unsigned long long)rhead->fch_keys[0].fcr_length, + (unsigned long long)rhead->fch_keys[1].fcr_physical); + + ret = ioctl(req->xfd->fd, XFS_IOC_GETFSREFCOUNTS, rhead); + if (ret) { + perror(_("querying refcount data")); + return -1; + } + + if (!(rhead->fch_oflags & FCH_OF_DEV_T)) { + fprintf(stderr, _("fsrefcounts does not return dev_t.\n")); + return -1; + } + + if (rhead->fch_entries == 0) + return 0; + + return 1; +} + +#define for_each_fsref_row(req, rec) \ + for ((rec) = (req)->rhead->fch_recs; \ + (rec) < (req)->rhead->fch_recs + (req)->rhead->fch_entries; \ + (rec)++) + +/* Allocate the structures needed for a bmapx query. */ +static void +start_bmapx_query( + struct clearspace_req *req, + unsigned int fork, + unsigned long long pos, + unsigned long long length) +{ + struct getbmapx *bhead = req->bhead; + + assert(fork == BMV_IF_ATTRFORK || fork == BMV_IF_COWFORK || !fork); + assert(req->bhead->bmv_count == 0); + + memset(bhead, 0, sizeof(struct getbmapx)); + bhead[0].bmv_offset = BTOBB(pos); + bhead[0].bmv_length = BTOBB(length); + bhead[0].bmv_count = QUERY_BATCH_SIZE + 1; + bhead[0].bmv_iflags = fork | BMV_IF_PREALLOC | BMV_IF_DELALLOC; + + trace_bmapx(req, "%s pos 0x%llx bytecount 0x%llx", + fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? "attr" : "data", + (unsigned long long)BBTOB(bhead[0].bmv_offset), + (unsigned long long)BBTOB(bhead[0].bmv_length)); + req->trace_indent++; +} + +static inline void +end_bmapx_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->bhead->bmv_count = 0; +} + +/* Set us up for the next run_bmapx_query, or return false. */ +static inline bool +advance_bmapx_query(struct getbmapx *bhead) +{ + struct getbmapx *brec; + unsigned long long next_offset; + unsigned long long end = bhead->bmv_offset + bhead->bmv_length; + + brec = &bhead[bhead->bmv_entries]; + if (brec->bmv_oflags & BMV_OF_LAST) + return false; + + next_offset = brec->bmv_offset + brec->bmv_length; + if (next_offset > end) + return false; + + bhead->bmv_offset = next_offset; + bhead->bmv_length = end - next_offset; + return true; +} + +/* + * Run a GETBMAPX query. Returns 1 if there are rows, 0 if there are no rows, + * or -1 for error. + */ +static inline int +run_bmapx_query( + struct clearspace_req *req, + int fd) +{ + struct getbmapx *bhead = req->bhead; + unsigned int fork; + int ret; + + if (bhead->bmv_entries > 0 && !advance_bmapx_query(bhead)) + return 0; + + fork = bhead[0].bmv_iflags & (BMV_IF_COWFORK | BMV_IF_ATTRFORK); + trace_bmapx(req, "ioctl %s pos 0x%llx bytecount 0x%llx", + fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? 
"attr" : "data", + (unsigned long long)BBTOB(bhead[0].bmv_offset), + (unsigned long long)BBTOB(bhead[0].bmv_length)); + + ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead); + if (ret) { + perror(_("querying bmapx data")); + return -1; + } + + if (bhead->bmv_entries == 0) + return 0; + + return 1; +} + +#define for_each_bmapx_row(req, rec) \ + for ((rec) = (req)->bhead + 1; \ + (rec) < (req)->bhead + 1 + (req)->bhead->bmv_entries; \ + (rec)++) + +static inline void +csp_dump_bmapx_row( + struct clearspace_req *req, + unsigned int nr, + const struct getbmapx *brec) +{ + if (brec->bmv_block == -1) { + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx hole", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length)); + return; + } + + if (brec->bmv_block == -2) { + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx delalloc", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length)); + return; + } + + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx phys 0x%llx flags 0x%x", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length), + (unsigned long long)BBTOB(brec->bmv_block), + brec->bmv_oflags); +} + +static inline void +csp_dump_bmapx( + struct clearspace_req *req, + int fd, + unsigned int indent, + const char *tag) +{ + unsigned int nr; + int ret; + + trace_dumpfile(req, "DUMP BMAP OF DATA FORK %s", tag); + start_bmapx_query(req, 0, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, nr++, brec); + if (nr > 10) + goto dump_cow; + } + } + +dump_cow: + end_bmapx_query(req); + trace_dumpfile(req, "DUMP BMAP OF COW FORK %s", tag); + start_bmapx_query(req, BMV_IF_COWFORK, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, nr++, brec); + if (nr > 10) + goto dump_attr; + } + } + +dump_attr: + end_bmapx_query(req); + trace_dumpfile(req, "DUMP BMAP OF ATTR FORK %s", tag); + start_bmapx_query(req, BMV_IF_ATTRFORK, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, nr++, brec); + if (nr > 10) + goto stop; + } + } + +stop: + end_bmapx_query(req); + trace_dumpfile(req, "DONE DUMPING %s", tag); +} + +/* Return the first bmapx for the given file range. */ +static int +bmapx_one( + struct clearspace_req *req, + int fd, + unsigned long long pos, + unsigned long long length, + struct getbmapx *brec) +{ + struct getbmapx bhead[2]; + int ret; + + memset(bhead, 0, sizeof(struct getbmapx) * 2); + bhead[0].bmv_offset = BTOBB(pos); + bhead[0].bmv_length = BTOBB(length); + bhead[0].bmv_count = 2; + bhead[0].bmv_iflags = BMV_IF_PREALLOC | BMV_IF_DELALLOC; + + ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead); + if (ret) { + perror(_("simple bmapx query")); + return -1; + } + + if (bhead->bmv_entries > 0) { + memcpy(brec, &bhead[1], sizeof(struct getbmapx)); + return 0; + } + + memset(brec, 0, sizeof(struct getbmapx)); + brec->bmv_offset = pos; + brec->bmv_block = -1; /* hole */ + brec->bmv_length = length; + return 0; +} + +/* Constrain space map records. 
*/ +static void +__trim_fsmap( + uint64_t start, + uint64_t length, + struct fsmap *fsmap) +{ + unsigned long long delta, end; + bool need_off; + + need_off = (fsmap->fmr_flags & (FMR_OF_EXTENT_MAP | + FMR_OF_SPECIAL_OWNER)); + + if (fsmap->fmr_physical < start) { + delta = start - fsmap->fmr_physical; + fsmap->fmr_physical = start; + fsmap->fmr_length -= delta; + if (need_off) + fsmap->fmr_offset += delta; + } + + end = fsmap->fmr_physical + fsmap->fmr_length; + if (end > start + length) { + delta = end - (start + length); + fsmap->fmr_length -= delta; + } +} + +static inline void +trim_target_fsmap(const struct clearspace_tgt *tgt, struct fsmap *fsmap) +{ + return __trim_fsmap(tgt->start, tgt->length, fsmap); +} + +static inline void +trim_request_fsmap(const struct clearspace_req *req, struct fsmap *fsmap) +{ + return __trim_fsmap(req->start, req->length, fsmap); +} + +/* Actual space clearing code */ + +/* + * Map all the free space in the region that we're clearing to the space + * catcher file. + */ +static int +csp_grab_free_space( + struct clearspace_req *req) +{ + struct xfs_map_freesp args = { + .offset = req->start, + .len = req->length, + }; + int ret; + + trace_grabfree(req, "start 0x%llx length 0x%llx", + (unsigned long long)req->start, + (unsigned long long)req->length); + + ret = ioctl(req->space_fd, XFS_IOC_MAP_FREESP, &args); + if (ret) { + perror(_("map free space to space capture file")); + return -1; + } + + return 0; +} + +/* + * Rank a refcount record. We prefer to tackle highly shared and longer + * extents first. + */ +static inline unsigned long long +csp_space_prio( + const struct xfs_fsop_geom *g, + const struct xfs_getfsrefs *p) +{ + unsigned long long blocks = p->fcr_length / g->blocksize; + unsigned long long ret = blocks * p->fcr_owners; + + if (ret < blocks || ret < p->fcr_owners) + return UINT64_MAX; + return ret; +} + +/* Make the current refcount record the clearing target if desirable. */ +static void +csp_adjust_target( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct xfs_getfsrefs *rec, + unsigned long long prio) +{ + if (prio < target->prio) + return; + if (prio == target->prio && + rec->fcr_length <= target->length) + return; + + /* Ignore results that go beyond the end of what we wanted. */ + if (rec->fcr_physical >= req->start + req->length) + return; + + /* Ignore regions that we already tried to clear. */ + if (bitmap_test(req->visited, rec->fcr_physical, rec->fcr_length)) + return; + + trace_target(req, + "set target, prio 0x%llx -> 0x%llx phys 0x%llx bytecount 0x%llx", + target->prio, prio, + (unsigned long long)rec->fcr_physical, + (unsigned long long)rec->fcr_length); + + target->start = rec->fcr_physical; + target->length = rec->fcr_length; + target->owners = rec->fcr_owners; + target->prio = prio; +} + +/* + * Decide if this refcount record maps to extents that are sufficiently + * interesting to target. + */ +static int +csp_evaluate_refcount( + struct clearspace_req *req, + const struct xfs_getfsrefs *rrec, + struct clearspace_tgt *target) +{ + const struct xfs_fsop_geom *fsgeom = &req->xfd->fsgeom; + unsigned long long prio = csp_space_prio(fsgeom, rrec); + int ret; + + if (rrec->fcr_device != req->dev) + return 0; + + if (prio < target->prio) + return 0; + + /* + * XFS only supports sharing data blocks. If there's more than one + * owner, we know that we can easily move the blocks. 
+ */ + if (rrec->fcr_owners > 1) { + csp_adjust_target(req, target, rrec, prio); + return 0; + } + + /* + * Otherwise, this extent has single owners. Walk the fsmap records to + * figure out if they're movable or not. + */ + start_fsmap_query(req, rrec->fcr_device, rrec->fcr_physical, + rrec->fcr_length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + uint64_t next_phys = 0; + + for_each_fsmap_row(req, mrec) { + struct xfs_getfsrefs fake_rec = { }; + + trace_fsmap_rec(req, CSP_TRACE_TARGET, mrec); + + if (mrec->fmr_device != rrec->fcr_device) + continue; + if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER) + continue; + if (csp_is_internal_owner(req, mrec->fmr_owner)) + continue; + + /* + * If the space has become shared since the fsrefs + * query, just skip this record. We might come back to + * it in a later iteration. + */ + if (mrec->fmr_physical < next_phys) + continue; + + /* Fake enough of a fsrefs to calculate the priority. */ + fake_rec.fcr_physical = mrec->fmr_physical; + fake_rec.fcr_length = mrec->fmr_length; + fake_rec.fcr_owners = 1; + prio = csp_space_prio(fsgeom, &fake_rec); + + /* Target unwritten extents first; they're cheap. */ + if (mrec->fmr_flags & FMR_OF_PREALLOC) + prio |= (1ULL << 63); + + csp_adjust_target(req, target, &fake_rec, prio); + + next_phys = mrec->fmr_physical + mrec->fmr_length; + } + } + end_fsmap_query(req); + + return ret; +} + +/* + * Given a range of storage to search, find the most appealing target for space + * clearing. If nothing suitable is found, the target will be zeroed. + */ +static int +csp_find_target( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + memset(target, 0, sizeof(struct clearspace_tgt)); + + start_fsrefs_query(req, req->dev, req->start, req->length); + while ((ret = run_fsrefs_query(req)) > 0) { + struct xfs_getfsrefs *rrec; + + for_each_fsref_row(req, rrec) { + trace_fsrefs_rec(req, CSP_TRACE_TARGET, rrec); + ret = csp_evaluate_refcount(req, rrec, target); + if (ret) { + end_fsrefs_query(req); + return ret; + } + } + } + end_fsrefs_query(req); + + if (target->length != 0) { + /* + * Mark this extent visited so that we won't try again this + * round. + */ + trace_bitmap(req, "set filedata start 0x%llx length 0x%llx", + target->start, target->length); + ret = bitmap_set(req->visited, target->start, target->length); + if (ret) { + perror(_("marking file extent visited")); + return ret; + } + } + + return 0; +} + +/* Try to evacuate blocks by using online repair. 
*/ +static int +csp_evac_file_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec, + int fd, + const struct xfs_bulkstat *bulkstat) +{ + struct xfs_scrub_metadata scrub = { + .sm_type = XFS_SCRUB_TYPE_PROBE, + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + struct xfs_fd *xfd = req->xfd; + int ret; + + trace_xrebuild(req, + "ino 0x%llx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%llx", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length, + (unsigned long long)mrec->fmr_flags); + + if (fd == -1) { + scrub.sm_ino = mrec->fmr_owner; + scrub.sm_gen = bulkstat->bs_gen; + fd = xfd->fd; + } + + if (mrec->fmr_flags & FMR_OF_ATTR_FORK) { + if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) + scrub.sm_type = XFS_SCRUB_TYPE_BMBTA; + else + scrub.sm_type = XFS_SCRUB_TYPE_XATTR; + } else if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) { + scrub.sm_type = XFS_SCRUB_TYPE_BMBTD; + } else if (S_ISLNK(bulkstat->bs_mode)) { + scrub.sm_type = XFS_SCRUB_TYPE_SYMLINK; + } else if (S_ISDIR(bulkstat->bs_mode)) { + scrub.sm_type = XFS_SCRUB_TYPE_DIR; + } + + if (scrub.sm_type == XFS_SCRUB_TYPE_PROBE) + return 0; + + trace_xrebuild(req, "ino 0x%llx gen 0x%x type %u", + (unsigned long long)mrec->fmr_owner, + (unsigned int)bulkstat->bs_gen, + (unsigned int)scrub.sm_type); + + ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) { + fprintf(stderr, + _("evacuating inode 0x%llx metadata type %u: %s\n"), + (unsigned long long)mrec->fmr_owner, + scrub.sm_type, strerror(errno)); + return -1; + } + + target->evacuated++; + return 0; +} + +/* + * Open an inode via handle. Returns a file descriptor, -2 if the file is + * gone, or -1 on error. + */ +static int +csp_open_by_handle( + struct clearspace_req *req, + int oflags, + uint64_t ino, + uint32_t gen) +{ + struct xfs_handle handle = { }; + struct xfs_fsop_handlereq hreq = { + .oflags = oflags | O_NOATIME | O_NOFOLLOW | + O_NOCTTY | O_LARGEFILE, + .ihandle = &handle, + .ihandlen = sizeof(handle), + }; + int ret; + + memcpy(&handle.ha_fsid, req->fshandle, sizeof(handle.ha_fsid)); + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_ino = ino; + handle.ha_fid.fid_gen = gen; + + /* + * Since we extracted the fshandle from the open file instead of using + * path_to_fshandle, the fsid cache doesn't know about the fshandle. + * Construct the open by handle request manually. + */ + ret = ioctl(req->xfd->fd, XFS_IOC_OPEN_BY_HANDLE, &hreq); + if (ret < 0) { + if (errno == ENOENT || errno == EINVAL) + return -2; + + fprintf(stderr, _("open inode 0x%llx: %s\n"), + (unsigned long long)ino, + strerror(errno)); + return -1; + } + + return ret; +} + +/* + * Open a file for evacuation. Returns a positive errno on error; a fd in @fd + * if the caller is supposed to do something; or @fd == -1 if there's nothing + * further to do. + */ +static int +csp_evac_open( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec, + struct xfs_bulkstat *bulkstat, + int oflags, + int *fd) +{ + struct xfs_bulkstat __bs; + int target_fd; + int ret; + + *fd = -1; + + if (csp_is_internal_owner(req, mrec->fmr_owner) || + (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)) + goto nothing_to_do; + + if (bulkstat == NULL) + bulkstat = &__bs; + + /* + * Snapshot this file so that we can perform a fresh-only exchange. 
+ * For other types of files we just skip to the evacuation step. + */ + ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, bulkstat); + if (ret) { + if (ret == ENOENT || ret == EINVAL) + goto nothing_to_do; + + fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"), + (unsigned long long)mrec->fmr_owner, + strerror(ret)); + return ret; + } + + /* + * If we get stats for a different inode, the file may have been freed + * out from under us and there's nothing to do. + */ + if (bulkstat->bs_ino != mrec->fmr_owner) + goto nothing_to_do; + + /* + * We're only allowed to open regular files and directories via handle + * so jump to online rebuild for all other file types. + */ + if (!S_ISREG(bulkstat->bs_mode) && !S_ISDIR(bulkstat->bs_mode)) + return csp_evac_file_metadata(req, target, mrec, -1, + bulkstat); + + if (S_ISDIR(bulkstat->bs_mode)) + oflags = O_RDONLY; + + target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner, + bulkstat->bs_gen); + if (target_fd == -2) + goto nothing_to_do; + if (target_fd < 0) + return -target_fd; + + /* + * Exchange only works for regular file data blocks. If that isn't the + * case, our only recourse is online rebuild. + */ + if (S_ISDIR(bulkstat->bs_mode) || + (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) { + int ret2; + + ret = csp_evac_file_metadata(req, target, mrec, target_fd, + bulkstat); + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; + } + + *fd = target_fd; + return 0; + +nothing_to_do: + target->try_again = true; + return 0; +} + +/* Unshare the space in the work file that we're using for deduplication. */ +static int +csp_unshare_workfile( + struct clearspace_req *req, + unsigned long long start, + unsigned long long length) +{ + int ret; + + trace_falloc(req, "funshare workfd pos 0x%llx bytecount 0x%llx", + start, length); + + ret = fallocate(req->work_fd, FALLOC_FL_UNSHARE_RANGE, start, length); + if (ret) { + perror(_("unsharing work file")); + return ret; + } + + ret = fsync(req->work_fd); + if (ret) { + perror(_("syncing work file")); + return ret; + } + + /* Make sure we didn't get any space within the clearing range. */ + start_bmapx_query(req, 0, start, length); + while ((ret = run_bmapx_query(req, req->work_fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + unsigned long long p, l; + + trace_bmapx_rec(req, CSP_TRACE_FALLOC, brec); + p = BBTOB(brec->bmv_block); + l = BBTOB(brec->bmv_length); + + if (p + l < req->start || p >= req->start + req->length) + continue; + + trace_prep(req, + "workfd has extent inside clearing range, phys 0x%llx fsbcount 0x%llx", + p, l); + end_bmapx_query(req); + return -1; + } + } + end_bmapx_query(req); + + return 0; +} + +/* Try to deduplicate every block in the fdr request, if we can. 
+ */
+static int
+csp_evac_dedupe_loop(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	unsigned long long	ino,
+	int			max_reqlen)
+{
+	struct file_dedupe_range *fdr = req->fdr;
+	struct file_dedupe_range_info *info = &fdr->info[0];
+	loff_t			last_unshare_off = -1;
+	int			ret;
+
+	while (fdr->src_length > 0) {
+		struct getbmapx	brec;
+		bool		same;
+		unsigned long long old_reqlen = fdr->src_length;
+
+		if (max_reqlen && fdr->src_length > max_reqlen)
+			fdr->src_length = max_reqlen;
+
+		trace_dedupe(req, "ino 0x%llx pos 0x%llx bytecount 0x%llx",
+				ino,
+				(unsigned long long)info->dest_offset,
+				(unsigned long long)fdr->src_length);
+
+		ret = bmapx_one(req, req->work_fd, fdr->src_offset,
+				fdr->src_length, &brec);
+		if (ret)
+			return ret;
+
+		trace_dedupe(req, "workfd pos 0x%llx phys 0x%llx",
+				(unsigned long long)fdr->src_offset,
+				(unsigned long long)BBTOB(brec.bmv_block));
+
+		ret = deduperange(req->work_fd, fdr, &same);
+		if (ret == ENOSPC && last_unshare_off < fdr->src_offset) {
+			req->trace_indent++;
+			trace_dedupe(req, "funshare workfd at phys 0x%llx",
+					(unsigned long long)fdr->src_offset);
+			/*
+			 * If we ran out of space, it's possible that we have
+			 * reached the maximum sharing factor of the blocks in
+			 * the work file.  Try unsharing the range of the work
+			 * file to get a singly-owned range and loop again.
+			 */
+			ret = csp_unshare_workfile(req, fdr->src_offset,
+					fdr->src_length);
+			req->trace_indent--;
+			if (ret)
+				return ret;
+
+			ret = fsync(req->work_fd);
+			if (ret) {
+				perror(_("sync after unshare work file"));
+				return ret;
+			}
+
+			last_unshare_off = fdr->src_offset;
+			fdr->src_length = old_reqlen;
+			continue;
+		}
+		if (ret == EINVAL) {
+			/*
+			 * If we can't dedupe the block, it's possible that
+			 * src_fd was punched or truncated out from under us.
+			 * Treat this the same way we would if the contents
+			 * didn't match.
+			 */
+			trace_dedupe(req, "cannot evac space, moving on", 0);
+			same = false;
+			ret = 0;
+		}
+		if (ret) {
+			fprintf(stderr, _("evacuating inode 0x%llx: %s\n"),
+					ino, strerror(ret));
+			return ret;
+		}
+
+		if (same) {
+			req->trace_indent++;
+			trace_dedupe(req,
+	"evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
+					ino,
+					(unsigned long long)info->dest_offset,
+					(unsigned long long)info->bytes_deduped);
+			req->trace_indent--;
+
+			target->evacuated++;
+		} else {
+			req->trace_indent++;
+			trace_dedupe(req,
+	"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
+					ino,
+					(unsigned long long)info->dest_offset,
+					(unsigned long long)fdr->src_length);
+			req->trace_indent--;
+
+			target->try_again = true;
+
+			/*
+			 * If we aren't single-stepping the deduplication,
+			 * stop early so that the caller goes into single-step
+			 * mode.
+			 */
+			if (!max_reqlen) {
+				fdr->src_length = old_reqlen;
+				return 0;
+			}
+
+			/* Contents changed, move on to the next block. */
+			info->bytes_deduped = fdr->src_length;
+		}
+		fdr->src_length = old_reqlen;
+
+		fdr->src_offset += info->bytes_deduped;
+		info->dest_offset += info->bytes_deduped;
+		fdr->src_length -= info->bytes_deduped;
+	}
+
+	return 0;
+}
+
+/*
+ * Evacuate one fsmapping by using dedupe to remap data stored in the target
+ * range to a copy stored in the work file.
+ */ +static int +csp_evac_dedupe_fsmap( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec) +{ + struct file_dedupe_range *fdr = req->fdr; + struct file_dedupe_range_info *info = &fdr->info[0]; + bool can_single_step; + int target_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + ret = csp_evac_open(req, target, mrec, NULL, O_RDONLY, &target_fd); + if (ret || target_fd < 0) + return ret; + + /* + * Use dedupe to try to shift the target file's mappings to use the + * copy of the data that's in the work file. + */ + fdr->src_offset = mrec->fmr_physical; + fdr->src_length = mrec->fmr_length; + fdr->dest_count = 1; + info->dest_fd = target_fd; + info->dest_offset = mrec->fmr_offset; + + can_single_step = mrec->fmr_length > req->xfd->fsgeom.blocksize; + + /* First we try to do the entire thing all at once. */ + ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, 0); + if (ret) + goto out_fd; + + /* If there's any work left, try again one block at a time. */ + if (can_single_step && fdr->src_length > 0) { + ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, + req->xfd->fsgeom.blocksize); + if (ret) + goto out_fd; + } + +out_fd: + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; +} + +/* Use deduplication to remap data extents away from where we're clearing. */ +static int +csp_evac_dedupe( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_DEDUPE, mrec); + trim_target_fsmap(target, mrec); + + req->trace_indent++; + ret = csp_evac_dedupe_fsmap(req, target, mrec); + req->trace_indent--; + if (ret) + goto out; + + ret = csp_grab_free_space(req); + if (ret) + goto out; + } + } + +out: + end_fsmap_query(req); + if (ret) + trace_dedupe(req, "ret %d", ret); + return ret; +} + +#define BUFFERCOPY_BUFSZ 65536 + +/* + * Use a memory buffer to copy part of src_fd to dst_fd, or return an errno. */ +static int +csp_buffercopy( + struct clearspace_req *req, + int src_fd, + loff_t src_off, + int dst_fd, + loff_t dst_off, + loff_t len) +{ + int ret = 0; + + while (len > 0) { + size_t count = min(BUFFERCOPY_BUFSZ, len); + ssize_t bytes_read, bytes_written; + + bytes_read = pread(src_fd, req->buf, count, src_off); + if (bytes_read < 0) { + ret = errno; + break; + } + + bytes_written = pwrite(dst_fd, req->buf, bytes_read, dst_off); + if (bytes_written < 0) { + ret = errno; + break; + } + + src_off += bytes_written; + dst_off += bytes_written; + len -= bytes_written; + } + + return ret; +} + +/* + * Prepare the work file to assist in evacuating file data by copying the + * contents of the frozen space into the work file. + */ +static int +csp_prepare_for_dedupe( + struct clearspace_req *req) +{ + struct file_clone_range fcr; + struct stat statbuf; + loff_t datapos = 0; + loff_t length = 0; + int ret; + + ret = fstat(req->space_fd, &statbuf); + if (ret) { + perror(_("space capture file")); + return ret; + } + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncate work file")); + return ret; + } + + ret = ftruncate(req->work_fd, statbuf.st_size); + if (ret) { + perror(_("reset work file")); + return ret; + } + + /* Make a working copy of the frozen file data. 
+ */
+	start_spacefd_iter(req);
+	while ((ret = spacefd_data_iter(req, &datapos, &length)) > 0) {
+		trace_prep(req, "clone spacefd data 0x%llx length 0x%llx",
+				(long long)datapos, (long long)length);
+
+		fcr.src_fd = req->space_fd;
+		fcr.src_offset = datapos;
+		fcr.src_length = length;
+		fcr.dest_offset = datapos;
+
+		ret = clonerange(req->work_fd, &fcr);
+		if (ret == ENOSPC) {
+			req->trace_indent++;
+			trace_prep(req,
+	"falling back to buffered copy at 0x%llx",
+					(long long)datapos);
+			req->trace_indent--;
+			ret = csp_buffercopy(req, req->space_fd, datapos,
+					req->work_fd, datapos, length);
+		}
+		if (ret) {
+			perror(
+	_("copying space capture file contents to work file"));
+			return ret;
+		}
+	}
+	end_spacefd_iter(req);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Unshare the work file so that it contains an identical copy of the
+	 * contents of the space capture file but mapped to different blocks.
+	 * This is key to using dedupe to migrate file space away from the
+	 * requested region.
+	 */
+	req->trace_indent++;
+	ret = csp_unshare_workfile(req, req->start, req->length);
+	req->trace_indent--;
+	return ret;
+}
+
+/*
+ * Evacuate one fsmapping by copying the data into the work file and then
+ * exchanging the target file's mappings with the work file's copy.
+ */
+static int
+csp_evac_exchange_fsmap(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	const struct fsmap	*mrec)
+{
+	struct xfs_bulkstat	bulkstat;
+	struct xfs_exch_range	xchg = { };
+	struct getbmapx		brec;
+	int			target_fd;
+	int			ret, ret2;
+
+	if (mrec->fmr_device != req->dev) {
+		fprintf(stderr, _("wrong fsmap device in results.\n"));
+		return -1;
+	}
+
+	ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd);
+	if (ret || target_fd < 0)
+		return ret;
+
+	ret = ftruncate(req->work_fd, 0);
+	if (ret) {
+		perror(_("truncating work file"));
+		goto out_fd;
+	}
+
+	/*
+	 * Copy the data from the original file to the work file.  We assume
+	 * that the work file will end up with different data blocks and that
+	 * they're outside of the requested range.
+	 */
+	ret = csp_buffercopy(req, target_fd, mrec->fmr_offset, req->work_fd,
+			mrec->fmr_offset, mrec->fmr_length);
+	if (ret) {
+		fprintf(stderr, _("copying target file to work file: %s\n"),
+				strerror(ret));
+		goto out_fd;
+	}
+
+	ret = fsync(req->work_fd);
+	if (ret) {
+		perror(_("flush work file for fiexchange"));
+		goto out_fd;
+	}
+
+	ret = bmapx_one(req, req->work_fd, mrec->fmr_physical,
+			mrec->fmr_length, &brec);
+	if (ret)
+		goto out_fd;
+
+	trace_exchange(req, "workfd pos 0x%llx phys 0x%llx",
+			(unsigned long long)mrec->fmr_physical,
+			(unsigned long long)BBTOB(brec.bmv_block));
+
+	/*
+	 * Exchange the mappings, with the freshness check enabled.  This
+	 * should result in the target file being switched to new blocks unless
+	 * it has changed, in which case we bounce out and find a new target.
+ */ + xfrog_file_exchange_prep(NULL, XFS_EXCH_RANGE_NONATOMIC, + mrec->fmr_offset, req->work_fd, mrec->fmr_offset, + mrec->fmr_length, &xchg); + xfrog_file_exchange_require_file2_fresh(&xchg, &bulkstat); + ret = exchangerange(target_fd, &xchg); + if (ret) { + if (ret == EBUSY) { + req->trace_indent++; + trace_exchange(req, + "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->try_again = true; + } else { + fprintf(stderr, + _("exchanging target and work file contents: %s\n"), + strerror(ret)); + } + goto out_fd; + } + + req->trace_indent++; + trace_exchange(req, + "evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->evacuated++; + +out_fd: + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; +} + +/* + * Try to evacuate all data blocks in the target region by copying the contents + * to a new file and exchanging the extents. + */ +static int +csp_evac_exchange( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_EXCHANGE, mrec); + trim_target_fsmap(target, mrec); + + req->trace_indent++; + ret = csp_evac_exchange_fsmap(req, target, mrec); + req->trace_indent--; + if (ret) + goto out; + + ret = csp_grab_free_space(req); + if (ret) + goto out; + } + } +out: + end_fsmap_query(req); + if (ret) + trace_exchange(req, "ret %d", ret); + return ret; +} + +/* Try to evacuate blocks by using online repair to rebuild AG metadata. */ +static int +csp_evac_ag_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + uint32_t agno, + uint32_t mask) +{ + struct xfs_scrub_metadata scrub = { + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + unsigned int i; + int ret; + + trace_xrebuild(req, "agno 0x%x mask 0x%x", + (unsigned int)agno, + (unsigned int)mask); + + for (i = XFS_SCRUB_TYPE_AGFL; i < XFS_SCRUB_TYPE_REFCNTBT; i++) { + + if (!(mask & (1U << i))) + continue; + + scrub.sm_type = i; + + req->trace_indent++; + trace_xrebuild(req, "agno %u type %u", + (unsigned int)agno, + (unsigned int)scrub.sm_type); + req->trace_indent--; + + ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) { + if (errno == ENOENT || errno == ENOSPC) + continue; + fprintf(stderr, _("rebuilding ag %u type %u: %s\n"), + (unsigned int)agno, scrub.sm_type, + strerror(errno)); + return -1; + } + + target->evacuated++; + + ret = csp_grab_free_space(req); + if (ret) + return ret; + } + + return 0; +} + +/* Compute a scrub mask for a fsmap special owner. 
*/ +static uint32_t +fsmap_owner_to_scrub_mask(__u64 owner) +{ + switch (owner) { + case XFS_FMR_OWN_FREE: + case XFS_FMR_OWN_UNKNOWN: + case XFS_FMR_OWN_FS: + case XFS_FMR_OWN_LOG: + /* can't move these */ + return 0; + case XFS_FMR_OWN_AG: + return (1U << XFS_SCRUB_TYPE_BNOBT) | + (1U << XFS_SCRUB_TYPE_CNTBT) | + (1U << XFS_SCRUB_TYPE_AGFL) | + (1U << XFS_SCRUB_TYPE_RMAPBT); + case XFS_FMR_OWN_INOBT: + return (1U << XFS_SCRUB_TYPE_INOBT) | + (1U << XFS_SCRUB_TYPE_FINOBT); + case XFS_FMR_OWN_REFC: + return (1U << XFS_SCRUB_TYPE_REFCNTBT); + case XFS_FMR_OWN_INODES: + case XFS_FMR_OWN_COW: + /* don't know how to get rid of these */ + return 0; + case XFS_FMR_OWN_DEFECTIVE: + /* good, get rid of it */ + return 0; + default: + return 0; + } +} + +/* Try to clear all per-AG metadata from the requested range. */ +static int +csp_evac_fs_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + bool *cleared_anything) +{ + uint32_t curr_agno = -1U; + uint32_t curr_mask = 0; + int ret = 0; + + if (req->realtime) + return 0; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + uint64_t daddr; + uint32_t agno; + uint32_t mask; + + if (mrec->fmr_device != req->dev) + continue; + if (!(mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)) + continue; + + /* Ignore regions that we already tried to clear. */ + if (bitmap_test(req->visited, mrec->fmr_physical, + mrec->fmr_length)) + continue; + + mask = fsmap_owner_to_scrub_mask(mrec->fmr_owner); + if (!mask) + continue; + + trace_fsmap_rec(req, CSP_TRACE_XREBUILD, mrec); + + daddr = BTOBB(mrec->fmr_physical); + agno = cvt_daddr_to_agno(req->xfd, daddr); + + trace_xrebuild(req, + "agno 0x%x -> 0x%x mask 0x%x owner %lld", + curr_agno, agno, curr_mask, + (unsigned long long)mrec->fmr_owner); + + if (curr_agno == -1U) { + curr_agno = agno; + } else if (curr_agno != agno) { + ret = csp_evac_ag_metadata(req, target, + curr_agno, curr_mask); + if (ret) + goto out; + + *cleared_anything = true; + curr_agno = agno; + curr_mask = 0; + } + + /* Put this on the list and try to clear it once. */ + curr_mask |= mask; + ret = bitmap_set(req->visited, mrec->fmr_physical, + mrec->fmr_length); + if (ret) { + perror(_("marking metadata extent visited")); + goto out; + } + } + } + + if (curr_agno != -1U && curr_mask != 0) { + ret = csp_evac_ag_metadata(req, target, curr_agno, curr_mask); + if (ret) + goto out; + *cleared_anything = true; + } + + if (*cleared_anything) + trace_bitmap(req, "set metadata start 0x%llx length 0x%llx", + target->start, target->length); + +out: + end_fsmap_query(req); + if (ret) + trace_xrebuild(req, "ret %d", ret); + return ret; +} + +/* + * Check that at least the start of the mapping was frozen into the work file + * at the correct offset. Set @len to the number of bytes that were frozen. + * Returns -1 for error, zero if written extents are waiting to be mapped into + * the space capture file, or 1 if there's nothing to transfer to the space + * capture file. 
+ */ +enum freeze_outcome { + FREEZE_FAILED = -1, + FREEZE_DONE, + FREEZE_SKIP, +}; + +static enum freeze_outcome +csp_freeze_check_outcome( + struct clearspace_req *req, + const struct fsmap *mrec, + unsigned long long *len) +{ + struct getbmapx brec; + int ret; + + *len = 0; + + ret = bmapx_one(req, req->work_fd, 0, mrec->fmr_length, &brec); + if (ret) + return FREEZE_FAILED; + + trace_freeze(req, + "check if workfd pos 0x0 phys 0x%llx len 0x%llx maps to phys 0x%llx len 0x%llx", + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)BBTOB(brec.bmv_length)); + + /* freeze of an unwritten extent punches a hole in the work file. */ + if ((mrec->fmr_flags & FMR_OF_PREALLOC) && brec.bmv_block == -1) { + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_SKIP; + } + + /* + * freeze of a written extent must result in the same physical space + * being mapped into the work file. + */ + if (!(mrec->fmr_flags & FMR_OF_PREALLOC) && + BBTOB(brec.bmv_block) == mrec->fmr_physical) { + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_DONE; + } + + /* + * We didn't find what we were looking for, which implies that the + * mapping changed out from under us. Punch out everything that could + * have been mapped into the work file. Set @len to zero and return so + * that we try again with the next mapping. + */ + trace_falloc(req, "reset workfd isize 0x0", 0); + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("resetting work file after failed freeze")); + return FREEZE_FAILED; + } + + return FREEZE_SKIP; +} + +/* + * Open a file to try to freeze whatever data is in the requested range. + * + * Returns nonzero on error. Returns zero and a file descriptor in @fd if the + * caller is supposed to do something; or returns zero and @fd == -1 if there's + * nothing to freeze. + */ +static int +csp_freeze_open( + struct clearspace_req *req, + const struct fsmap *mrec, + int *fd) +{ + struct xfs_bulkstat bulkstat; + int oflags = O_RDWR; + int target_fd; + int ret; + + *fd = -1; + + ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, &bulkstat); + if (ret) { + if (ret == ENOENT || ret == EINVAL) + return 0; + + fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"), + (unsigned long long)mrec->fmr_owner, + strerror(errno)); + return ret; + } + + /* + * If we get stats for a different inode, the file may have been freed + * out from under us and there's nothing to do. + */ + if (bulkstat.bs_ino != mrec->fmr_owner) + return 0; + + /* Skip anything we can't freeze. */ + if (!S_ISREG(bulkstat.bs_mode) && !S_ISDIR(bulkstat.bs_mode)) + return 0; + + if (S_ISDIR(bulkstat.bs_mode)) + oflags = O_RDONLY; + + target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner, + bulkstat.bs_gen); + if (target_fd == -2) + return 0; + if (target_fd < 0) + return target_fd; + + /* + * Skip mappings for directories, xattr data, and block mapping btree + * blocks. We still have to close the file though. + */ + if (S_ISDIR(bulkstat.bs_mode) || + (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) { + return close(target_fd); + } + + *fd = target_fd; + return 0; +} + +static inline uint64_t rounddown_64(uint64_t x, uint64_t y) +{ + return (x / y) * y; +} + +/* + * Deal with a frozen extent containing a partially written EOF block. Either + * we use funshare to get src_fd to release the block, or we reduce the length + * of the frozen extent by one block. 
+ */ +static int +csp_freeze_unaligned_eofblock( + struct clearspace_req *req, + int src_fd, + const struct fsmap *mrec, + unsigned long long *frozen_len) +{ + struct getbmapx brec; + struct stat statbuf; + loff_t work_offset, length; + int ret; + + ret = fstat(req->work_fd, &statbuf); + if (ret) { + perror(_("statting work file")); + return ret; + } + + /* + * The frozen extent is less than the size of the work file, which + * means that we're already block aligned. + */ + if (*frozen_len <= statbuf.st_size) + return 0; + + /* The frozen extent does not contain a partially written EOF block. */ + if (statbuf.st_size % statbuf.st_blksize == 0) + return 0; + + /* + * Unshare what we think is a partially written EOF block of the + * original file, to try to force it to release that block. + */ + work_offset = rounddown_64(statbuf.st_size, statbuf.st_blksize); + length = statbuf.st_size - work_offset; + + trace_freeze(req, + "unaligned eofblock 0x%llx work_size 0x%llx blksize 0x%x work_offset 0x%llx work_length 0x%llx", + *frozen_len, statbuf.st_size, statbuf.st_blksize, + work_offset, length); + + ret = fallocate(src_fd, FALLOC_FL_UNSHARE_RANGE, + mrec->fmr_offset + work_offset, length); + if (ret) { + perror(_("unsharing original file")); + return ret; + } + + ret = fsync(src_fd); + if (ret) { + perror(_("flushing original file")); + return ret; + } + + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, expected 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + /* + * If the block is still shared, there must be other owners of this + * block. Round down the frozen length and we'll come back to it + * eventually. + */ + if (brec.bmv_oflags & BMV_OF_SHARED) { + *frozen_len = work_offset; + return 0; + } + + /* + * Not shared anymore, so increase the size of the file to the next + * block boundary so that we can reflink it into the space capture + * file. + */ + ret = ftruncate(req->work_fd, + BBTOB(brec.bmv_length) + BBTOB(brec.bmv_offset)); + if (ret) { + perror(_("expanding work file")); + return ret; + } + + /* Double-check that we didn't lose the block. */ + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, should be 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + return 0; +} + +/* + * Given a fsmap, try to reflink the physical space into the space capture + * file. + */ +static int +csp_freeze_req_fsmap( + struct clearspace_req *req, + unsigned long long *cursor, + const struct fsmap *mrec) +{ + struct fsmap short_mrec; + struct file_clone_range fcr = { }; + unsigned long long frozen_len; + enum freeze_outcome outcome; + int src_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + /* Ignore mappings for our secret files. */ + if (csp_is_internal_owner(req, mrec->fmr_owner)) + return 0; + + /* Ignore mappings before the cursor. */ + if (mrec->fmr_physical + mrec->fmr_length < *cursor) + return 0; + + /* Jump past mappings for metadata. 
*/ + if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER) + goto skip; + + /* + * Open this file so that we can try to freeze its data blocks. + * For other types of files we just skip to the evacuation step. + */ + ret = csp_freeze_open(req, mrec, &src_fd); + if (ret) + return ret; + if (src_fd < 0) + goto skip; + + /* + * If the cursor is in the middle of this mapping, increase the start + * of the mapping to start at the cursor. + */ + if (mrec->fmr_physical < *cursor) { + unsigned long long delta = *cursor - mrec->fmr_physical; + + short_mrec = *mrec; + short_mrec.fmr_physical = *cursor; + short_mrec.fmr_offset += delta; + short_mrec.fmr_length -= delta; + + mrec = &short_mrec; + } + + req->trace_indent++; + if (mrec->fmr_length == 0) { + trace_freeze(req, "skipping zero-length freeze", 0); + goto out_fd; + } + + /* + * Reflink the mapping from the source file into the empty work file so + * that a write will be written elsewhere. The only way to reflink a + * partially written EOF block is if the kernel can reset the work file + * size so that the post-EOF part of the block remains post-EOF. If we + * can't do that, we're sunk. If the mapping is unwritten, we'll leave + * a hole in the work file. + */ + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncating work file for freeze")); + goto out_fd; + } + + fcr.src_fd = src_fd; + fcr.src_offset = mrec->fmr_offset; + fcr.src_length = mrec->fmr_length; + fcr.dest_offset = 0; + + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx bytecount 0x%llx into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset, + (unsigned long long)fcr.src_length); + + ret = clonerange(req->work_fd, &fcr); + if (ret == EINVAL) { + /* + * If that didn't work, try reflinking to EOF and picking out + * whatever pieces we want. + */ + fcr.src_length = 0; + + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx to EOF into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset); + + ret = clonerange(req->work_fd, &fcr); + } + if (ret == EINVAL) { + /* + * If we still can't get the block, it's possible that src_fd + * was punched or truncated out from under us, so we just move + * on to the next fsmap. + */ + trace_freeze(req, "cannot freeze space, moving on", 0); + ret = 0; + goto out_fd; + } + if (ret) { + fprintf(stderr, _("freezing space to work file: %s\n"), + strerror(ret)); + goto out_fd; + } + + req->trace_indent++; + outcome = csp_freeze_check_outcome(req, mrec, &frozen_len); + req->trace_indent--; + switch (outcome) { + case FREEZE_FAILED: + ret = -1; + goto out_fd; + case FREEZE_SKIP: + *cursor += frozen_len; + goto out_fd; + case FREEZE_DONE: + break; + } + + /* + * If we tried reflinking to EOF to capture a partially written EOF + * block in the work file, we need to unshare the end of the source + * file before we try to reflink the frozen space into the space + * capture file. + */ + if (fcr.src_length == 0) { + ret = csp_freeze_unaligned_eofblock(req, src_fd, mrec, + &frozen_len); + if (ret) + goto out_fd; + } + + /* + * We've frozen the mapping by reflinking it into the work file and + * confirmed that the work file has the space we wanted. Now we need + * to map the same extent into the space capture file. If reflink + * fails because we're out of space, fall back to EXCHANGE_RANGE. The + * end goal is to populate the space capture file; we don't care about + * the contents of the work file. 
+ */ + fcr.src_fd = req->work_fd; + fcr.src_offset = 0; + fcr.dest_offset = mrec->fmr_physical; + fcr.src_length = frozen_len; + + trace_freeze(req, "reflink phys 0x%llx len 0x%llx to spacefd", + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length); + + ret = clonerange(req->space_fd, &fcr); + if (ret == ENOSPC) { + struct xfs_exch_range xchg; + + xfrog_file_exchange_prep(NULL, XFS_EXCH_RANGE_NONATOMIC, + mrec->fmr_physical, req->work_fd, + mrec->fmr_physical, frozen_len, &xchg); + ret = exchangerange(req->space_fd, &xchg); + } + if (ret) { + fprintf(stderr, _("freezing space to space capture file: %s\n"), + strerror(ret)); + goto out_fd; + } + + *cursor += frozen_len; +out_fd: + ret2 = close(src_fd); + if (!ret && ret2) + ret = ret2; + req->trace_indent--; + if (ret) + trace_freeze(req, "ret %d", ret); + return ret; +skip: + *cursor += mrec->fmr_length; + return 0; +} + +/* + * Try to freeze all the space in the requested range against overwrites. + * + * For each file data fsmap within each hole in the part of the space capture + * file corresponding to the requested range, try to reflink the space into the + * space capture file so that any subsequent writes to the original owner are + * CoW and nobody else can allocate the space. If we cannot use reflink to + * freeze all the space, we cannot proceed with the clearing. + */ +static int +csp_freeze_req_range( + struct clearspace_req *req) +{ + unsigned long long cursor = req->start; + loff_t holepos = 0; + loff_t length = 0; + int ret; + + ret = ftruncate(req->space_fd, req->start + req->length); + if (ret) { + perror(_("setting up space capture file")); + return ret; + } + + if (!req->use_reflink) + return 0; + + start_spacefd_iter(req); + while ((ret = spacefd_hole_iter(req, &holepos, &length)) > 0) { + trace_freeze(req, "spacefd hole 0x%llx length 0x%llx", + (long long)holepos, (long long)length); + + start_fsmap_query(req, req->dev, holepos, length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_FREEZE, mrec); + trim_request_fsmap(req, mrec); + ret = csp_freeze_req_fsmap(req, &cursor, mrec); + if (ret) { + end_fsmap_query(req); + goto out; + } + } + } + end_fsmap_query(req); + } +out: + end_spacefd_iter(req); + return ret; +} + +/* + * Dump all speculative preallocations, COW staging blocks, and inactive inodes + * to try to free up as much space as we can. + */ +static int +csp_collect_garbage( + struct clearspace_req *req) +{ + struct xfs_fs_eofblocks eofb = { + .eof_version = XFS_EOFBLOCKS_VERSION, + .eof_flags = XFS_EOF_FLAGS_SYNC, + }; + int ret; + + ret = ioctl(req->xfd->fd, XFS_IOC_FREE_EOFBLOCKS, &eofb); + if (ret) { + perror(_("xfs garbage collector")); + return -1; + } + + return 0; +} + +static int +csp_prepare( + struct clearspace_req *req) +{ + blkcnt_t old_blocks = 0; + int ret; + + /* + * Empty out CoW forks and speculative post-EOF preallocations before + * starting the clearing process. This may be somewhat overkill. + */ + ret = syncfs(req->xfd->fd); + if (ret) { + perror(_("syncing filesystem")); + return ret; + } + + ret = csp_collect_garbage(req); + if (ret) + return ret; + + /* + * Set up the space capture file as a large sparse file mirroring the + * physical space that we want to defragment. 
+ */ + ret = ftruncate(req->space_fd, req->start + req->length); + if (ret) { + perror(_("setting up space capture file")); + return ret; + } + + /* + * If we don't have reflink, just grab the free space and move on to + * copying and exchanging file contents. + */ + if (!req->use_reflink) + return csp_grab_free_space(req); + + /* + * Try to freeze as much of the requested range as we can, grab the + * free space in that range, and run freeze again to pick up anything + * that may have been allocated while all that was going on. + */ + do { + struct stat statbuf; + + ret = csp_freeze_req_range(req); + if (ret) + return ret; + + ret = csp_grab_free_space(req); + if (ret) + return ret; + + ret = fstat(req->space_fd, &statbuf); + if (ret) + return ret; + + if (old_blocks == statbuf.st_blocks) + break; + old_blocks = statbuf.st_blocks; + } while (1); + + /* + * If reflink is enabled, our strategy is to dedupe to free blocks in + * the area that we're clearing without making any user-visible changes + * to the file contents. For all the written file data blocks in area + * we're clearing, make an identical copy in the work file that is + * backed by blocks that are not in the clearing area. + */ + return csp_prepare_for_dedupe(req); +} + +/* Set up the target to clear all metadata from the given range. */ +static inline void +csp_target_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + target->start = req->start; + target->length = req->length; + target->prio = 0; + target->evacuated = 0; + target->owners = 0; + target->try_again = false; +} + +/* + * Loop through the space to find the most appealing part of the device to + * clear, then try to evacuate everything within. + */ +int +clearspace_run( + struct clearspace_req *req) +{ + struct clearspace_tgt target; + const struct csp_errstr *es; + bool cleared_anything; + int ret; + + if (req->trace_mask) { + fprintf(stderr, "debug flags 0x%x:", req->trace_mask); + for (es = errtags; es->tag; es++) { + if (req->trace_mask & es->mask) + fprintf(stderr, " %s", es->tag); + } + fprintf(stderr, "\n"); + } + + req->trace_indent = 0; + trace_status(req, + _("Clearing dev %u:%u physical 0x%llx bytecount 0x%llx."), + major(req->dev), minor(req->dev), + req->start, req->length); + + if (req->trace_mask & ~CSP_TRACE_STATUS) + trace_status(req, "reflink? %d evac_metadata? %d", + req->use_reflink, req->can_evac_metadata); + + ret = bitmap_alloc(&req->visited); + if (ret) { + perror(_("allocating visited bitmap")); + return ret; + } + + ret = csp_prepare(req); + if (ret) + goto out_bitmap; + + /* Evacuate as many file blocks as we can. */ + do { + ret = csp_find_target(req, &target); + if (ret) + goto out_bitmap; + + if (target.length == 0) + break; + + trace_target(req, + "phys 0x%llx len 0x%llx owners 0x%llx prio 0x%llx", + target.start, target.length, + target.owners, target.prio); + + if (req->use_reflink) + ret = csp_evac_dedupe(req, &target); + else + ret = csp_evac_exchange(req, &target); + if (ret) + goto out_bitmap; + + trace_status(req, _("Evacuated %llu file items."), + target.evacuated); + } while (target.evacuated > 0 || target.try_again); + + if (!req->can_evac_metadata) + goto out_bitmap; + + /* Evacuate as many AG metadata blocks as we can. 
*/ + do { + csp_target_metadata(req, &target); + + ret = csp_evac_fs_metadata(req, &target, &cleared_anything); + if (ret) + goto out_bitmap; + + trace_status(req, "evacuated %llu metadata items", + target.evacuated); + } while (target.evacuated > 0 && cleared_anything); + +out_bitmap: + bitmap_free(&req->visited); + return ret; +} + +/* How much space did we actually clear? */ +int +clearspace_efficacy( + struct clearspace_req *req, + unsigned long long *cleared_bytes) +{ + unsigned long long cleared = 0; + int ret; + + start_bmapx_query(req, 0, req->start, req->length); + while ((ret = run_bmapx_query(req, req->space_fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + if (brec->bmv_block == -1) + continue; + + trace_bmapx_rec(req, CSP_TRACE_EFFICACY, brec); + + if (brec->bmv_offset != brec->bmv_block) { + fprintf(stderr, + _("space capture file mapped incorrectly\n")); + end_bmapx_query(req); + return -1; + } + cleared += BBTOB(brec->bmv_length); + } + } + end_bmapx_query(req); + if (ret) + return ret; + + *cleared_bytes = cleared; + return 0; +} + +/* + * Create a temporary file on the same volume (data/rt) that we're trying to + * clear free space on. + */ +static int +csp_open_tempfile( + struct clearspace_req *req, + struct stat *statbuf) +{ + struct fsxattr fsx; + int fd, ret; + + fd = openat(req->xfd->fd, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); + if (fd < 0) { + perror(_("opening temp file")); + return -1; + } + + /* Make sure we got the same filesystem as the open file. */ + ret = fstat(fd, statbuf); + if (ret) { + perror(_("stat temp file")); + goto fail; + } + if (statbuf->st_dev != req->statbuf.st_dev) { + fprintf(stderr, + _("Cannot create temp file on same fs as open file.\n")); + goto fail; + } + + /* Ensure this file targets the correct data/rt device. */ + ret = ioctl(fd, FS_IOC_FSGETXATTR, &fsx); + if (ret) { + perror(_("FSGETXATTR temp file")); + goto fail; + } + + if (!!(fsx.fsx_xflags & FS_XFLAG_REALTIME) != req->realtime) { + if (req->realtime) + fsx.fsx_xflags |= FS_XFLAG_REALTIME; + else + fsx.fsx_xflags &= ~FS_XFLAG_REALTIME; + + ret = ioctl(fd, FS_IOC_FSSETXATTR, &fsx); + if (ret) { + perror(_("FSSETXATTR temp file")); + goto fail; + } + } + + trace_setup(req, "opening temp inode 0x%llx as fd %d", + (unsigned long long)statbuf->st_ino, fd); + + return fd; +fail: + close(fd); + return -1; +} + +/* Extract fshandle from the open file. */ +static int +csp_install_file( + struct clearspace_req *req, + struct xfs_fd *xfd) +{ + void *handle; + size_t handle_sz; + int ret; + + ret = fstat(xfd->fd, &req->statbuf); + if (ret) + return ret; + + if (!S_ISDIR(req->statbuf.st_mode)) { + errno = -ENOTDIR; + return -1; + } + + ret = fd_to_handle(xfd->fd, &handle, &handle_sz); + if (ret) + return ret; + + ret = handle_to_fshandle(handle, handle_sz, &req->fshandle, + &req->fshandle_sz); + if (ret) + return ret; + + free_handle(handle, handle_sz); + req->xfd = xfd; + return 0; +} + +/* Decide if we can use online repair to evacuate metadata. */ +static void +csp_detect_evac_metadata( + struct clearspace_req *req) +{ + struct xfs_scrub_metadata scrub = { + .sm_type = XFS_SCRUB_TYPE_PROBE, + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + int ret; + + ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) + return; + + /* + * We'll try to evacuate metadata if the probe works. This doesn't + * guarantee success; it merely means that the kernel call exists. 
+ */ + req->can_evac_metadata = true; +} + +/* Detect XFS_IOC_MAP_FREESP; this is critical for grabbing free space! */ +static int +csp_detect_map_freesp( + struct clearspace_req *req) +{ + struct xfs_map_freesp args = { + .offset = 0, + .len = 1, + }; + int ret; + + /* + * A single-byte fallocate request will succeed without doing anything + * to the filesystem. + */ + ret = ioctl(req->work_fd, XFS_IOC_MAP_FREESP, &args); + if (!ret) + return 0; + + if (errno == EOPNOTSUPP) { + fprintf(stderr, + _("Filesystem does not support XFS_IOC_MAP_FREESP\n")); + return -1; + } + + perror(_("test XFS_IOC_MAP_FREESP on work file")); + return -1; +} + +/* + * Assemble operation information to clear the physical space in part of a + * filesystem. + */ +int +clearspace_init( + struct clearspace_req **reqp, + const struct clearspace_init *attrs) +{ + struct clearspace_req *req; + int ret; + + req = calloc(1, sizeof(struct clearspace_req)); + if (!req) { + perror(_("malloc clearspace")); + return -1; + } + + req->work_fd = -1; + req->space_fd = -1; + req->trace_mask = attrs->trace_mask; + + req->realtime = attrs->is_realtime; + req->dev = attrs->dev; + req->start = attrs->start; + req->length = attrs->length; + + ret = csp_install_file(req, attrs->xfd); + if (ret) { + perror(attrs->fname); + goto fail; + } + + csp_detect_evac_metadata(req); + + req->work_fd = csp_open_tempfile(req, &req->temp_statbuf); + if (req->work_fd < 0) + goto fail; + + req->space_fd = csp_open_tempfile(req, &req->space_statbuf); + if (req->space_fd < 0) + goto fail; + + ret = csp_detect_map_freesp(req); + if (ret) + goto fail; + + req->mhead = calloc(1, fsmap_sizeof(QUERY_BATCH_SIZE)); + if (!req->mhead) { + perror(_("opening fs mapping query")); + goto fail; + } + + req->rhead = calloc(1, xfs_getfsrefs_sizeof(QUERY_BATCH_SIZE)); + if (!req->rhead) { + perror(_("opening refcount query")); + goto fail; + } + + req->bhead = calloc(QUERY_BATCH_SIZE + 1, sizeof(struct getbmapx)); + if (!req->bhead) { + perror(_("opening file mapping query")); + goto fail; + } + + req->buf = malloc(BUFFERCOPY_BUFSZ); + if (!req->buf) { + perror(_("allocating file copy buffer")); + goto fail; + } + + req->fdr = calloc(1, sizeof(struct file_dedupe_range) + + sizeof(struct file_dedupe_range_info)); + if (!req->fdr) { + perror(_("allocating dedupe control buffer")); + goto fail; + } + + req->use_reflink = req->xfd->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_REFLINK; + + *reqp = req; + return 0; +fail: + clearspace_free(&req); + return -1; +} + +/* Free all resources associated with a space clearing request. */ +int +clearspace_free( + struct clearspace_req **reqp) +{ + struct clearspace_req *req = *reqp; + int ret = 0; + + if (!req) + return 0; + + *reqp = NULL; + free(req->fdr); + free(req->buf); + free(req->bhead); + free(req->rhead); + free(req->mhead); + + if (req->space_fd >= 0) { + ret = close(req->space_fd); + if (ret) + perror(_("closing space capture file")); + } + + if (req->work_fd >= 0) { + int ret2 = close(req->work_fd); + + if (ret2) { + perror(_("closing work file")); + if (!ret && ret2) + ret = ret2; + } + } + + if (req->fshandle) + free_handle(req->fshandle, req->fshandle_sz); + free(req); + return ret; +} diff --git a/libfrog/clearspace.h b/libfrog/clearspace.h new file mode 100644 index 00000000000..07149eb818c --- /dev/null +++ b/libfrog/clearspace.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef __LIBFROG_CLEARSPACE_H__ +#define __LIBFROG_CLEARSPACE_H__ + +struct clearspace_req; + +struct clearspace_init { + /* Open file and its pathname */ + struct xfs_fd *xfd; + const char *fname; + + /* Which device do we want? */ + bool is_realtime; + dev_t dev; + + /* Range of device to clear. */ + unsigned long long start; + unsigned long long length; + + unsigned int trace_mask; +}; + +int clearspace_init(struct clearspace_req **reqp, + const struct clearspace_init *init); +int clearspace_free(struct clearspace_req **reqp); + +int clearspace_run(struct clearspace_req *req); + +int clearspace_efficacy(struct clearspace_req *req, + unsigned long long *cleared_bytes); + +/* Debugging levels */ + +#define CSP_TRACE_FREEZE (1U << 0) +#define CSP_TRACE_GRAB (1U << 1) +#define CSP_TRACE_FSMAP (1U << 2) +#define CSP_TRACE_FSREFS (1U << 3) +#define CSP_TRACE_BMAPX (1U << 4) +#define CSP_TRACE_PREP (1U << 5) +#define CSP_TRACE_TARGET (1U << 6) +#define CSP_TRACE_DEDUPE (1U << 7) +#define CSP_TRACE_FALLOC (1U << 8) +#define CSP_TRACE_EXCHANGE (1U << 9) +#define CSP_TRACE_XREBUILD (1U << 10) +#define CSP_TRACE_EFFICACY (1U << 11) +#define CSP_TRACE_SETUP (1U << 12) +#define CSP_TRACE_STATUS (1U << 13) +#define CSP_TRACE_DUMPFILE (1U << 14) +#define CSP_TRACE_BITMAP (1U << 15) + +#define CSP_TRACE_ALL (CSP_TRACE_FREEZE | \ + CSP_TRACE_GRAB | \ + CSP_TRACE_FSMAP | \ + CSP_TRACE_FSREFS | \ + CSP_TRACE_BMAPX | \ + CSP_TRACE_PREP | \ + CSP_TRACE_TARGET | \ + CSP_TRACE_DEDUPE | \ + CSP_TRACE_FALLOC | \ + CSP_TRACE_EXCHANGE | \ + CSP_TRACE_XREBUILD | \ + CSP_TRACE_EFFICACY | \ + CSP_TRACE_SETUP | \ + CSP_TRACE_STATUS | \ + CSP_TRACE_DUMPFILE | \ + CSP_TRACE_BITMAP) + +#endif /* __LIBFROG_CLEARSPACE_H__ */ diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index 7d2d1ff94ee..a326b9a6486 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -25,6 +25,23 @@ then the program exits. .SH COMMANDS .TP +.BI "clearfree [ \-n nr ] [ \-r ] [ \-v mask ] " start " " length +Try to clear the specified physical range in the filesystem. +The +.B start +and +.B length +arguments must be given in units of bytes. +If the +.B -n +option is given, run the clearing algorithm this many times. +If the +.B -r +option is given, clear the realtime device. +If the +.B -v +option is given, print what's happening every step of the way. +.TP .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]" With no arguments, .B freesp diff --git a/spaceman/Makefile b/spaceman/Makefile index d6fccc361cf..75df4ce86c2 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -19,7 +19,7 @@ LLDLIBS += $(LIBEDITLINE) $(LIBTERMCAP) endif ifeq ($(HAVE_GETFSMAP),yes) -CFILES += freesp.c +CFILES += freesp.c clearfree.c endif default: depend $(LTCOMMAND) diff --git a/spaceman/clearfree.c b/spaceman/clearfree.c new file mode 100644 index 00000000000..a5f984f0159 --- /dev/null +++ b/spaceman/clearfree.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "platform_defs.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include "input.h" +#include "libfrog/fsgeom.h" +#include "libfrog/clearspace.h" +#include "handle.h" +#include "space.h" + +static void +clearfree_help(void) +{ + printf(_( +"Evacuate the contents of the given range of physical storage in the filesystem" +"\n" +" -n -- Run the space clearing algorithm this many times.\n" +" -r -- clear space on the realtime device.\n" +" -v -- verbosity level, or \"all\" to print everything.\n" +"\n" +"The start and length arguments are required, and must be specified in units\n" +"of bytes.\n" +"\n")); +} + +static int +clearfree_f( + int argc, + char **argv) +{ + struct clearspace_init attrs = { + .xfd = &file->xfd, + .fname = file->name, + }; + struct clearspace_req *req = NULL; + unsigned long long cleared; + unsigned long arg; + long long lnum; + unsigned int i, nr = 1; + int c, ret; + + while ((c = getopt(argc, argv, "n:rv:")) != EOF) { + switch (c) { + case 'n': + errno = 0; + arg = strtoul(optarg, NULL, 0); + if (errno) { + perror(optarg); + return 1; + } + if (arg > UINT_MAX) + arg = UINT_MAX; + nr = arg; + break; + case 'r': /* rt device */ + attrs.is_realtime = true; + break; + case 'v': /* Verbose output */ + if (!strcmp(optarg, "all")) { + attrs.trace_mask = CSP_TRACE_ALL; + } else { + errno = 0; + attrs.trace_mask = strtoul(optarg, NULL, 0); + if (errno) { + perror(optarg); + return 1; + } + } + break; + default: + exitcode = 1; + clearfree_help(); + return 0; + } + } + + if (attrs.trace_mask) + attrs.trace_mask |= CSP_TRACE_STATUS; + + if (argc != optind + 2) { + clearfree_help(); + goto fail; + } + + if (attrs.is_realtime) { + if (file->xfd.fsgeom.rtblocks == 0) { + fprintf(stderr, _("No realtime volume present.\n")); + goto fail; + } + attrs.dev = file->fs_path.fs_rtdev; + } else { + attrs.dev = file->fs_path.fs_datadev; + } + + lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize, + argv[optind]); + if (lnum < 0) { + fprintf(stderr, _("Bad clearfree start sector %s.\n"), + argv[optind]); + goto fail; + } + attrs.start = lnum; + + lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize, + argv[optind + 1]); + if (lnum < 0) { + fprintf(stderr, _("Bad clearfree length %s.\n"), + argv[optind + 1]); + goto fail; + } + attrs.length = lnum; + + ret = clearspace_init(&req, &attrs); + if (ret) + goto fail; + + for (i = 0; i < nr; i++) { + ret = clearspace_run(req); + if (ret) + goto fail; + } + + ret = clearspace_efficacy(req, &cleared); + if (ret) + goto fail; + + printf(_("Cleared 0x%llx bytes (%.1f%%) from 0x%llx to 0x%llx.\n"), + cleared, 100.0 * cleared / attrs.length, attrs.start, + attrs.start + attrs.length); + + ret = clearspace_free(&req); + if (ret) + goto fail; + + fshandle_destroy(); + return 0; +fail: + fshandle_destroy(); + exitcode = 1; + return 1; +} + +static struct cmdinfo clearfree_cmd = { + .name = "clearfree", + .cfunc = clearfree_f, + .argmin = 0, + .argmax = -1, + .flags = CMD_FLAG_ONESHOT, + .args = "[-n runs] [-r] [-v mask] start length", + .help = clearfree_help, +}; + +void +clearfree_init(void) +{ + clearfree_cmd.oneline = _("clear free space in the filesystem"); + + add_command(&clearfree_cmd); +} diff --git a/spaceman/init.c b/spaceman/init.c index cf1ff3cbb0e..bce62dec47f 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -35,6 +35,7 @@ init_commands(void) trim_init(); freesp_init(); health_init(); + clearfree_init(); } static int diff --git a/spaceman/space.h b/spaceman/space.h 
index 28fa35a3047..509e923375f 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -31,8 +31,10 @@ extern void quit_init(void); extern void trim_init(void); #ifdef HAVE_GETFSMAP extern void freesp_init(void); +extern void clearfree_init(void); #else # define freesp_init() do { } while (0) +# define clearfree_init() do { } while(0) #endif extern void info_init(void); extern void health_init(void); From patchwork Wed Dec 27 13:41:17 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508403 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 479927ED for ; Mon, 1 Jan 2024 00:41:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="CfW8liAw" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B5AB2C433C8; Mon, 1 Jan 2024 00:41:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069677; bh=Ik2QWMPoDXVn9xEANYGIaMosqOunH5gqcGSOggLc2QU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=CfW8liAw0kacb+ppkpeaUHf/OETJR5GumhyrX/AOTRc1WOxRo32RxDdIQtW7EXzDz V8iZ4gEXD9LAMPu9Q2OSQHEAzj1jcqOml5TPhjoa0p0ISY/GdUc9QZ8vim+LachdLH roujZEm+eeei+PxVMma2VRvP8KlgJLXCP3LuVcOCiNR/WNz1WxlFKiEh/80SxIuVin c7n8C5G6q6KYcSMIA7KUZZzpkKNcD4B7BxYUC8DBJfAVIS/Zevpzwr48Zv+EqPJuD9 2KqiL2XztFfUrNRisPI0Lqa1e/JWIFr9MwXS9wi2UgcvqYjSBpz/rDDHAI43RahLFW 757gX+rsRuAjg== Date: Sun, 31 Dec 2023 16:41:17 +9900 Subject: [PATCH 05/10] spaceman: physically move a regular inode From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , linux-xfs@vger.kernel.org Message-ID: <170405020391.1820796.12905760133039234923.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner To be able to shrink a filesystem, we need to be able to physically move an inode and all it's data and metadata from it's current location to a new AG. Add a command to spaceman to allow an inode to be moved to a new AG. This new command is not intended to be a perfect solution. I am not trying to handle atomic movement of open files - this is intended to be run as a maintenance operation on idle filesystem. If root filesystems are the target, then this should be run via a rescue environment that is not executing directly on the root fs. With those caveats in place, we can do the entire inode move as a set of non-destructive operations finalised by an atomic inode swap without any needing special kernel support. To ensure we move metadata such as BMBT blocks even if we don't need to move data, we clone the data to a new inode that we've allocated in the destination AG. This will result in new bmbt blocks being allocated in the new location even though the data is not copied. Attributes need to be copied one at a time from the original inode. If data needs to be moved, then we use fallocate(UNSHARE) to create a private copy of the range of data that needs to be moved in the new inode. 
This will be allocated in the destination AG by normal allocation policy. Once the new inode has been finalised, use RENAME_EXCHANGE to swap it into place and unlink the original inode to free up all the resources it still pins. There are many optimisations still possible to speed this up, but the goal here is "functional" rather than "optimal". Performance can be optimised once all the parts for a "empty the tail of the filesystem before shrink" operation are implemented and solidly tested. This functionality has been smoke tested by creating a 32MB data file with 4k extents and several hundred attributes: $ cat test.sh fname=/mnt/scratch/foo xfs_io -f -c "pwrite 0 32m" -c sync $fname for (( i=0; i < 4096 ; i++ )); do xfs_io -c "fpunch $((i * 8))k 4k" $fname done for (( i=0; i < 100 ; i++ )); do setfattr -n user.blah.$i.$i.blah -v blah.$i.$i.blah $fname setfattr -n user.foo.$i.$i.foo -v $i.cantbele.$i.ve.$i.tsnotbutter $fname done for (( i=0; i < 100 ; i++ )); do setfattr -n security.baz.$i.$i.baz -v wotchul$i$iookinat $fname done xfs_io -c stat -c "bmap -vp" -c "bmap -avp" $fname xfs_spaceman -c "move_inode -a 22" /mnt/scratch/foo xfs_io -c stat -c "bmap -vp" -c "bmap -avp" $fname $ and the output looks something like: $ sudo ./test.sh .... fd.path = "/mnt/scratch/foo" fd.flags = non-sync,non-direct,read-write stat.ino = 133 /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: hole 8 1: [8..15]: 208..215 0 (208..215) 8 000000 2: [16..23]: hole 8 3: [24..31]: 224..231 0 (224..231) 8 000000 .... 8189: [65512..65519]: 65712..65719 0 (65712..65719) 8 000000 8190: [65520..65527]: hole 8 8191: [65528..65535]: 65728..65735 0 (65728..65735) 8 000000 mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: 392..399 0 (392..399) 8 000000 1: [8..15]: 408..415 0 (408..415) 8 000000 2: [16..23]: 424..431 0 (424..431) 8 000000 3: [24..31]: 456..463 0 (456..463) 8 000000 move mnt /mnt/scratch, path /mnt/scratch/foo, agno 22 fd.path = "/mnt/scratch/foo" fd.flags = non-sync,non-direct,read-write stat.ino = 47244651475 .... /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: hole 8 1: [8..15]: 47244763192..47244763199 22 (123112..123119) 8 000000 2: [16..23]: hole 8 3: [24..31]: 47244763208..47244763215 22 (123128..123135) 8 000000 .... 8189: [65512..65519]: 47244828808..47244828815 22 (188728..188735) 8 000000 8190: [65520..65527]: hole 8 8191: [65528..65535]: 47244828824..47244828831 22 (188744..188751) 8 000000 /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: 47244763176..47244763183 22 (123096..123103) 8 000000 $ Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- man/man8/xfs_spaceman.8 | 4 spaceman/Makefile | 4 spaceman/init.c | 1 spaceman/move_inode.c | 563 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/space.h | 1 5 files changed, 571 insertions(+), 2 deletions(-) create mode 100644 spaceman/move_inode.c diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index a326b9a6486..f898a8bbe84 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -146,6 +146,10 @@ Report on the health of the files at the given path. .TP .BR "help [ " command " ]" Display a brief description of one or all commands. +.TP +.BI "move_inode \-a agno" +Move the currently open file into the specified allocation group. + .TP .BI "prealloc [ \-u id ] [ \-g id ] [ -p id ] [ \-m minlen ] [ \-s ]" Removes speculative preallocation. 
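As a rough sketch of the syscall sequence the move is built on (an illustration
only, not the implementation in this patch: the helper name, descriptors and
the single offset/length pair are hypothetical, error reporting is trimmed,
and the real code walks the FIEMAP extents and unshares only the ranges that
are not already in the destination AG):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONE, RENAME_EXCHANGE */
#include <linux/falloc.h>	/* FALLOC_FL_UNSHARE_RANGE */

static int
move_range_and_swap(int src_fd, int tmp_fd, const char *tmp_path,
		    const char *path, off_t offset, off_t length)
{
	/* Share every extent of the source file with the temporary file. */
	if (ioctl(tmp_fd, FICLONE, src_fd) < 0)
		return -1;

	/* Force a private copy of this range so it is allocated afresh. */
	if (fallocate(tmp_fd, FALLOC_FL_UNSHARE_RANGE, offset, length) < 0)
		return -1;

	/* Atomically exchange the directory entries of the two files. */
	if (renameat2(AT_FDCWD, tmp_path, AT_FDCWD, path, RENAME_EXCHANGE) < 0)
		return -1;

	/* The old inode now lives at tmp_path; unlink it to release it. */
	return unlink(tmp_path);
}

Because the swap is a single renameat2() call, an interruption at any earlier
step leaves the original file untouched and only the temporary file has to be
cleaned up.
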
diff --git a/spaceman/Makefile b/spaceman/Makefile index 75df4ce86c2..41ab95a07b2 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -7,10 +7,10 @@ include $(TOPDIR)/include/builddefs LTCOMMAND = xfs_spaceman HFILES = init.h space.h -CFILES = info.c init.c file.c health.c prealloc.c trim.c +CFILES = info.c init.c file.c health.c move_inode.c prealloc.c trim.c LSRCFILES = xfs_info.sh -LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) +LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) $(LIBHANDLE) LTDEPENDENCIES = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) LLDFLAGS = -static diff --git a/spaceman/init.c b/spaceman/init.c index bce62dec47f..dbeebcf97b9 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -36,6 +36,7 @@ init_commands(void) freesp_init(); health_init(); clearfree_init(); + move_inode_init(); } static int diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c new file mode 100644 index 00000000000..6238a48e948 --- /dev/null +++ b/spaceman/move_inode.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. + */ + +#include "libxfs.h" +#include "libfrog/fsgeom.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include "space.h" +#include "input.h" +#include "handle.h" + +#include +#include +#include + +static cmdinfo_t move_inode_cmd; + +/* + * We can't entirely use O_TMPFILE here because we want to use RENAME_EXCHANGE + * to swap the inode once rebuild is complete. Hence the new file has to be + * somewhere in the namespace for rename to act upon. Hence we use a normal + * open(O_CREATE) for now. + * + * This could potentially use O_TMPFILE to rebuild the entire inode, the use + * a linkat()/renameat2() pair to add it to the namespace then atomically + * replace the original. + */ +static int +create_tmpfile( + const char *mnt, + struct xfs_fd *xfd, + xfs_agnumber_t agno, + char **tmpfile, + int *tmpfd) +{ + char name[PATH_MAX + 1]; + mode_t mask; + int fd; + int i; + int ret; + + /* construct tmpdir */ + mask = umask(0); + + snprintf(name, PATH_MAX, "%s/.spaceman", mnt); + ret = mkdir(name, 0700); + if (ret) { + if (errno != EEXIST) { + fprintf(stderr, _("could not create tmpdir: %s: %s\n"), + name, strerror(errno)); + ret = -errno; + goto out_cleanup; + } + } + + /* loop creating directories until we get one in the right AG */ + for (i = 0; i < xfd->fsgeom.agcount; i++) { + struct stat st; + + snprintf(name, PATH_MAX, "%s/.spaceman/dir%d", mnt, i); + ret = mkdir(name, 0700); + if (ret) { + if (errno != EEXIST) { + fprintf(stderr, + _("cannot create tmpdir: %s: %s\n"), + name, strerror(errno)); + ret = -errno; + goto out_cleanup_dir; + } + } + ret = lstat(name, &st); + if (ret) { + fprintf(stderr, _("cannot stat tmpdir: %s: %s\n"), + name, strerror(errno)); + ret = -errno; + rmdir(name); + goto out_cleanup_dir; + } + if (cvt_ino_to_agno(xfd, st.st_ino) == agno) + break; + + /* remove directory in wrong AG */ + rmdir(name); + } + + if (i == xfd->fsgeom.agcount) { + /* + * Nothing landed in the selected AG! Must have been skipped + * because the AG is out of space. 
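+		 * Directory inodes are placed by the normal inode allocator,
+		 * so if none of the agcount candidate directories landed in
+		 * the requested AG, assume it has no free inodes and give up
+		 * with ENOSPC.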
+ */ + fprintf(stderr, _("Cannot create AG tmpdir.\n")); + ret = -ENOSPC; + goto out_cleanup_dir; + } + + /* create tmpfile */ + snprintf(name, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d", mnt, i, getpid()); + fd = open(name, O_CREAT|O_EXCL|O_RDWR, 0700); + if (fd < 0) { + fprintf(stderr, _("cannot create tmpfile: %s: %s\n"), + name, strerror(errno)); + ret = -errno; + } + + /* return name and fd */ + (void)umask(mask); + *tmpfd = fd; + *tmpfile = strdup(name); + + return 0; +out_cleanup_dir: + snprintf(name, PATH_MAX, "%s/.spaceman", mnt); + rmdir(name); +out_cleanup: + (void)umask(mask); + return ret; +} + +static int +get_attr( + void *hdl, + size_t hlen, + char *name, + void *attrbuf, + int *attrlen, + int attr_ns) +{ + struct xfs_attr_multiop ops = { + .am_opcode = ATTR_OP_GET, + .am_attrname = name, + .am_attrvalue = attrbuf, + .am_length = *attrlen, + .am_flags = attr_ns, + }; + int ret; + + ret = attr_multi_by_handle(hdl, hlen, &ops, 1, 0); + if (ret < 0) { + fprintf(stderr, _("attr_multi_by_handle(GET): %s\n"), + strerror(errno)); + return -errno; + } + *attrlen = ops.am_length; + return 0; +} + +static int +set_attr( + void *hdl, + size_t hlen, + char *name, + void *attrbuf, + int attrlen, + int attr_ns) +{ + struct xfs_attr_multiop ops = { + .am_opcode = ATTR_OP_SET, + .am_attrname = name, + .am_attrvalue = attrbuf, + .am_length = attrlen, + .am_flags = ATTR_CREATE | attr_ns, + }; + int ret; + + ret = attr_multi_by_handle(hdl, hlen, &ops, 1, 0); + if (ret < 0) { + fprintf(stderr, _("attr_multi_by_handle(SET): %s\n"), + strerror(errno)); + return -errno; + } + return 0; +} + +/* + * Copy all the attributes from the original source file into the replacement + * destination. + * + * Oh the humanity of deprecated Irix compatible attr interfaces that are more + * functional and useful than their native Linux replacements! + */ +static int +copy_attrs( + int srcfd, + int dstfd, + int attr_ns) +{ + void *shdl; + void *dhdl; + size_t shlen; + size_t dhlen; + attrlist_cursor_t cursor; + attrlist_t *alist; + struct attrlist_ent *ent; + char alistbuf[XATTR_LIST_MAX]; + char attrbuf[XATTR_SIZE_MAX]; + int attrlen; + int error; + int i; + + memset(&cursor, 0, sizeof(cursor)); + + /* + * All this handle based stuff is hoop jumping to avoid: + * + * a) deprecated API warnings because attr_list, attr_get and attr_set + * have been deprecated hence through compiler warnings; and + * + * b) listxattr() failing hard if there are more than 64kB worth of attr + * names on the inode so is unusable. + * + * That leaves libhandle as the only usable interface for iterating all + * xattrs on an inode reliably. Lucky for us, libhandle is part of + * xfsprogs, so this hoop jump isn't going to get ripped out from under + * us any time soon. + */ + error = fd_to_handle(srcfd, (void **)&shdl, &shlen); + if (error) { + fprintf(stderr, _("fd_to_handle(shdl): %s\n"), + strerror(errno)); + return -errno; + } + error = fd_to_handle(dstfd, (void **)&dhdl, &dhlen); + if (error) { + fprintf(stderr, _("fd_to_handle(dhdl): %s\n"), + strerror(errno)); + goto out_free_shdl; + } + + /* loop to iterate all xattrs */ + error = attr_list_by_handle(shdl, shlen, alistbuf, + XATTR_LIST_MAX, attr_ns, &cursor); + if (error) { + fprintf(stderr, _("attr_list_by_handle(shdl): %s\n"), + strerror(errno)); + } + while (!error) { + alist = (attrlist_t *)alistbuf; + + /* + * We loop one attr at a time for initial implementation + * simplicity. 
attr_multi_by_handle() can retrieve and set + * multiple attrs in a single call, but that is more complex. + * Get it working first, then optimise. + */ + for (i = 0; i < alist->al_count; i++) { + ent = ATTR_ENTRY(alist, i); + + /* get xattr (val, len) from name */ + attrlen = XATTR_SIZE_MAX; + error = get_attr(shdl, shlen, ent->a_name, attrbuf, + &attrlen, attr_ns); + if (error) + break; + + /* set xattr (val, len) to name */ + error = set_attr(dhdl, dhlen, ent->a_name, attrbuf, + attrlen, ATTR_CREATE | attr_ns); + if (error) + break; + } + + if (!alist->al_more) + break; + error = attr_list_by_handle(shdl, shlen, alistbuf, + XATTR_LIST_MAX, attr_ns, &cursor); + } + + free_handle(dhdl, dhlen); +out_free_shdl: + free_handle(shdl, shlen); + return error ? -errno : 0; +} + +/* + * scan the range of the new file for data that isn't in the destination AG + * and unshare it to create a new copy of it in the current target location + * of the new file. + */ +#define EXTENT_BATCH 32 +static int +unshare_data( + struct xfs_fd *xfd, + int destfd, + xfs_agnumber_t agno) +{ + int ret; + struct fiemap *fiemap; + int done = 0; + int fiemap_flags = FIEMAP_FLAG_SYNC; + int i; + int map_size; + __u64 last_logical = 0; /* last extent offset handled */ + off64_t range_end = -1LL; /* mapping end*/ + + /* fiemap loop over extents */ + map_size = sizeof(struct fiemap) + + (EXTENT_BATCH * sizeof(struct fiemap_extent)); + fiemap = malloc(map_size); + if (!fiemap) { + fprintf(stderr, _("%s: malloc of %d bytes failed.\n"), + progname, map_size); + return -ENOMEM; + } + + while (!done) { + memset(fiemap, 0, map_size); + fiemap->fm_flags = fiemap_flags; + fiemap->fm_start = last_logical; + fiemap->fm_length = range_end - last_logical; + fiemap->fm_extent_count = EXTENT_BATCH; + + ret = ioctl(destfd, FS_IOC_FIEMAP, (unsigned long)fiemap); + if (ret < 0) { + fprintf(stderr, "%s: ioctl(FS_IOC_FIEMAP): %s\n", + progname, strerror(errno)); + free(fiemap); + return -errno; + } + + /* No more extents to map, exit */ + if (!fiemap->fm_mapped_extents) + break; + + for (i = 0; i < fiemap->fm_mapped_extents; i++) { + struct fiemap_extent *extent; + xfs_agnumber_t this_agno; + + extent = &fiemap->fm_extents[i]; + this_agno = cvt_daddr_to_agno(xfd, + cvt_btobbt(extent->fe_physical)); + + /* + * If extent not in dst AG, unshare whole extent to + * trigger reallocated of the extent to be local to + * the current inode. + */ + if (this_agno != agno) { + ret = fallocate(destfd, FALLOC_FL_UNSHARE_RANGE, + extent->fe_logical, extent->fe_length); + if (ret) { + fprintf(stderr, + "%s: fallocate(UNSHARE): %s\n", + progname, strerror(errno)); + return -errno; + } + } + + last_logical = extent->fe_logical + extent->fe_length; + + /* Kernel has told us there are no more extents */ + if (extent->fe_flags & FIEMAP_EXTENT_LAST) { + done = 1; + break; + } + } + } + return 0; +} + +/* + * Exchange the inodes at the two paths indicated after first ensuring that the + * owners, permissions and timestamps are set correctly in the tmpfile. 
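+ * The renameat2(RENAME_EXCHANGE) call swaps the two directory entries
+ * atomically, so the user-visible path never refers to a partially
+ * populated file.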
+ */ +static int +exchange_inodes( + struct xfs_fd *xfd, + int tmpfd, + const char *tmpfile, + const char *path) +{ + struct timespec ts[2]; + struct stat st; + int ret; + + ret = fstat(xfd->fd, &st); + if (ret) + return -errno; + + /* set user ids */ + ret = fchown(tmpfd, st.st_uid, st.st_gid); + if (ret) + return -errno; + + /* set permissions */ + ret = fchmod(tmpfd, st.st_mode); + if (ret) + return -errno; + + /* set timestamps */ + ts[0] = st.st_atim; + ts[1] = st.st_mtim; + ret = futimens(tmpfd, ts); + if (ret) + return -errno; + + /* exchange the two inodes */ + ret = renameat2(AT_FDCWD, tmpfile, AT_FDCWD, path, RENAME_EXCHANGE); + if (ret) + return -errno; + return 0; +} + +static int +move_file_to_ag( + const char *mnt, + const char *path, + struct xfs_fd *xfd, + xfs_agnumber_t agno) +{ + int ret; + int tmpfd = -1; + char *tmpfile = NULL; + + fprintf(stderr, "move mnt %s, path %s, agno %d\n", mnt, path, agno); + + /* create temporary file in agno */ + ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd); + if (ret) + return ret; + + /* clone data to tempfile */ + ret = ioctl(tmpfd, FICLONE, xfd->fd); + if (ret) + goto out_cleanup; + + /* copy system attributes to tempfile */ + ret = copy_attrs(xfd->fd, tmpfd, ATTR_ROOT); + if (ret) + goto out_cleanup; + + /* copy user attributes to tempfile */ + ret = copy_attrs(xfd->fd, tmpfd, 0); + if (ret) + goto out_cleanup; + + /* unshare data to move it */ + ret = unshare_data(xfd, tmpfd, agno); + if (ret) + goto out_cleanup; + + /* swap the inodes over */ + ret = exchange_inodes(xfd, tmpfd, tmpfile, path); + +out_cleanup: + if (ret == -1) + ret = -errno; + + close(tmpfd); + if (tmpfile) + unlink(tmpfile); + free(tmpfile); + + return ret; +} + +static int +move_inode_f( + int argc, + char **argv) +{ + void *fshandle; + size_t fshdlen; + xfs_agnumber_t agno = 0; + struct stat st; + int ret; + int c; + + while ((c = getopt(argc, argv, "a:")) != EOF) { + switch (c) { + case 'a': + agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad agno value %s\n"), + optarg); + return command_usage(&move_inode_cmd); + } + break; + default: + return command_usage(&move_inode_cmd); + } + } + + if (optind != argc) + return command_usage(&move_inode_cmd); + + if (agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Destination AG %d does not exist. 
Filesystem only has %d AGs\n"), + agno, file->xfd.fsgeom.agcount); + exitcode = 1; + return 0; + } + + /* this is so we can use fd_to_handle() later on */ + ret = path_to_fshandle(file->fs_path.fs_dir, &fshandle, &fshdlen); + if (ret < 0) { + fprintf(stderr, _("Cannot get fshandle for mount %s: %s\n"), + file->fs_path.fs_dir, strerror(errno)); + goto exit_fail; + } + + ret = fstat(file->xfd.fd, &st); + if (ret) { + fprintf(stderr, _("stat(%s) failed: %s\n"), + file->name, strerror(errno)); + goto exit_fail; + } + + if (S_ISREG(st.st_mode)) { + ret = move_file_to_ag(file->fs_path.fs_dir, file->name, + &file->xfd, agno); + } else { + fprintf(stderr, _("Unsupported: %s is not a regular file.\n"), + file->name); + goto exit_fail; + } + + if (ret) { + fprintf(stderr, _("Failed to move inode to AG %d: %s\n"), + agno, strerror(-ret)); + goto exit_fail; + } + fshandle_destroy(); + return 0; + +exit_fail: + fshandle_destroy(); + exitcode = 1; + return 0; +} + +static void +move_inode_help(void) +{ + printf(_( +"\n" +"Physically move an inode into a new allocation group\n" +"\n" +" -a agno -- destination AG agno for the current open file\n" +"\n")); + +} + +void +move_inode_init(void) +{ + move_inode_cmd.name = "move_inode"; + move_inode_cmd.altname = "mvino"; + move_inode_cmd.cfunc = move_inode_f; + move_inode_cmd.argmin = 2; + move_inode_cmd.argmax = 2; + move_inode_cmd.args = "-a agno"; + move_inode_cmd.flags = CMD_FLAG_ONESHOT; + move_inode_cmd.oneline = _("Move an inode into a new AG."); + move_inode_cmd.help = move_inode_help; + + add_command(&move_inode_cmd); +} + diff --git a/spaceman/space.h b/spaceman/space.h index 509e923375f..96c3c356f13 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -38,5 +38,6 @@ extern void clearfree_init(void); #endif extern void info_init(void); extern void health_init(void); +void move_inode_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */ From patchwork Wed Dec 27 13:41:32 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508404 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C55627ED for ; Mon, 1 Jan 2024 00:41:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ctc6dFSc" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4C545C433C7; Mon, 1 Jan 2024 00:41:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069693; bh=RkRIkVpiZSkVp2whyt2PNqL9+XBgNgxtl7+z61FVzjc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ctc6dFScyEqVmV/c4j2gwuuG3eBJKK0aEBi2PzYIDcVAoRUyap0tEJw5PNKf1Fwh+ CAFBnNAyIrJAUx7io8oTrEtvptDZE8XUJhP5/ccxTqorKPTFH3toWzIX1VA2yRqFjT p1xloDflZV4zC9CDJwOIWlP8BmeY7n5LSOWvb3h5SB9h+5MiiqWlP0nmrpsPc51G1o naigFWs4E+GMs65B8fJEC6Iw+ab+3oCmLr3mbfujxVJpQCKkfCD4lFgMlZLs7SHCzN xWRhPJ+tH9TaId7Lbtspp4v1ldBD45RVcV8VnOzusSnSHYa2iDQxBis1FNhvXGumSV UYvWYn8zTtXRQ== Date: Sun, 31 Dec 2023 16:41:32 +9900 Subject: [PATCH 06/10] spaceman: find owners of space in an AG From: "Darrick J. 
Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , linux-xfs@vger.kernel.org Message-ID: <170405020404.1820796.3007269083335699062.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner Before we can move inodes for a shrink operation, we have to find all the inodes that own space in the AG(s) we want to empty. This implementation uses FS_IOC_GETFSMAP on the assumption that filesystems to be shrunk have reverse mapping enabled as it is the only way to identify inode related metadata that userspace is unable to see or influence (e.g. BMBT blocks) that may be located in the specific AG. We can use GETFSMAP to identify both inodes to be moved (via XFS_FMR_OWN_INODES records) and inodes with just data and/or metadata to be moved. Once we have identified all the inodes to be moved, we have to map them to paths so that we can use renameat2() to exchange the directory entries pointing at the moved inode atomically. We also need to record inodes with hard links and all of the paths to the inode so that hard links can be recreated appropriately. This requires a directory tree walk to discover the paths (until parent pointers are a thing). Hence for filesystems that aren't reverse mapping enabled, we can eventually use this pass to discover inodes with visible data and metadata that need to be moved. As we resolve the paths to the inodes to be moved, output the information to stdout so that it can be acted upon by other utilities. This results in a command that acts similar to find but with a physical location filter rather than an inode metadata filter. Again, this is not meant to be an optimal implementation. It shouldn't suck, but there is plenty of scope for performance optimisation, especially with a multithreaded and/or async directory traversal/parent pointer path resolution process to hide access latencies. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- libfrog/fsgeom.h | 19 ++ libfrog/radix-tree.c | 2 libfrog/radix-tree.h | 2 man/man8/xfs_spaceman.8 | 11 + spaceman/Makefile | 2 spaceman/find_owner.c | 482 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/init.c | 4 spaceman/space.h | 2 8 files changed, 522 insertions(+), 2 deletions(-) create mode 100644 spaceman/find_owner.c diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h index e5d43695901..073a03e8681 100644 --- a/libfrog/fsgeom.h +++ b/libfrog/fsgeom.h @@ -103,6 +103,25 @@ cvt_ino_to_agino( return ino & ((1ULL << xfd->aginolog) - 1); } +/* Convert an AG block to an AG inode number. */ +static inline uint32_t +cvt_agbno_to_agino( + const struct xfs_fd *xfd, + xfs_agblock_t agbno) +{ + return agbno << xfd->inopblog; +} + +/* Calculate the number of inodes in a byte range */ +static inline uint32_t +cvt_b_to_inode_count( + const struct xfs_fd *xfd, + uint64_t bytes) +{ + return (bytes >> xfd->blocklog) << xfd->inopblog; +} + + /* * Convert a linear fs block offset number into bytes. 
This is the runtime * equivalent of XFS_FSB_TO_B, which means that it is /not/ for segmented fsbno diff --git a/libfrog/radix-tree.c b/libfrog/radix-tree.c index 261fc2487de..788d11612e2 100644 --- a/libfrog/radix-tree.c +++ b/libfrog/radix-tree.c @@ -377,6 +377,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned int height, shift; struct radix_tree_node *slot; + ASSERT(tag < RADIX_TREE_MAX_TAGS); + height = root->height; if (index > radix_tree_maxindex(height)) return NULL; diff --git a/libfrog/radix-tree.h b/libfrog/radix-tree.h index dad5f5b7203..c10826a615b 100644 --- a/libfrog/radix-tree.h +++ b/libfrog/radix-tree.h @@ -28,7 +28,7 @@ do { \ } while (0) #ifdef RADIX_TREE_TAGS -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 #endif int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index f898a8bbe84..6fef6949aa6 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -41,6 +41,14 @@ option is given, clear the realtime device. If the .B -v option is given, print what's happening every step of the way. +.TP +.BI "find_owner \-a agno" +Create an internal structure to map physical space in the given allocation +group to file paths. +This enables space reorganization on a mounted filesystem by enabling +us to find files. +Unclear why we can't just use FSMAP and BULKSTAT to open by handle. + .TP .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]" With no arguments, @@ -195,6 +203,9 @@ Wait for removal to complete. .B print Display a list of all open files. .TP +.B resolve_owner +Resolves space in the filesystem to file paths, maybe? +.TP .B quit Exit .BR xfs_spaceman . diff --git a/spaceman/Makefile b/spaceman/Makefile index 41ab95a07b2..19ce8862131 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -7,7 +7,7 @@ include $(TOPDIR)/include/builddefs LTCOMMAND = xfs_spaceman HFILES = init.h space.h -CFILES = info.c init.c file.c health.c move_inode.c prealloc.c trim.c +CFILES = find_owner.c info.c init.c file.c health.c move_inode.c prealloc.c trim.c LSRCFILES = xfs_info.sh LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) $(LIBHANDLE) diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c new file mode 100644 index 00000000000..7667d9d3660 --- /dev/null +++ b/spaceman/find_owner.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017 Oracle. + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. 
+ */ + +#include "libxfs.h" +#include +#include "libfrog/fsgeom.h" +#include "libfrog/radix-tree.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include +#include "space.h" +#include "input.h" + +static cmdinfo_t find_owner_cmd; +static cmdinfo_t resolve_owner_cmd; + +#define NR_EXTENTS 128 + +static RADIX_TREE(inode_tree, 0); +#define MOVE_INODE 0 +#define MOVE_BLOCKS 1 +#define INODE_PATH 2 +int inode_count; +int inode_paths; + +static void +track_inode_chunks( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t physaddr, + uint64_t length) +{ + xfs_agblock_t agbno = cvt_b_to_agbno(xfd, physaddr); + uint64_t first_ino = cvt_agino_to_ino(xfd, agno, + cvt_agbno_to_agino(xfd, agbno)); + uint64_t num_inodes = cvt_b_to_inode_count(xfd, length); + int i; + + printf(_("AG %d\tInode Range to move: 0x%llx - 0x%llx (length 0x%llx)\n"), + agno, + (unsigned long long)first_ino, + (unsigned long long)first_ino + num_inodes - 1, + (unsigned long long)length); + + for (i = 0; i < num_inodes; i++) { + if (!radix_tree_lookup(&inode_tree, first_ino + i)) { + radix_tree_insert(&inode_tree, first_ino + i, + (void *)first_ino + i); + inode_count++; + } + radix_tree_tag_set(&inode_tree, first_ino + i, MOVE_INODE); + } +} + +static void +track_inode( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t owner, + uint64_t physaddr, + uint64_t length) +{ + if (radix_tree_tag_get(&inode_tree, owner, MOVE_BLOCKS)) + return; + + printf(_("AG %d\tInode 0x%llx: blocks to move to move: 0x%llx - 0x%llx\n"), + agno, + (unsigned long long)owner, + (unsigned long long)physaddr, + (unsigned long long)physaddr + length - 1); + if (!radix_tree_lookup(&inode_tree, owner)) { + radix_tree_insert(&inode_tree, owner, (void *)owner); + inode_count++; + } + radix_tree_tag_set(&inode_tree, owner, MOVE_BLOCKS); +} + +static void +scan_ag( + xfs_agnumber_t agno) +{ + struct fsmap_head *fsmap; + struct fsmap *extent; + struct fsmap *l, *h; + struct fsmap *p; + struct xfs_fd *xfd = &file->xfd; + int ret; + int i; + + fsmap = malloc(fsmap_sizeof(NR_EXTENTS)); + if (!fsmap) { + fprintf(stderr, _("%s: fsmap malloc failed.\n"), progname); + exitcode = 1; + return; + } + + memset(fsmap, 0, sizeof(*fsmap)); + fsmap->fmh_count = NR_EXTENTS; + l = fsmap->fmh_keys; + h = fsmap->fmh_keys + 1; + l->fmr_physical = cvt_agbno_to_b(xfd, agno, 0); + h->fmr_physical = cvt_agbno_to_b(xfd, agno + 1, 0); + l->fmr_device = h->fmr_device = file->fs_path.fs_datadev; + h->fmr_owner = ULLONG_MAX; + h->fmr_flags = UINT_MAX; + h->fmr_offset = ULLONG_MAX; + + while (true) { + printf("Inode count %d\n", inode_count); + ret = ioctl(xfd->fd, FS_IOC_GETFSMAP, fsmap); + if (ret < 0) { + fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), + progname, file->name, strerror(errno)); + free(fsmap); + exitcode = 1; + return; + } + + /* No more extents to map, exit */ + if (!fsmap->fmh_entries) + break; + + /* + * Walk the extents, ignore everything except inode chunks + * and inode owned blocks. + */ + for (i = 0, extent = fsmap->fmh_recs; + i < fsmap->fmh_entries; + i++, extent++) { + if (extent->fmr_flags & FMR_OF_SPECIAL_OWNER) { + if (extent->fmr_owner != XFS_FMR_OWN_INODES) + continue; + /* + * This extent contains inodes that need to be + * moved into another AG. Convert the extent to + * a range of inode numbers and track them all. 
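+				 * Each block of an inode chunk holds
+				 * (1 << inopblog) inodes, which is how
+				 * cvt_b_to_inode_count() converts the extent
+				 * length in bytes into the number of inodes
+				 * to track.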
+ */ + track_inode_chunks(xfd, agno, + extent->fmr_physical, + extent->fmr_length); + + continue; + } + + /* + * Extent is owned by an inode that may be located + * anywhere in the filesystem, not just this AG. + */ + track_inode(xfd, agno, extent->fmr_owner, + extent->fmr_physical, + extent->fmr_length); + } + + p = &fsmap->fmh_recs[fsmap->fmh_entries - 1]; + if (p->fmr_flags & FMR_OF_LAST) + break; + fsmap_advance(fsmap); + } + + free(fsmap); +} + +/* + * find inodes that own physical space in a given AG. + */ +static int +find_owner_f( + int argc, + char **argv) +{ + xfs_agnumber_t agno = -1; + int c; + + while ((c = getopt(argc, argv, "a:")) != EOF) { + switch (c) { + case 'a': + agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad agno value %s\n"), + optarg); + return command_usage(&find_owner_cmd); + } + break; + default: + return command_usage(&find_owner_cmd); + } + } + + if (optind != argc) + return command_usage(&find_owner_cmd); + + if (agno == -1 || agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Destination AG %d does not exist. Filesystem only has %d AGs\n"), + agno, file->xfd.fsgeom.agcount); + exitcode = 1; + return 0; + } + + /* + * Check that rmap is enabled so that GETFSMAP is actually useful. + */ + if (!(file->xfd.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT)) { + fprintf(stderr, +_("Filesystem at %s does not have reverse mapping enabled. Aborting.\n"), + file->fs_path.fs_dir); + exitcode = 1; + return 0; + } + + scan_ag(agno); + return 0; +} + +static void +find_owner_help(void) +{ + printf(_( +"\n" +"Find inodes owning physical blocks in a given AG.\n" +"\n" +" -a agno -- Scan the given AG agno.\n" +"\n")); + +} + +void +find_owner_init(void) +{ + find_owner_cmd.name = "find_owner"; + find_owner_cmd.altname = "fown"; + find_owner_cmd.cfunc = find_owner_f; + find_owner_cmd.argmin = 2; + find_owner_cmd.argmax = 2; + find_owner_cmd.args = "-a agno"; + find_owner_cmd.flags = CMD_FLAG_ONESHOT; + find_owner_cmd.oneline = _("Find inodes owning physical blocks in a given AG"); + find_owner_cmd.help = find_owner_help; + + add_command(&find_owner_cmd); +} + +/* + * for each dirent we get returned, look up the inode tree to see if it is an + * inode we need to process. If it is, then replace the entry in the tree with + * a structure containing the current path and mark the entry as resolved. + */ +struct inode_path { + uint64_t ino; + struct list_head path_list; + uint32_t link_count; + char path[1]; +}; + +static int +resolve_owner_cb( + const char *path, + const struct stat *stat, + int status, + struct FTW *data) +{ + struct inode_path *ipath, *slot_ipath; + int pathlen; + void **slot; + + /* + * Lookup the slot rather than the entry so we can replace the contents + * without another lookup later on. + */ + slot = radix_tree_lookup_slot(&inode_tree, stat->st_ino); + if (!slot || *slot == NULL) + return 0; + + /* Could not get stat data? Fail! */ + if (status == FTW_NS) { + fprintf(stderr, +_("Failed to obtain stat(2) information from path %s. Aborting\n"), + path); + return -EPERM; + } + + /* Allocate a new inode path and record the path in it. 
*/ + pathlen = strlen(path); + ipath = calloc(1, sizeof(*ipath) + pathlen + 1); + if (!ipath) { + fprintf(stderr, +_("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), + path, stat->st_ino, strerror(ENOMEM)); + return -ENOMEM; + } + INIT_LIST_HEAD(&ipath->path_list); + memcpy(&ipath->path[0], path, pathlen); + ipath->ino = stat->st_ino; + + /* + * If the slot contains the inode number we just looked up, then we + * haven't recorded a path for it yet. If that is the case, we just + * set the link count of the path to 1 and replace the slot contents + * with our new_ipath. + */ + if (stat->st_ino == (uint64_t)*slot) { + ipath->link_count = 1; + *slot = ipath; + radix_tree_tag_set(&inode_tree, stat->st_ino, INODE_PATH); + inode_paths++; + return 0; + } + + /* + * Multiple hard links to this inode. The slot already contains an + * ipath pointer, so we add the new ipath to the tail of the list held + * by the slot's ipath and bump the link count of the slot's ipath to + * keep track of how many hard links the inode has. + */ + slot_ipath = *slot; + slot_ipath->link_count++; + list_add_tail(&ipath->path_list, &slot_ipath->path_list); + return 0; +} + +/* + * This should be parallelised - pass subdirs off to a work queue, have the + * work queue processes subdirs, queueing more subdirs to work on. + */ +static int +walk_mount( + const char *mntpt) +{ + int ret; + + ret = nftw(mntpt, resolve_owner_cb, + 100, FTW_PHYS | FTW_MOUNT | FTW_DEPTH); + if (ret) + return -errno; + return 0; +} + +static int +list_inode_paths(void) +{ + struct inode_path *ipath; + uint64_t idx = 0; + int ret; + + do { + bool move_blocks; + bool move_inode; + + ret = radix_tree_gang_lookup_tag(&inode_tree, (void **)&ipath, + idx, 1, INODE_PATH); + if (!ret) + break; + idx = ipath->ino + 1; + + /* Grab status tags and remove from tree. */ + move_blocks = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_BLOCKS); + move_inode = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_INODE); + radix_tree_delete(&inode_tree, ipath->ino); + + /* Print the initial path with inode number and state. */ + printf("0x%.16llx\t%s\t%s\t%8d\t%s\n", + (unsigned long long)ipath->ino, + move_blocks ? "BLOCK" : "---", + move_inode ? "INODE" : "---", + ipath->link_count, ipath->path); + ipath->link_count--; + + /* Walk all the hard link paths and emit them. */ + while (!list_empty(&ipath->path_list)) { + struct inode_path *hpath; + + hpath = list_first_entry(&ipath->path_list, + struct inode_path, path_list); + list_del(&hpath->path_list); + ipath->link_count--; + + printf("\t\t\t\t\t%s\n", hpath->path); + } + if (ipath->link_count) { + printf(_("Link count anomaly: %d paths left over\n"), + ipath->link_count); + } + free(ipath); + } while (true); + + /* + * Any inodes remaining in the tree at this point indicate inodes whose + * paths were not found. This will be unlinked but still open inodes or + * lost inodes due to corruptions. Either way, a shrink will not succeed + * until these inodes are removed from the filesystem. + */ + idx = 0; + do { + uint64_t ino; + + + ret = radix_tree_gang_lookup(&inode_tree, (void **)&ino, idx, 1); + if (!ret) { + if (idx != 0) + ret = -EBUSY; + break; + } + idx = ino + 1; + printf(_("No path found for inode 0x%llx!\n"), + (unsigned long long)ino); + radix_tree_delete(&inode_tree, ino); + } while (true); + + return ret; +} + +/* + * Resolve inode numbers to paths via a directory tree walk. 
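+ * walk_mount() uses nftw(3) to visit every entry below the mount point;
+ * resolve_owner_cb() matches each dirent's inode number against the
+ * relocation table and records the path (and any extra hard links) it finds.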
+ */ +static int +resolve_owner_f( + int argc, + char **argv) +{ + int ret; + + if (!inode_tree.rnode) { + fprintf(stderr, +_("Inode list has not been populated. No inodes to resolve.\n")); + return 0; + } + + ret = walk_mount(file->fs_path.fs_dir); + if (ret) { + fprintf(stderr, +_("Failed to resolve all paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + + ret = list_inode_paths(); + if (ret) { + fprintf(stderr, +_("Failed to list all resolved paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + return 0; +} + +static void +resolve_owner_help(void) +{ + printf(_( +"\n" +"Resolve inodes owning physical blocks in a given AG.\n" +"This requires the find_owner command to be run first to populate the table\n" +"of inodes that need to have their paths resolved.\n" +"\n")); + +} + +void +resolve_owner_init(void) +{ + resolve_owner_cmd.name = "resolve_owner"; + resolve_owner_cmd.altname = "rown"; + resolve_owner_cmd.cfunc = resolve_owner_f; + resolve_owner_cmd.argmin = 0; + resolve_owner_cmd.argmax = 0; + resolve_owner_cmd.args = ""; + resolve_owner_cmd.flags = CMD_FLAG_ONESHOT; + resolve_owner_cmd.oneline = _("Resolve patches to inodes owning physical blocks in a given AG"); + resolve_owner_cmd.help = resolve_owner_help; + + add_command(&resolve_owner_cmd); +} + diff --git a/spaceman/init.c b/spaceman/init.c index dbeebcf97b9..8b0af14e566 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -10,6 +10,7 @@ #include "input.h" #include "init.h" #include "libfrog/paths.h" +#include "libfrog/radix-tree.h" #include "space.h" char *progname; @@ -37,6 +38,8 @@ init_commands(void) health_init(); clearfree_init(); move_inode_init(); + find_owner_init(); + resolve_owner_init(); } static int @@ -71,6 +74,7 @@ init( setlocale(LC_ALL, ""); bindtextdomain(PACKAGE, LOCALEDIR); textdomain(PACKAGE); + radix_tree_init(); fs_table_initialise(0, NULL, 0, NULL); while ((c = getopt(argc, argv, "c:p:V")) != EOF) { diff --git a/spaceman/space.h b/spaceman/space.h index 96c3c356f13..cffb1882153 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -39,5 +39,7 @@ extern void clearfree_init(void); extern void info_init(void); extern void health_init(void); void move_inode_init(void); +void find_owner_init(void); +void resolve_owner_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */ From patchwork Wed Dec 27 13:41:48 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13508405 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 653E17EE for ; Mon, 1 Jan 2024 00:41:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Vjzh0Bnl" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E94B9C433C8; Mon, 1 Jan 2024 00:41:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069709; bh=yfGQUaC895VHyYtOre4lGz7TIPquYZrCNt7IctmAWQI=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Vjzh0Bnl9+rAzvzVC6QfRjR1mDQOXXUt0zPTVfoAxS1AFHZuN2K+jixgBtUY5bIjT QJCX3DrwhNvvkhhv605OFTJZs7vIbNXX3CfVShfXXlJQLjYeqjnWPzlsZjQRVOaq/t H70E11HBhPaHzK2crhUAxEv1S1AJMDPNaZDHWp/sr9G1mvzpCFiw5GfsxmsgDv+4ee xzZQ36suP1iVYKaMJPTwQeUusSF3gzdqRGhq7bbPFCpCbbdSjaHgLsl4FaDre4XZFd gLt1fFFeM6+alxA1uP7gDnp63wM2kCOcxJGz/fKoIC97Nm29Lu+bYzfz0u0vW0Yvky wTdMXioSCWpMQ== Date: Sun, 31 Dec 2023 16:41:48 +9900 Subject: [PATCH 07/10] xfs_spaceman: wrap radix tree accesses in find_owner.c From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020418.1820796.17700189620459203140.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Wrap the raw radix tree accesses here so that we can provide an alternate implementation on platforms where radix tree indices cannot store a full 64-bit inode number. Signed-off-by: Darrick J. 
Wong --- spaceman/Makefile | 2 - spaceman/find_owner.c | 76 +++++++++------------------------ spaceman/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++ spaceman/relocation.h | 46 ++++++++++++++++++++ 4 files changed, 183 insertions(+), 55 deletions(-) create mode 100644 spaceman/relocation.c create mode 100644 spaceman/relocation.h diff --git a/spaceman/Makefile b/spaceman/Makefile index 19ce8862131..16a13e4bc19 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -7,7 +7,7 @@ include $(TOPDIR)/include/builddefs LTCOMMAND = xfs_spaceman HFILES = init.h space.h -CFILES = find_owner.c info.c init.c file.c health.c move_inode.c prealloc.c trim.c +CFILES = find_owner.c info.c init.c file.c health.c move_inode.c prealloc.c relocation.c trim.c LSRCFILES = xfs_info.sh LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) $(LIBHANDLE) diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 7667d9d3660..4e03add75dc 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -15,19 +15,13 @@ #include #include "space.h" #include "input.h" +#include "relocation.h" static cmdinfo_t find_owner_cmd; static cmdinfo_t resolve_owner_cmd; #define NR_EXTENTS 128 -static RADIX_TREE(inode_tree, 0); -#define MOVE_INODE 0 -#define MOVE_BLOCKS 1 -#define INODE_PATH 2 -int inode_count; -int inode_paths; - static void track_inode_chunks( struct xfs_fd *xfd, @@ -39,7 +33,7 @@ track_inode_chunks( uint64_t first_ino = cvt_agino_to_ino(xfd, agno, cvt_agbno_to_agino(xfd, agbno)); uint64_t num_inodes = cvt_b_to_inode_count(xfd, length); - int i; + uint64_t i; printf(_("AG %d\tInode Range to move: 0x%llx - 0x%llx (length 0x%llx)\n"), agno, @@ -47,14 +41,8 @@ track_inode_chunks( (unsigned long long)first_ino + num_inodes - 1, (unsigned long long)length); - for (i = 0; i < num_inodes; i++) { - if (!radix_tree_lookup(&inode_tree, first_ino + i)) { - radix_tree_insert(&inode_tree, first_ino + i, - (void *)first_ino + i); - inode_count++; - } - radix_tree_tag_set(&inode_tree, first_ino + i, MOVE_INODE); - } + for (i = 0; i < num_inodes; i++) + set_reloc_iflag(first_ino + i, MOVE_INODE); } static void @@ -65,7 +53,7 @@ track_inode( uint64_t physaddr, uint64_t length) { - if (radix_tree_tag_get(&inode_tree, owner, MOVE_BLOCKS)) + if (test_reloc_iflag(owner, MOVE_BLOCKS)) return; printf(_("AG %d\tInode 0x%llx: blocks to move to move: 0x%llx - 0x%llx\n"), @@ -73,11 +61,8 @@ track_inode( (unsigned long long)owner, (unsigned long long)physaddr, (unsigned long long)physaddr + length - 1); - if (!radix_tree_lookup(&inode_tree, owner)) { - radix_tree_insert(&inode_tree, owner, (void *)owner); - inode_count++; - } - radix_tree_tag_set(&inode_tree, owner, MOVE_BLOCKS); + + set_reloc_iflag(owner, MOVE_BLOCKS); } static void @@ -111,7 +96,7 @@ scan_ag( h->fmr_offset = ULLONG_MAX; while (true) { - printf("Inode count %d\n", inode_count); + printf("Inode count %llu\n", get_reloc_count()); ret = ioctl(xfd->fd, FS_IOC_GETFSMAP, fsmap); if (ret < 0) { fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), @@ -245,18 +230,6 @@ find_owner_init(void) add_command(&find_owner_cmd); } -/* - * for each dirent we get returned, look up the inode tree to see if it is an - * inode we need to process. If it is, then replace the entry in the tree with - * a structure containing the current path and mark the entry as resolved. 
- */ -struct inode_path { - uint64_t ino; - struct list_head path_list; - uint32_t link_count; - char path[1]; -}; - static int resolve_owner_cb( const char *path, @@ -266,14 +239,14 @@ resolve_owner_cb( { struct inode_path *ipath, *slot_ipath; int pathlen; - void **slot; + struct inode_path **slot; /* * Lookup the slot rather than the entry so we can replace the contents * without another lookup later on. */ - slot = radix_tree_lookup_slot(&inode_tree, stat->st_ino); - if (!slot || *slot == NULL) + slot = get_reloc_ipath_slot(stat->st_ino); + if (!slot) return 0; /* Could not get stat data? Fail! */ @@ -303,11 +276,10 @@ _("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), * set the link count of the path to 1 and replace the slot contents * with our new_ipath. */ - if (stat->st_ino == (uint64_t)*slot) { + if (*slot == UNLINKED_IPATH) { ipath->link_count = 1; *slot = ipath; - radix_tree_tag_set(&inode_tree, stat->st_ino, INODE_PATH); - inode_paths++; + set_reloc_iflag(stat->st_ino, INODE_PATH); return 0; } @@ -351,18 +323,15 @@ list_inode_paths(void) bool move_blocks; bool move_inode; - ret = radix_tree_gang_lookup_tag(&inode_tree, (void **)&ipath, - idx, 1, INODE_PATH); - if (!ret) + ipath = get_next_reloc_ipath(idx); + if (!ipath) break; idx = ipath->ino + 1; /* Grab status tags and remove from tree. */ - move_blocks = radix_tree_tag_get(&inode_tree, ipath->ino, - MOVE_BLOCKS); - move_inode = radix_tree_tag_get(&inode_tree, ipath->ino, - MOVE_INODE); - radix_tree_delete(&inode_tree, ipath->ino); + move_blocks = test_reloc_iflag(ipath->ino, MOVE_BLOCKS); + move_inode = test_reloc_iflag(ipath->ino, MOVE_INODE); + forget_reloc_ino(ipath->ino); /* Print the initial path with inode number and state. */ printf("0x%.16llx\t%s\t%s\t%8d\t%s\n", @@ -400,9 +369,8 @@ list_inode_paths(void) do { uint64_t ino; - - ret = radix_tree_gang_lookup(&inode_tree, (void **)&ino, idx, 1); - if (!ret) { + ino = get_next_reloc_unlinked(idx); + if (!ino) { if (idx != 0) ret = -EBUSY; break; @@ -410,7 +378,7 @@ list_inode_paths(void) idx = ino + 1; printf(_("No path found for inode 0x%llx!\n"), (unsigned long long)ino); - radix_tree_delete(&inode_tree, ino); + forget_reloc_ino(ino); } while (true); return ret; @@ -426,7 +394,7 @@ resolve_owner_f( { int ret; - if (!inode_tree.rnode) { + if (!is_reloc_populated()) { fprintf(stderr, _("Inode list has not been populated. No inodes to resolve.\n")); return 0; diff --git a/spaceman/relocation.c b/spaceman/relocation.c new file mode 100644 index 00000000000..7c7d9a2b4b2 --- /dev/null +++ b/spaceman/relocation.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. 
+ */ + +#include "libxfs.h" +#include "libfrog/fsgeom.h" +#include "libfrog/radix-tree.h" +#include "libfrog/paths.h" +#include "command.h" +#include "init.h" +#include "space.h" +#include "input.h" +#include "relocation.h" +#include "handle.h" + +static unsigned long long inode_count; +static unsigned long long inode_paths; + +unsigned long long +get_reloc_count(void) +{ + return inode_count; +} + +static RADIX_TREE(relocation_data, 0); + +bool +is_reloc_populated(void) +{ + return relocation_data.rnode != NULL; +} + +bool +test_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + return radix_tree_tag_get(&relocation_data, ino, flag); +} + +void +set_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + if (!radix_tree_lookup(&relocation_data, ino)) { + radix_tree_insert(&relocation_data, ino, UNLINKED_IPATH); + if (flag != INODE_PATH) + inode_count++; + } + if (flag == INODE_PATH) + inode_paths++; + + radix_tree_tag_set(&relocation_data, ino, flag); +} + +struct inode_path * +get_next_reloc_ipath( + uint64_t ino) +{ + struct inode_path *ipath; + int ret; + + ret = radix_tree_gang_lookup_tag(&relocation_data, (void **)&ipath, + ino, 1, INODE_PATH); + if (!ret) + return NULL; + return ipath; +} + +uint64_t +get_next_reloc_unlinked( + uint64_t ino) +{ + uint64_t next_ino; + int ret; + + ret = radix_tree_gang_lookup(&relocation_data, (void **)&next_ino, ino, + 1); + if (!ret) + return 0; + return next_ino; +} + +/* + * Return a pointer to a pointer where the caller can read or write a pointer + * to an inode path structure. + * + * The pointed-to pointer will be set to UNLINKED_IPATH if there is no ipath + * associated with this inode but the inode has been flagged for relocation. + * + * Returns NULL if the inode is not flagged for relocation. + */ +struct inode_path ** +get_reloc_ipath_slot( + uint64_t ino) +{ + struct inode_path **slot; + + slot = (struct inode_path **)radix_tree_lookup_slot(&relocation_data, + ino); + if (!slot || *slot == NULL) + return NULL; + return slot; +} + +void +forget_reloc_ino( + uint64_t ino) +{ + radix_tree_delete(&relocation_data, ino); +} diff --git a/spaceman/relocation.h b/spaceman/relocation.h new file mode 100644 index 00000000000..f05a871915d --- /dev/null +++ b/spaceman/relocation.h @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef XFS_SPACEMAN_RELOCATION_H_ +#define XFS_SPACEMAN_RELOCATION_H_ + +bool is_reloc_populated(void); +unsigned long long get_reloc_count(void); + +/* + * Tags for the relocation_data tree that indicate what it contains and the + * discovery information that needed to be stored. + */ +#define MOVE_INODE 0 +#define MOVE_BLOCKS 1 +#define INODE_PATH 2 + +bool test_reloc_iflag(uint64_t ino, unsigned int flag); +void set_reloc_iflag(uint64_t ino, unsigned int flag); +struct inode_path *get_next_reloc_ipath(uint64_t ino); +uint64_t get_next_reloc_unlinked(uint64_t ino); +struct inode_path **get_reloc_ipath_slot(uint64_t ino); +void forget_reloc_ino(uint64_t ino); + +/* + * When the entry in the relocation_data tree is tagged with INODE_PATH, the + * entry contains a structure that tracks the discovered paths to the inode. If + * the inode has multiple hard links, then we chain each individual path found + * via the path_list and record the number of paths in the link_count entry. 
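+ * The path itself is stored in the trailing path[] array, which is allocated
+ * together with the structure.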
+ */ +struct inode_path { + uint64_t ino; + struct list_head path_list; + uint32_t link_count; + char path[1]; +}; + +/* + * Sentinel value for inodes that we have to move but haven't yet found a path + * to. + */ +#define UNLINKED_IPATH ((struct inode_path *)1) + +#endif /* XFS_SPACEMAN_RELOCATION_H_ */ From patchwork Wed Dec 27 13:42:04 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508406 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C6E377EF for ; Mon, 1 Jan 2024 00:42:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Ec0f4u/n" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 983E0C433C8; Mon, 1 Jan 2024 00:42:04 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069724; bh=Q29ko0xklu7WgMsKn2ToxV7cTkrhXGB2diVpJ58EJFY=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Ec0f4u/nr3+zq7QbegeaSMIwELCd3cscWnZ4vkUzgwhU1WxsIsABPaVTqUQNA6Okk f4oKd7zjjm9unGtyYE4WOsy3n5U373CH3efKB07FyRfYgoSPExLzJRWCxZmRPWUYZv 62DX+psWKxG+fjq31LKgvuUwA5LeJ/0dMSl3KLdvl5GoWKLP7UgBSb8R+oZqms93I/ VDA920hGp+rrSiccAZbdQV+kCWwQn9iRgn0NDEXtEmbKm9vFUdT3Xx/2r7udstbYyS qYdFIVKQHfEqRosRhoVIpnEO88ldtUqOyWVWGZ5Z4Cz/0z7YP2xTcN7AWOO4d0rlUy TFoLuWhssAOpg== Date: Sun, 31 Dec 2023 16:42:04 +9900 Subject: [PATCH 08/10] xfs_spaceman: port relocation structure to 32-bit systems From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170405020431.1820796.10977053037186099530.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong We can't use the radix tree to store relocation information on 32-bit systems because unsigned longs are not large enough to hold 64-bit inodes. Use an avl64 tree instead. Signed-off-by: Darrick J. 
Wong --- configure.ac | 1 include/builddefs.in | 1 m4/package_libcdev.m4 | 20 +++++ spaceman/Makefile | 4 + spaceman/relocation.c | 203 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 229 insertions(+) diff --git a/configure.ac b/configure.ac index 3b36d769eac..db6366dcdab 100644 --- a/configure.ac +++ b/configure.ac @@ -258,6 +258,7 @@ AC_HAVE_MEMFD_CLOEXEC AC_HAVE_MEMFD_NOEXEC_SEAL AC_HAVE_O_TMPFILE AC_HAVE_MKOSTEMP_CLOEXEC +AC_USE_RADIX_TREE_FOR_INUMS AC_CONFIG_FILES([include/builddefs]) AC_OUTPUT diff --git a/include/builddefs.in b/include/builddefs.in index 6668e9bbe8b..30c8f301bca 100644 --- a/include/builddefs.in +++ b/include/builddefs.in @@ -138,6 +138,7 @@ HAVE_MEMFD_CLOEXEC = @have_memfd_cloexec@ HAVE_MEMFD_NOEXEC_SEAL = @have_memfd_noexec_seal@ HAVE_O_TMPFILE = @have_o_tmpfile@ HAVE_MKOSTEMP_CLOEXEC = @have_mkostemp_cloexec@ +USE_RADIX_TREE_FOR_INUMS = @use_radix_tree_for_inums@ GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4 index 2228697a7a3..003379ec2b8 100644 --- a/m4/package_libcdev.m4 +++ b/m4/package_libcdev.m4 @@ -612,3 +612,23 @@ AC_DEFUN([AC_HAVE_MKOSTEMP_CLOEXEC], AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)]) AC_SUBST(have_mkostemp_cloexec) ]) + +# +# Check if the radix tree index (unsigned long) is large enough to hold a +# 64-bit inode number +# +AC_DEFUN([AC_USE_RADIX_TREE_FOR_INUMS], + [ AC_MSG_CHECKING([if radix tree can store XFS inums]) + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +#include +#include +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + ]], [[ + typedef uint64_t xfs_ino_t; + + BUILD_BUG_ON(sizeof(unsigned long) < sizeof(xfs_ino_t)); + return 0; + ]])],[use_radix_tree_for_inums=yes + AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)]) + AC_SUBST(use_radix_tree_for_inums) + ]) diff --git a/spaceman/Makefile b/spaceman/Makefile index 16a13e4bc19..9c866339ac5 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -22,6 +22,10 @@ ifeq ($(HAVE_GETFSMAP),yes) CFILES += freesp.c clearfree.c endif +ifeq ($(USE_RADIX_TREE_FOR_INUMS),yes) +LCFLAGS += -DUSE_RADIX_TREE_FOR_INUMS +endif + default: depend $(LTCOMMAND) include $(BUILDRULES) diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 7c7d9a2b4b2..1c0db6a1dab 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -6,7 +6,11 @@ #include "libxfs.h" #include "libfrog/fsgeom.h" +#ifdef USE_RADIX_TREE_FOR_INUMS #include "libfrog/radix-tree.h" +#else +#include "libfrog/avl64.h" +#endif /* USE_RADIX_TREE_FOR_INUMS */ #include "libfrog/paths.h" #include "command.h" #include "init.h" @@ -24,6 +28,7 @@ get_reloc_count(void) return inode_count; } +#ifdef USE_RADIX_TREE_FOR_INUMS static RADIX_TREE(relocation_data, 0); bool @@ -112,3 +117,201 @@ forget_reloc_ino( { radix_tree_delete(&relocation_data, ino); } +#else +struct reloc_node { + struct avl64node node; + uint64_t ino; + struct inode_path *ipath; + unsigned int flags; +}; + +static uint64_t +reloc_start( + struct avl64node *node) +{ + struct reloc_node *rln; + + rln = container_of(node, struct reloc_node, node); + return rln->ino; +} + +static uint64_t +reloc_end( + struct avl64node *node) +{ + struct reloc_node *rln; + + rln = container_of(node, struct reloc_node, node); + return rln->ino + 1; +} + +static struct avl64ops reloc_ops = { + reloc_start, + reloc_end, +}; + +static struct avl64tree_desc relocation_data = { + .avl_ops = &reloc_ops, +}; + +bool +is_reloc_populated(void) +{ + 
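+	/* avl_firstino is NULL until the first node is inserted. */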
return relocation_data.avl_firstino != NULL; +} + +static inline struct reloc_node * +reloc_lookup( + uint64_t ino) +{ + avl64node_t *node; + + node = avl64_find(&relocation_data, ino); + if (!node) + return NULL; + + return container_of(node, struct reloc_node, node); +} + +static inline struct reloc_node * +reloc_insert( + uint64_t ino) +{ + struct reloc_node *rln; + avl64node_t *node; + + rln = malloc(sizeof(struct reloc_node)); + if (!rln) + return NULL; + + rln->node.avl_nextino = NULL; + rln->ino = ino; + rln->ipath = UNLINKED_IPATH; + rln->flags = 0; + + node = avl64_insert(&relocation_data, &rln->node); + if (node == NULL) { + free(rln); + return NULL; + } + + return rln; +} + +bool +test_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) + return false; + + return rln->flags & flag; +} + +void +set_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) { + rln = reloc_insert(ino); + if (!rln) + abort(); + if (flag != INODE_PATH) + inode_count++; + } + if (flag == INODE_PATH) + inode_paths++; + + rln->flags |= flag; +} + +#define avl_for_each_range_safe(pos, n, l, first, last) \ + for (pos = (first), n = pos->avl_nextino, l = (last)->avl_nextino; \ + pos != (l); \ + pos = n, n = pos ? pos->avl_nextino : NULL) + +struct inode_path * +get_next_reloc_ipath( + uint64_t ino) +{ + struct avl64node *firstn; + struct avl64node *lastn; + struct avl64node *pos; + struct avl64node *n; + struct avl64node *l; + struct reloc_node *rln; + + avl64_findranges(&relocation_data, ino - 1, -1ULL, &firstn, &lastn); + if (firstn == NULL && lastn == NULL) + return NULL; + + avl_for_each_range_safe(pos, n, l, firstn, lastn) { + rln = container_of(pos, struct reloc_node, node); + + if (rln->flags & INODE_PATH) + return rln->ipath; + } + + return NULL; +} + +uint64_t +get_next_reloc_unlinked( + uint64_t ino) +{ + struct avl64node *firstn; + struct avl64node *lastn; + struct avl64node *pos; + struct avl64node *n; + struct avl64node *l; + struct reloc_node *rln; + + avl64_findranges(&relocation_data, ino - 1, -1ULL, &firstn, &lastn); + if (firstn == NULL && lastn == NULL) + return 0; + + avl_for_each_range_safe(pos, n, l, firstn, lastn) { + rln = container_of(pos, struct reloc_node, node); + + if (!(rln->flags & INODE_PATH)) + return rln->ino; + } + + return 0; +} + +struct inode_path ** +get_reloc_ipath_slot( + uint64_t ino) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) + return NULL; + + return &rln->ipath; +} + +void +forget_reloc_ino( + uint64_t ino) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) + return; + + avl64_delete(&relocation_data, &rln->node); + free(rln); +} +#endif /* USE_RADIX_TREE_FOR_INUMS */ From patchwork Wed Dec 27 13:42:19 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13508407 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B1C1E20E8 for ; Mon, 1 Jan 2024 00:42:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KTl3jo0p" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3AEBBC433B6; Mon, 1 Jan 2024 00:42:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069740; bh=jzDRhOnRQRqYJCpDxZMnGdgm0xRQqkz3VeitXjFF2pk=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=KTl3jo0pj8SsS+bpZrWKAzHM/4//MWdiBdcnPNU3sfkNPUi4emj/6faxCuVNMGL32 KAb7Ku49A79GHMRHaf79hZh0BubZZgbpnlDorsxPPkqYRJmiZZojM8kPwdPwPsoAsp bcw4f8wmgiIYzaugilfbJo1rW1iXoxhfKd5b039HpUkNBywjoFie3ztmtHUxviezSC vOhhHO03UZG0JPo1W+LQ0Sjs/ktsAP4dkJeEUF5fFxMraNkETQAtcheHoMy/q7nBfR b8N1q8jMeg8qs1vjM0XkJmkzQFSJvO8MKehLAjp8fOZ40TgKCctj4sg/bqKy1prwOt kO5wYZD+D1xzA== Date: Sun, 31 Dec 2023 16:42:19 +9900 Subject: [PATCH 09/10] spaceman: relocate the contents of an AG From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , linux-xfs@vger.kernel.org Message-ID: <170405020444.1820796.11110446647220627270.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner Shrinking a filesystem needs to first remove all the active user data and metadata from the AGs that are going to be lopped off the filesystem. Before we can do this, we have to relocate this information to a region of the filesystem that is going to be retained. We have a function to move an inode and all it's related information to a specific AG, we have functions to find the owners of all the information in an AG and we can find their paths. This gives us all the information we need to relocate all the objects in an AG we are going to remove via shrinking. Firstly we scan the AG to be emptied to find the inodes that need to be relocated, then we scan the directory structure to find all the paths to those inodes that need to be moved. Then we iterate over all the inodes to be moved attempting to move them to the lowest numbers AGs. When the destination AG fills up, we'll get ENOSPC from the moving code and this is a trigger to bump the destination AG and retry the move. If we haven't moved all the inodes and their data by the time the destination reaches the source AG, then the entire operation will fail with ENOSPC - there is not enough room in the filesystem to empty the selected AG in preparation for a shrink. This, once again, is not intended as an optimal or even guaranteed way of emptying an AG for shrink. It simply provides the basic algorithm and mechanisms we need to perform a shrink operation. Improvements and optimisations will come in time, but we can't get to an optimal solution without first having basic functionality in place. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- libfrog/fsgeom.h | 10 ++ man/man8/xfs_spaceman.8 | 8 ++ spaceman/find_owner.c | 32 +++--- spaceman/init.c | 1 spaceman/move_inode.c | 7 + spaceman/relocation.c | 234 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/relocation.h | 5 + spaceman/space.h | 1 8 files changed, 280 insertions(+), 18 deletions(-) diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h index 073a03e8681..f23184a0d34 100644 --- a/libfrog/fsgeom.h +++ b/libfrog/fsgeom.h @@ -202,6 +202,16 @@ cvt_daddr_to_agno( return cvt_bb_to_off_fsbt(xfd, daddr) / xfd->fsgeom.agblocks; } +/* Convert sparse filesystem block to AG Number */ +static inline uint32_t +cvt_fsb_to_agno( + struct xfs_fd *xfd, + uint64_t fsbno) +{ + return fsbno >> xfd->agblklog; +} + + /* Convert sector number to AG block number. */ static inline uint32_t cvt_daddr_to_agbno( diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index 6fef6949aa6..b6488810cfa 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -202,9 +202,17 @@ Wait for removal to complete. .TP .B print Display a list of all open files. +.TP +.BI "relocate \-a agno [ \-h agno ]" +Empty out the given allocation group by moving file data elsewhere. +The +.B -h +option specifies the highest allocation group into which we can move data. + .TP .B resolve_owner Resolves space in the filesystem to file paths, maybe? + .TP .B quit Exit diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 4e03add75dc..3236290e3fd 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -9,10 +9,10 @@ #include #include "libfrog/fsgeom.h" #include "libfrog/radix-tree.h" -#include "command.h" -#include "init.h" #include "libfrog/paths.h" #include +#include "command.h" +#include "init.h" #include "space.h" #include "input.h" #include "relocation.h" @@ -65,8 +65,8 @@ track_inode( set_reloc_iflag(owner, MOVE_BLOCKS); } -static void -scan_ag( +int +find_relocation_targets( xfs_agnumber_t agno) { struct fsmap_head *fsmap; @@ -80,8 +80,7 @@ scan_ag( fsmap = malloc(fsmap_sizeof(NR_EXTENTS)); if (!fsmap) { fprintf(stderr, _("%s: fsmap malloc failed.\n"), progname); - exitcode = 1; - return; + return -ENOMEM; } memset(fsmap, 0, sizeof(*fsmap)); @@ -102,8 +101,7 @@ scan_ag( fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), progname, file->name, strerror(errno)); free(fsmap); - exitcode = 1; - return; + return -errno; } /* No more extents to map, exit */ @@ -148,6 +146,7 @@ scan_ag( } free(fsmap); + return 0; } /* @@ -159,6 +158,7 @@ find_owner_f( char **argv) { xfs_agnumber_t agno = -1; + int ret; int c; while ((c = getopt(argc, argv, "a:")) != EOF) { @@ -198,7 +198,9 @@ _("Filesystem at %s does not have reverse mapping enabled. Aborting.\n"), return 0; } - scan_ag(agno); + ret = find_relocation_targets(agno); + if (ret) + exitcode = 1; return 0; } @@ -299,8 +301,8 @@ _("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), * This should be parallelised - pass subdirs off to a work queue, have the * work queue processes subdirs, queueing more subdirs to work on. */ -static int -walk_mount( +int +resolve_target_paths( const char *mntpt) { int ret; @@ -361,9 +363,9 @@ list_inode_paths(void) /* * Any inodes remaining in the tree at this point indicate inodes whose - * paths were not found. This will be unlinked but still open inodes or - * lost inodes due to corruptions. Either way, a shrink will not succeed - * until these inodes are removed from the filesystem. + * paths were not found. This will be free inodes or unlinked but still + * open inodes. 
Either way, a shrink will not succeed until these inodes + * are removed from the filesystem. */ idx = 0; do { @@ -400,7 +402,7 @@ _("Inode list has not been populated. No inodes to resolve.\n")); return 0; } - ret = walk_mount(file->fs_path.fs_dir); + ret = resolve_target_paths(file->fs_path.fs_dir); if (ret) { fprintf(stderr, _("Failed to resolve all paths from mount point %s: %s\n"), diff --git a/spaceman/init.c b/spaceman/init.c index 8b0af14e566..cfe1b96fb66 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -40,6 +40,7 @@ init_commands(void) move_inode_init(); find_owner_init(); resolve_owner_init(); + relocate_init(); } static int diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c index 6238a48e948..9190b1d3ab2 100644 --- a/spaceman/move_inode.c +++ b/spaceman/move_inode.c @@ -12,6 +12,7 @@ #include "space.h" #include "input.h" #include "handle.h" +#include "relocation.h" #include #include @@ -404,8 +405,8 @@ exchange_inodes( return 0; } -static int -move_file_to_ag( +int +relocate_file_to_ag( const char *mnt, const char *path, struct xfs_fd *xfd, @@ -511,7 +512,7 @@ _("Destination AG %d does not exist. Filesystem only has %d AGs\n"), } if (S_ISREG(st.st_mode)) { - ret = move_file_to_ag(file->fs_path.fs_dir, file->name, + ret = relocate_file_to_ag(file->fs_path.fs_dir, file->name, &file->xfd, agno); } else { fprintf(stderr, _("Unsupported: %s is not a regular file.\n"), diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 1c0db6a1dab..7b125cc0ae1 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -315,3 +315,237 @@ forget_reloc_ino( free(rln); } #endif /* USE_RADIX_TREE_FOR_INUMS */ + +static struct cmdinfo relocate_cmd; + +static int +relocate_targets_to_ag( + const char *mnt, + xfs_agnumber_t dst_agno) +{ + struct inode_path *ipath; + uint64_t idx = 0; + int ret = 0; + + do { + struct xfs_fd xfd = {0}; + struct stat st; + + /* lookup first relocation target */ + ipath = get_next_reloc_ipath(idx); + if (!ipath) + break; + + /* XXX: don't handle hard link cases yet */ + if (ipath->link_count > 1) { + fprintf(stderr, + "FIXME! Skipping hardlinked inode at path %s\n", + ipath->path); + goto next; + } + + + ret = stat(ipath->path, &st); + if (ret) { + fprintf(stderr, _("stat(%s) failed: %s\n"), + ipath->path, strerror(errno)); + goto next; + } + + if (!S_ISREG(st.st_mode)) { + fprintf(stderr, + _("FIXME! Skipping %s: not a regular file.\n"), + ipath->path); + goto next; + } + + ret = xfd_open(&xfd, ipath->path, O_RDONLY); + if (ret) { + fprintf(stderr, _("xfd_open(%s) failed: %s\n"), + ipath->path, strerror(-ret)); + goto next; + } + + /* move to destination AG */ + ret = relocate_file_to_ag(mnt, ipath->path, &xfd, dst_agno); + xfd_close(&xfd); + + /* + * If the destination AG has run out of space, we do not remove + * this inode from relocation data so it will be immediately + * retried in the next AG. Other errors will be fatal. + */ + if (ret < 0) + return ret; +next: + /* remove from relocation data */ + idx = ipath->ino + 1; + forget_reloc_ino(ipath->ino); + } while (ret != -ENOSPC); + + return ret; +} + +static int +relocate_targets( + const char *mnt, + xfs_agnumber_t highest_agno) +{ + xfs_agnumber_t dst_agno = 0; + int ret; + + for (dst_agno = 0; dst_agno <= highest_agno; dst_agno++) { + ret = relocate_targets_to_ag(mnt, dst_agno); + if (ret == -ENOSPC) + continue; + break; + } + return ret; +} + +/* + * Relocate all the user objects in an AG to lower numbered AGs. 
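+ *
+ * This runs in three phases: find_relocation_targets() walks the AG's
+ * reverse mappings (GETFSMAP) to tag every inode owning space in it,
+ * resolve_target_paths() walks the directory tree to attach a path to each
+ * tagged inode, and relocate_targets() then moves each file into the lowest
+ * numbered AG that has room, stepping to the next destination AG whenever a
+ * move fails with ENOSPC.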
+ */ +static int +relocate_f( + int argc, + char **argv) +{ + xfs_agnumber_t target_agno = -1; + xfs_agnumber_t highest_agno = -1; + xfs_agnumber_t log_agno; + void *fshandle; + size_t fshdlen; + int c; + int ret; + + while ((c = getopt(argc, argv, "a:h:")) != EOF) { + switch (c) { + case 'a': + target_agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad target agno value %s\n"), + optarg); + return command_usage(&relocate_cmd); + } + break; + case 'h': + highest_agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad highest agno value %s\n"), + optarg); + return command_usage(&relocate_cmd); + } + break; + default: + return command_usage(&relocate_cmd); + } + } + + if (optind != argc) + return command_usage(&relocate_cmd); + + if (target_agno == -1) { + fprintf(stderr, _("Target AG must be specified!\n")); + return command_usage(&relocate_cmd); + } + + log_agno = cvt_fsb_to_agno(&file->xfd, file->xfd.fsgeom.logstart); + if (target_agno <= log_agno) { + fprintf(stderr, +_("Target AG %d must be higher than the journal AG (AG %d). Aborting.\n"), + target_agno, log_agno); + goto out_fail; + } + + if (target_agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Target AG %d does not exist. Filesystem only has %d AGs\n"), + target_agno, file->xfd.fsgeom.agcount); + goto out_fail; + } + + if (highest_agno == -1) + highest_agno = target_agno - 1; + + if (highest_agno >= target_agno) { + fprintf(stderr, +_("Highest destination AG %d must be less than target AG %d. Aborting.\n"), + highest_agno, target_agno); + goto out_fail; + } + + if (is_reloc_populated()) { + fprintf(stderr, +_("Relocation data populated from previous commands. Aborting.\n")); + goto out_fail; + } + + /* this is so we can use fd_to_handle() later on */ + ret = path_to_fshandle(file->fs_path.fs_dir, &fshandle, &fshdlen); + if (ret < 0) { + fprintf(stderr, _("Cannot get fshandle for mount %s: %s\n"), + file->fs_path.fs_dir, strerror(errno)); + goto out_fail; + } + + ret = find_relocation_targets(target_agno); + if (ret) { + fprintf(stderr, +_("Failure during target discovery. 
Aborting.\n")); + goto out_fail; + } + + ret = resolve_target_paths(file->fs_path.fs_dir); + if (ret) { + fprintf(stderr, +_("Failed to resolve all paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + goto out_fail; + } + + ret = relocate_targets(file->fs_path.fs_dir, highest_agno); + if (ret) { + fprintf(stderr, +_("Failed to relocate all targets out of AG %d: %s\n"), + target_agno, strerror(-ret)); + goto out_fail; + } + + return 0; +out_fail: + exitcode = 1; + return 0; +} + +static void +relocate_help(void) +{ + printf(_( +"\n" +"Relocate all the user data and metadata in an AG.\n" +"\n" +"This function will discover all the relocatable objects in a single AG and\n" +"move them to a lower AG as preparation for a shrink operation.\n" +"\n" +" -a Allocation group to empty\n" +" -h Highest target AG allowed to relocate into\n" +"\n")); + +} + +void +relocate_init(void) +{ + relocate_cmd.name = "relocate"; + relocate_cmd.altname = "relocate"; + relocate_cmd.cfunc = relocate_f; + relocate_cmd.argmin = 2; + relocate_cmd.argmax = 4; + relocate_cmd.args = "-a agno [-h agno]"; + relocate_cmd.flags = CMD_FLAG_ONESHOT; + relocate_cmd.oneline = _("Relocate data in an AG."); + relocate_cmd.help = relocate_help; + + add_command(&relocate_cmd); +} diff --git a/spaceman/relocation.h b/spaceman/relocation.h index f05a871915d..d4c71b7bb7f 100644 --- a/spaceman/relocation.h +++ b/spaceman/relocation.h @@ -43,4 +43,9 @@ struct inode_path { */ #define UNLINKED_IPATH ((struct inode_path *)1) +int find_relocation_targets(xfs_agnumber_t agno); +int relocate_file_to_ag(const char *mnt, const char *path, struct xfs_fd *xfd, + xfs_agnumber_t agno); +int resolve_target_paths(const char *mntpt); + #endif /* XFS_SPACEMAN_RELOCATION_H_ */ diff --git a/spaceman/space.h b/spaceman/space.h index cffb1882153..8c2b3e5464d 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -41,5 +41,6 @@ extern void health_init(void); void move_inode_init(void); void find_owner_init(void); void resolve_owner_init(void); +void relocate_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */ From patchwork Wed Dec 27 13:42:35 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13508408 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 55C1D6123 for ; Mon, 1 Jan 2024 00:42:36 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="mnY+rVKr" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C9642C433C7; Mon, 1 Jan 2024 00:42:35 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704069755; bh=25aE4LSLY2OPnVgzLGHUZpo5VSe9M+ammHFi7tO4hj8=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=mnY+rVKrOefoHmHvayCY9/4/YQLFgmkNQRK0W1KntOsF73VLS8+oKLyR1LN8CM4vV Y7AEI1dYoggCb9WWy4clol9hvw1WFSG6E7/phNQqqwqmwJfHak4Ri2boY8Pys30kn5 uHfWTHngVqTnauJirJHwndKP4dtv8rzz3S3JXkEnkIPcGousxq4BBZhFD+zLN2YHHx 1z2hOT1LhSDZqyBIXjS8xeXMcOpcjz9AeUqQRKj07dtifffdDrKycZ4nnTA8czVP1s MKkkmKxSflojpJn6eMEHBZ38XnTepkBH7mkWw714Zmjh6mfKgh8VxdWSpZLgqJnFFP XFrwwz11lb/vA== Date: Sun, 31 Dec 2023 16:42:35 +9900 Subject: [PATCH 10/10] spaceman: move inodes with hardlinks From: "Darrick J. 
Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , linux-xfs@vger.kernel.org Message-ID: <170405020458.1820796.14241728569056483133.stgit@frogsfrogsfrogs> In-Reply-To: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> References: <170405020316.1820796.451112156000559887.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner When a inode to be moved to a different AG has multiple hard links, we need to "move" all the hard links, too. To do this, we need to create temporary hardlinks to the new file, and then use rename exchange to swap all the hardlinks that point to the old inode with new hardlinks that point to the new inode. We already know that an inode has hard links via the path discovery, and we can check it against the link count that is reported for the inode before we start building the link farm. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- spaceman/find_owner.c | 13 +---- spaceman/move_inode.c | 119 +++++++++++++++++++++++++++++++++++++++++++++---- spaceman/relocation.c | 35 ++++++++++---- spaceman/relocation.h | 6 ++ 4 files changed, 140 insertions(+), 33 deletions(-) diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 3236290e3fd..2b3dc653271 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -240,7 +240,6 @@ resolve_owner_cb( struct FTW *data) { struct inode_path *ipath, *slot_ipath; - int pathlen; struct inode_path **slot; /* @@ -260,17 +259,9 @@ _("Failed to obtain stat(2) information from path %s. Aborting\n"), } /* Allocate a new inode path and record the path in it. */ - pathlen = strlen(path); - ipath = calloc(1, sizeof(*ipath) + pathlen + 1); - if (!ipath) { - fprintf(stderr, -_("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), - path, stat->st_ino, strerror(ENOMEM)); + ipath = ipath_alloc(path, stat); + if (!ipath) return -ENOMEM; - } - INIT_LIST_HEAD(&ipath->path_list); - memcpy(&ipath->path[0], path, pathlen); - ipath->ino = stat->st_ino; /* * If the slot contains the inode number we just looked up, then we diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c index 9190b1d3ab2..e8e3a64cb39 100644 --- a/spaceman/move_inode.c +++ b/spaceman/move_inode.c @@ -36,12 +36,14 @@ create_tmpfile( struct xfs_fd *xfd, xfs_agnumber_t agno, char **tmpfile, - int *tmpfd) + int *tmpfd, + int link_count) { char name[PATH_MAX + 1]; + char linkname[PATH_MAX + 1]; mode_t mask; int fd; - int i; + int i, j; int ret; /* construct tmpdir */ @@ -105,14 +107,36 @@ create_tmpfile( fprintf(stderr, _("cannot create tmpfile: %s: %s\n"), name, strerror(errno)); ret = -errno; + goto out_cleanup_dir; } + /* Create hard links to temporary file. 
*/ + for (j = link_count; j > 1; i--) { + snprintf(linkname, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d.hardlink.%d", mnt, i, getpid(), j); + ret = link(name, linkname); + if (ret < 0) { + fprintf(stderr, _("cannot create hardlink: %s: %s\n"), + linkname, strerror(errno)); + ret = -errno; + goto out_cleanup_links; + } + } + + /* return name and fd */ (void)umask(mask); *tmpfd = fd; *tmpfile = strdup(name); return 0; + +out_cleanup_links: + for (; j <= link_count; j++) { + snprintf(linkname, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d.hardlink.%d", mnt, i, getpid(), j); + unlink(linkname); + } + close(fd); + unlink(name); out_cleanup_dir: snprintf(name, PATH_MAX, "%s/.spaceman", mnt); rmdir(name); @@ -405,21 +429,53 @@ exchange_inodes( return 0; } +static int +exchange_hardlinks( + struct inode_path *ipath, + const char *tmpfile) +{ + char linkname[PATH_MAX]; + struct inode_path *linkpath; + int i = 2; + int ret; + + list_for_each_entry(linkpath, &ipath->path_list, path_list) { + if (i++ > ipath->link_count) { + fprintf(stderr, "ipath link count mismatch!\n"); + return 0; + } + + snprintf(linkname, PATH_MAX, "%s.hardlink.%d", tmpfile, i); + ret = renameat2(AT_FDCWD, linkname, + AT_FDCWD, linkpath->path, RENAME_EXCHANGE); + if (ret) { + fprintf(stderr, + "failed to exchange hard link %s with %s: %s\n", + linkname, linkpath->path, strerror(errno)); + return -errno; + } + } + return 0; +} + int relocate_file_to_ag( const char *mnt, - const char *path, + struct inode_path *ipath, struct xfs_fd *xfd, xfs_agnumber_t agno) { int ret; int tmpfd = -1; char *tmpfile = NULL; + int i; - fprintf(stderr, "move mnt %s, path %s, agno %d\n", mnt, path, agno); + fprintf(stderr, "move mnt %s, path %s, agno %d\n", + mnt, ipath->path, agno); /* create temporary file in agno */ - ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd); + ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd, + ipath->link_count); if (ret) return ret; @@ -444,12 +500,28 @@ relocate_file_to_ag( goto out_cleanup; /* swap the inodes over */ - ret = exchange_inodes(xfd, tmpfd, tmpfile, path); + ret = exchange_inodes(xfd, tmpfd, tmpfile, ipath->path); + if (ret) + goto out_cleanup; + + /* swap the hard links over */ + ret = exchange_hardlinks(ipath, tmpfile); + if (ret) + goto out_cleanup; out_cleanup: if (ret == -1) ret = -errno; + /* remove old hard links */ + for (i = 2; i <= ipath->link_count; i++) { + char linkname[PATH_MAX + 256]; // anti-warning-crap + + snprintf(linkname, PATH_MAX + 256, "%s.hardlink.%d", tmpfile, i); + unlink(linkname); + } + + /* remove tmpfile */ close(tmpfd); if (tmpfile) unlink(tmpfile); @@ -458,11 +530,32 @@ relocate_file_to_ag( return ret; } +static int +build_ipath( + const char *path, + struct stat *st, + struct inode_path **ipathp) +{ + struct inode_path *ipath; + + *ipathp = NULL; + + ipath = ipath_alloc(path, st); + if (!ipath) + return -ENOMEM; + + /* we only move a single path with move_inode */ + ipath->link_count = 1; + *ipathp = ipath; + return 0; +} + static int move_inode_f( int argc, char **argv) { + struct inode_path *ipath = NULL; void *fshandle; size_t fshdlen; xfs_agnumber_t agno = 0; @@ -511,24 +604,30 @@ _("Destination AG %d does not exist. 
Filesystem only has %d AGs\n"), goto exit_fail; } - if (S_ISREG(st.st_mode)) { - ret = relocate_file_to_ag(file->fs_path.fs_dir, file->name, - &file->xfd, agno); - } else { + if (!S_ISREG(st.st_mode)) { fprintf(stderr, _("Unsupported: %s is not a regular file.\n"), file->name); goto exit_fail; } + ret = build_ipath(file->name, &st, &ipath); + if (ret) + goto exit_fail; + + ret = relocate_file_to_ag(file->fs_path.fs_dir, ipath, + &file->xfd, agno); if (ret) { fprintf(stderr, _("Failed to move inode to AG %d: %s\n"), agno, strerror(-ret)); goto exit_fail; } + free(ipath); fshandle_destroy(); return 0; exit_fail: + if (ipath) + free(ipath); fshandle_destroy(); exitcode = 1; return 0; diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 7b125cc0ae1..b0960272168 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -318,6 +318,30 @@ forget_reloc_ino( static struct cmdinfo relocate_cmd; +struct inode_path * +ipath_alloc( + const char *path, + const struct stat *stat) +{ + struct inode_path *ipath; + int pathlen = strlen(path); + + /* Allocate a new inode path and record the path in it. */ + ipath = calloc(1, sizeof(*ipath) + pathlen + 1); + if (!ipath) { + fprintf(stderr, +_("Failed to allocate ipath %s for inode 0x%llx failed: %s\n"), + path, (unsigned long long)stat->st_ino, + strerror(-errno)); + return NULL; + } + INIT_LIST_HEAD(&ipath->path_list); + memcpy(&ipath->path[0], path, pathlen); + ipath->ino = stat->st_ino; + + return ipath; +} + static int relocate_targets_to_ag( const char *mnt, @@ -336,15 +360,6 @@ relocate_targets_to_ag( if (!ipath) break; - /* XXX: don't handle hard link cases yet */ - if (ipath->link_count > 1) { - fprintf(stderr, - "FIXME! Skipping hardlinked inode at path %s\n", - ipath->path); - goto next; - } - - ret = stat(ipath->path, &st); if (ret) { fprintf(stderr, _("stat(%s) failed: %s\n"), @@ -367,7 +382,7 @@ relocate_targets_to_ag( } /* move to destination AG */ - ret = relocate_file_to_ag(mnt, ipath->path, &xfd, dst_agno); + ret = relocate_file_to_ag(mnt, ipath, &xfd, dst_agno); xfd_close(&xfd); /* diff --git a/spaceman/relocation.h b/spaceman/relocation.h index d4c71b7bb7f..2c807aa678e 100644 --- a/spaceman/relocation.h +++ b/spaceman/relocation.h @@ -43,9 +43,11 @@ struct inode_path { */ #define UNLINKED_IPATH ((struct inode_path *)1) +struct inode_path *ipath_alloc(const char *path, const struct stat *st); + int find_relocation_targets(xfs_agnumber_t agno); -int relocate_file_to_ag(const char *mnt, const char *path, struct xfs_fd *xfd, - xfs_agnumber_t agno); +int relocate_file_to_ag(const char *mnt, struct inode_path *ipath, + struct xfs_fd *xfd, xfs_agnumber_t agno); int resolve_target_paths(const char *mntpt); #endif /* XFS_SPACEMAN_RELOCATION_H_ */
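
The hard link handling above leans on the rename-exchange primitive: renameat2() with RENAME_EXCHANGE atomically swaps two directory entries, so no user-visible path ever dangles while an inode is being replaced. The fragment below is a minimal standalone sketch of that primitive only, not code from this series; the helper name swap_paths() is invented for illustration, and it assumes a kernel with renameat2() support and glibc 2.28 or later for the library wrapper (older systems would have to call syscall(SYS_renameat2, ...) directly).

#define _GNU_SOURCE
#include <stdio.h>		/* renameat2() wrapper in glibc >= 2.28 */
#include <string.h>
#include <errno.h>
#include <fcntl.h>		/* AT_FDCWD */

#ifndef RENAME_EXCHANGE
# define RENAME_EXCHANGE	(1 << 1)	/* value from <linux/fs.h> */
#endif

/* Atomically exchange two paths; both names remain valid throughout. */
int
swap_paths(
	const char	*a,
	const char	*b)
{
	if (renameat2(AT_FDCWD, a, AT_FDCWD, b, RENAME_EXCHANGE) < 0) {
		fprintf(stderr, "exchange %s <-> %s: %s\n",
				a, b, strerror(errno));
		return -errno;
	}
	return 0;
}

Because the exchange is atomic, relocation can build the complete replacement file and its link farm first and only then swap every name over, which is why the patches prefer RENAME_EXCHANGE over an unlink()/link() sequence.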